//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"

#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Error.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"

#include <cstdint>
#include <optional>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is
/// because an InsertPoint stores the instruction before something is
/// inserted. For instance, if both point to the same instruction, two
/// IRBuilders alternately creating instructions will cause the instructions
/// to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
  case OMPScheduleType::OrderedDistributeChunked:
  case OMPScheduleType::OrderedDistribute:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

/// This is a wrapper over IRBuilderBase::restoreIP that also restores the
/// current debug location to the last instruction in the specified basic
/// block, if the insert point points to the end of the block.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder,
                                 llvm::IRBuilderBase::InsertPoint IP) {
  Builder.restoreIP(IP);
  llvm::BasicBlock *BB = Builder.GetInsertBlock();
  llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
  if (!BB->empty() && I == BB->end())
    Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
}

static bool hasGridValue(const Triple &T) {
  return T.isAMDGPU() || T.isNVPTX() || T.isSPIRV();
}

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  if (T.isSPIRV())
    return omp::SPIRVGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}
190
191/// Determine which scheduling algorithm to use, determined from schedule clause
192/// arguments.
193static OMPScheduleType
194getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
195 bool HasSimdModifier, bool HasDistScheduleChunks) {
196 // Currently, the default schedule it static.
197 switch (ClauseKind) {
198 case OMP_SCHEDULE_Default:
199 case OMP_SCHEDULE_Static:
200 return HasChunks ? OMPScheduleType::BaseStaticChunked
201 : OMPScheduleType::BaseStatic;
202 case OMP_SCHEDULE_Dynamic:
203 return OMPScheduleType::BaseDynamicChunked;
204 case OMP_SCHEDULE_Guided:
205 return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
206 : OMPScheduleType::BaseGuidedChunked;
207 case OMP_SCHEDULE_Auto:
209 case OMP_SCHEDULE_Runtime:
210 return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
211 : OMPScheduleType::BaseRuntime;
212 case OMP_SCHEDULE_Distribute:
213 return HasDistScheduleChunks ? OMPScheduleType::BaseDistributeChunked
214 : OMPScheduleType::BaseDistribute;
215 }
216 llvm_unreachable("unhandled schedule clause argument");
217}
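
// Illustrative mapping, derived from the switch above: a loop with
// `schedule(dynamic, 4)` yields
//   getOpenMPBaseScheduleType(OMP_SCHEDULE_Dynamic, /*HasChunks=*/true,
//                             /*HasSimdModifier=*/false,
//                             /*HasDistScheduleChunks=*/false)
//     == OMPScheduleType::BaseDynamicChunked
// while `schedule(simd:guided)` yields OMPScheduleType::BaseGuidedSimd.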

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations.
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // The monotonic modifier is used by default in the OpenMP runtime
      // library, so there is no need to set it.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}
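
// E.g. a plain `schedule(dynamic)` without an ordered clause picks up
// ModifierNonmonotonic here (the OpenMP 5.1 default quoted above), whereas
// `schedule(static)` is returned unchanged because the runtime already
// treats static schedules as monotonic.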

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause,
                          bool HasDistScheduleChunks) {
  OMPScheduleType BaseSchedule = getOpenMPBaseScheduleType(
      ClauseKind, HasChunks, HasSimdModifier, HasDistScheduleChunks);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result));
  return Result;
}
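
// Worked example: `schedule(nonmonotonic: dynamic, 4)` with no ordered
// clause combines to
//   BaseDynamicChunked | ModifierUnordered | ModifierNonmonotonic
// i.e. OMPScheduleType::UnorderedDynamicChunked with the nonmonotonic
// monotonicity flag set.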

/// Given a function, if it represents the entry point of a target kernel,
/// this returns the execution mode flags associated with that kernel.
static std::optional<omp::OMPTgtExecModeFlags>
getTargetKernelExecMode(Function &Kernel) {
  CallInst *TargetInitCall = nullptr;
  for (Instruction &Inst : Kernel.getEntryBlock()) {
    if (auto *Call = dyn_cast<CallInst>(&Inst)) {
      if (Call->getCalledFunction()->getName() == "__kmpc_target_init") {
        TargetInitCall = Call;
        break;
      }
    }
  }

  if (!TargetInitCall)
    return std::nullopt;

  // Get the kernel mode information from the global variable associated with
  // the first argument of the call to __kmpc_target_init. Refer to
  // createTargetInit() to see how this is initialized.
  Value *InitOperand = TargetInitCall->getArgOperand(0);
  GlobalVariable *KernelEnv = nullptr;
  if (auto *Cast = dyn_cast<ConstantExpr>(InitOperand))
    KernelEnv = cast<GlobalVariable>(Cast->getOperand(0));
  else
    KernelEnv = cast<GlobalVariable>(InitOperand);
  auto *KernelEnvInit = cast<ConstantStruct>(KernelEnv->getInitializer());
  auto *ConfigEnv = cast<ConstantStruct>(KernelEnvInit->getOperand(0));
  auto *KernelMode = cast<ConstantInt>(ConfigEnv->getOperand(2));
  return static_cast<OMPTgtExecModeFlags>(KernelMode->getZExtValue());
}
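
// The drill-down above relies on the kernel environment layout emitted by
// createTargetInit(); roughly (field order assumed from the device runtime's
// configuration environment):
//   struct ConfigurationEnvironmentTy { i8 UseGenericStateMachine;
//                                       i8 MayUseNestedParallelism;
//                                       i8 ExecMode; /* ... */ };
//   struct KernelEnvironmentTy { ConfigurationEnvironmentTy Configuration;
//                                /* ident, ... */ };
// which is why operand 2 of the configuration struct is the execution mode.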

static bool isGenericKernel(Function &Fn) {
  std::optional<omp::OMPTgtExecModeFlags> ExecMode =
      getTargetKernelExecMode(Fn);
  return !ExecMode || (*ExecMode & OMP_TGT_EXEC_MODE_GENERIC);
}

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminatorOrNull()) {
    auto *Br = cast<UncondBrInst>(Term);
    BasicBlock *Succ = Br->getSuccessor();
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(Target);
    return;
  }

  auto *NewBr = UncondBrInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch, DebugLoc DL) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to the new block.
  BasicBlock *Old = IP.getBlock();
  // If the `Old` block is empty then there are no instructions to move. But
  // in the new debug scheme, it could have trailing debug records which would
  // be moved to `New` in `spliceDebugInfoEmptyBlock`. We don't want that, for
  // two reasons:
  // 1. If `New` is also empty, `BasicBlock::splice` crashes.
  // 2. Even if `New` is not empty, the rationale for moving those records to
  //    `New` (in `spliceDebugInfoEmptyBlock`) does not apply here. That
  //    function assumes that `Old` is optimized out and is going away. This
  //    is not the case here: the `Old` block is still being used, e.g. a
  //    branch instruction is added to it later in this function.
  // So we call `BasicBlock::splice` only when `Old` is not empty.
  if (!Old->empty())
    New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch) {
    auto *NewBr = UncondBrInst::Create(New, Old);
    NewBr->setDebugLoc(DL);
  }
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          DebugLoc DL, llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch, DL);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}
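
// Typical usage (illustrative): split the block at the current insert point,
// naming the continuation after the original block:
//   BasicBlock *ContBB = splitBBWithSuffix(Builder, /*CreateBranch=*/true,
//                                          ".cont");
// Afterwards the Builder inserts in front of the old block's new
// unconditional branch, and ContBB holds everything that followed the
// original insert point.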

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static Value *
createFakeIntVal(IRBuilderBase &Builder,
                 OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                 llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                 OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                 const Twine &Name = "", bool AsPtr = true,
                 bool Is64Bit = false) {
  Builder.restoreIP(OuterAllocaIP);
  IntegerType *IntTy = Is64Bit ? Builder.getInt64Ty() : Builder.getInt32Ty();
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(IntTy, nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal = Builder.CreateLoad(IntTy, FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value.
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal = Builder.CreateLoad(IntTy, FakeVal, Name + ".use");
  } else {
    UseFakeVal = cast<BinaryOperator>(Builder.CreateAdd(
        FakeVal, Is64Bit ? Builder.getInt64(10) : Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}
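
// The fake value is defined at the outer alloca point and used at the inner
// one, i.e. inside the region that will be outlined, so the CodeExtractor is
// forced to treat it as an input and materialize a corresponding parameter
// (e.g. a thread-id argument of an outlined body). Once the real argument
// exists, callers erase every placeholder recorded in ToBeDeleted.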

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {

/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};
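
// E.g. a translation unit containing
//   #pragma omp requires unified_shared_memory, dynamic_allocators
// ends up with flags
//   OMP_REQ_UNIFIED_SHARED_MEMORY | OMP_REQ_DYNAMIC_ALLOCATORS == 0x018.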

class OMPCodeExtractor : public CodeExtractor {
public:
  OMPCodeExtractor(OpenMPIRBuilder &OMPBuilder, ArrayRef<BasicBlock *> BBs,
                   DominatorTree *DT = nullptr, bool AggregateArgs = false,
                   BlockFrequencyInfo *BFI = nullptr,
                   BranchProbabilityInfo *BPI = nullptr,
                   AssumptionCache *AC = nullptr, bool AllowVarArgs = false,
                   bool AllowAlloca = false,
                   BasicBlock *AllocationBlock = nullptr,
                   ArrayRef<BasicBlock *> DeallocationBlocks = {},
                   std::string Suffix = "", bool ArgsInZeroAddressSpace = false)
      : CodeExtractor(BBs, DT, AggregateArgs, BFI, BPI, AC, AllowVarArgs,
                      AllowAlloca, AllocationBlock, DeallocationBlocks, Suffix,
                      ArgsInZeroAddressSpace),
        OMPBuilder(OMPBuilder) {}

  virtual ~OMPCodeExtractor() = default;

protected:
  OpenMPIRBuilder &OMPBuilder;
};

class DeviceSharedMemCodeExtractor : public OMPCodeExtractor {
public:
  using OMPCodeExtractor::OMPCodeExtractor;
  virtual ~DeviceSharedMemCodeExtractor() = default;

protected:
  virtual Instruction *
  allocateVar(IRBuilder<>::InsertPoint AllocaIP, Type *VarType,
              const Twine &Name = Twine(""),
              AddrSpaceCastInst **CastedAlloc = nullptr) override {
    return OMPBuilder.createOMPAllocShared(AllocaIP, VarType, Name);
  }

  virtual Instruction *deallocateVar(IRBuilder<>::InsertPoint DeallocIP,
                                     Value *Var, Type *VarType) override {
    return OMPBuilder.createOMPFreeShared(DeallocIP, Var, VarType);
  }
};

/// Helper storing information about regions to outline using device shared
/// memory for intermediate allocations.
struct DeviceSharedMemOutlineInfo : public OpenMPIRBuilder::OutlineInfo {
  OpenMPIRBuilder &OMPBuilder;

  DeviceSharedMemOutlineInfo(OpenMPIRBuilder &OMPBuilder)
      : OMPBuilder(OMPBuilder) {}
  virtual ~DeviceSharedMemOutlineInfo() = default;

  virtual std::unique_ptr<CodeExtractor>
  createCodeExtractor(ArrayRef<BasicBlock *> Blocks,
                      bool ArgsInZeroAddressSpace,
                      Twine Suffix = Twine("")) override;
};

} // anonymous namespace

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));

  Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait);

  Value *DynCGroupMemFallbackFlag =
      Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback));
  DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2);
  Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag);

  assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGroupMem};
}
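
// Flags word layout, as assembled above: bit 0 carries HasNoWait, and the
// DynCGroupMemFallback encoding is shifted into bits 2 and upward before
// being OR'ed in; bit 1 is currently left clear.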

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();
  Triple T(M.getTargetTriple());

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }
    Fn->setCallingConv(Config.getRuntimeCC());
    // Add information if the runtime function takes a callback function.
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        // - The callback callee is argument number 2 (microtask).
        // - The first two arguments of the callback callee are unknown (-1).
        // - All variadic arguments to the runtime function are passed to the
        //   callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}
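
// The callback annotation materializes as !callback metadata on the
// declaration, roughly:
//   declare !callback !0 void @__kmpc_fork_call(ptr, i32, ptr, ...)
//   !0 = !{!1}
//   !1 = !{i64 2, i64 -1, i64 -1, i1 true}
// telling interprocedural passes that the variadic operands flow into
// parameter 2, the microtask.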

Expected<BasicBlock *>
OpenMPIRBuilder::FinalizationInfo::getFiniBB(IRBuilderBase &Builder) {
  if (!FiniBB) {
    Function *ParentFunc = Builder.GetInsertBlock()->getParent();
    FiniBB = BasicBlock::Create(Builder.getContext(), ".fini", ParentFunc);
    Builder.SetInsertPoint(FiniBB);
    // FiniCB adds the branch to the exit stub.
    if (Error Err = FiniCB(Builder.saveIP()))
      return Err;
  }
  return FiniBB;
}

Error OpenMPIRBuilder::FinalizationInfo::mergeFiniBB(IRBuilderBase &Builder,
                                                     BasicBlock *OtherFiniBB) {
  // Simple case: FiniBB does not exist yet: re-use OtherFiniBB.
  if (!FiniBB) {
    FiniBB = OtherFiniBB;

    Builder.SetInsertPoint(FiniBB->getFirstNonPHIIt());
    if (Error Err = FiniCB(Builder.saveIP()))
      return Err;

    return Error::success();
  }

  // Move instructions from FiniBB to the start of OtherFiniBB.
  auto EndIt = FiniBB->end();
  if (FiniBB->size() >= 1)
    if (auto Prev = std::prev(EndIt); Prev->isTerminator())
      EndIt = Prev;
  OtherFiniBB->splice(OtherFiniBB->getFirstNonPHIIt(), FiniBB, FiniBB->begin(),
                      EndIt);

  FiniBB->replaceAllUsesWith(OtherFiniBB);
  FiniBB->eraseFromParent();
  FiniBB = OtherFiniBB;
  return Error::success();
}

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

CallInst *OpenMPIRBuilder::createRuntimeFunctionCall(FunctionCallee Callee,
                                                     ArrayRef<Value *> Args,
                                                     StringRef Name) {
  CallInst *Call = Builder.CreateCall(Callee, Args, Name);
  Call->setCallingConv(Config.getRuntimeCC());
  return Call;
}
void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

static void moveAllocasToEntryBlock(llvm::BasicBlock &Block) {
  llvm::SmallVector<llvm::Instruction *> AllocasToMove;

  auto ShouldHoistAlloca = [](const llvm::AllocaInst &AllocaInst) {
    // TODO: For now, we support simple static allocations, we might need to
    // move non-static ones as well. However, this will need further analysis
    // to move the length arguments as well.
    return isa<ConstantInt>(AllocaInst.getArraySize());
  };

  for (llvm::Instruction &Inst : Block)
    if (auto *AllocaInst = llvm::dyn_cast<llvm::AllocaInst>(&Inst))
      if (ShouldHoistAlloca(*AllocaInst))
        AllocasToMove.push_back(AllocaInst);

  auto InsertPoint =
      Block.getParent()->getEntryBlock().getTerminator()->getIterator();

  for (llvm::Instruction *AllocaInst : AllocasToMove)
    AllocaInst->moveBeforePreserving(InsertPoint);
}

static void fixUpNonEntryAllocas(llvm::Function *Func) {
  PostDominatorTree PostDomTree(*Func);
  for (llvm::BasicBlock &BB : *Func)
    if (PostDomTree.properlyDominates(&BB, &Func->getEntryBlock()))
      moveAllocasToEntryBlock(BB);
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<std::unique_ptr<OutlineInfo>, 16> DeferredOutlines;
  for (std::unique_ptr<OutlineInfo> &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI->getFunction() != Fn) {
      DeferredOutlines.push_back(std::move(OI));
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI->collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI->getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that the
    // CodeExtractor generates correct code for extracted functions which are
    // used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    std::unique_ptr<CodeExtractor> Extractor =
        OI->createCodeExtractor(Blocks, ArgsInZeroAddressSpace, ".omp_par");

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI->EntryBB->getName()
                      << " Exit: " << OI->ExitBB->getName() << "\n");
    assert(Extractor->isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI->ExcludeArgsFromAggregate)
      Extractor->excludeArgFromAggregate(V);

    Function *OutlinedFn =
        Extractor->extractCodeRegion(CEAC, OI->Inputs, OI->Outputs);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away; we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI->EntryBB);
      assert(OI->EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator()) {
          // Absorb any debug value that the terminator may have.
          if (Instruction *TI = OI->EntryBB->getTerminatorOrNull())
            TI->adoptDbgRecords(&ArtificialEntry, I.getIterator(), false);
          continue;
        }

        I.moveBeforePreserving(*OI->EntryBB,
                               OI->EntryBB->getFirstInsertionPt());
      }

      OI->EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI->EntryBB);
    assert(OutlinedFn && OutlinedFn->hasNUses(1));

    // Run a user callback, e.g. to add attributes.
    if (OI->PostOutlineCB)
      OI->PostOutlineCB(*OutlinedFn);

    if (OI->FixUpNonEntryAllocas)
      fixUpNonEntryAllocas(OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target
  // region, which may inject allocas that need to be moved to the entry block
  // of our target or risk malformed optimisations by later passes. This is
  // only relevant for the device pass, which appears to be a little more
  // delicate when it comes to optimisations (however, we do not block on that
  // here, it's up to the inserter to the list to do so). This notably has to
  // occur after the OutlinedInfo candidates have been extracted so we have an
  // end product that will not be implicitly adversely affected by any raises
  // unless intentionally appended to the list.
  // NOTE: This only does so for ConstantData; it could be extended to
  // ConstantExpr's with further effort, however, they should largely be
  // folded when they get here. Extending it to runtime defined/read+writeable
  // allocation sizes would be non-trivial (need to factor in movement of any
  // stores to variables the allocation size depends on, as well as the usual
  // loads, otherwise it'll yield the wrong result after movement) and would
  // likely be more suitable as an LLVM optimisation pass.
  for (Function *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }

  IsFinalized = true;
}

bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}

void OpenMPIRBuilder::emitUsed(StringRef Name,
                               std::vector<WeakTrackingVH> &List) {
  if (List.empty())
    return;

  // Convert List to what ConstantArray needs.
  SmallVector<Constant *, 8> UsedArray;
  UsedArray.resize(List.size());
  for (unsigned I = 0, E = List.size(); I != E; ++I)
    UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
        cast<Constant>(&*List[I]), Builder.getPtrTy());

  if (UsedArray.empty())
    return;
  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());

  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                ConstantArray::get(ATy, UsedArray), Name);

  GV->setSection("llvm.metadata");
}

GlobalVariable *
OpenMPIRBuilder::emitKernelExecutionMode(StringRef KernelName,
                                         OMPTgtExecModeFlags Mode) {
  auto *Int8Ty = Builder.getInt8Ty();
  auto *GVMode = new GlobalVariable(
      M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
      ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
  GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
  return GVMode;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};

    size_t SrcLocStrArgIdx = 4;
    if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
            ->getPointerAddressSpace() !=
        IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
      IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
          SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer,
          "", nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}
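
// The resulting ident_t global has the layout (illustrative):
//   { i32 0,                      ; reserved
//     i32 <LocFlags | KMPC>,      ; ident flags
//     i32 <Reserve2Flags>,
//     i32 <source-string size>,
//     ptr <";file;function;line;column;;" string> }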

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes
    // the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalString(
        LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
        &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
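
// E.g. a directive in function foo() at line 10, column 3 of demo.c is
// encoded as the string ";demo.c;foo;10;3;;".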

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return createRuntimeFunctionCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  // __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result = createRuntimeFunctionCall(
      getOrCreateRuntimeFunctionPtr(UseCancelBarrier
                                        ? OMPRTL___kmpc_cancel_barrier
                                        : OMPRTL___kmpc_barrier),
      Args);

  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;

  return Builder.saveIP();
}
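
// E.g. an explicit `#pragma omp barrier` in a non-cancellable region lowers
// to roughly
//   %gtid = call i32 @__kmpc_global_thread_num(ptr @ident)
//   call void @__kmpc_barrier(ptr @ident, i32 %gtid)
// with OMP_IDENT_FLAG_BARRIER_EXPL encoded in the ident flags.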

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition) {
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);

    // Even if the if condition evaluates to false, this should count as a
    // cancellation point.
    Builder.SetInsertPoint(ElseTI);
    auto ElseIP = Builder.saveIP();

    InsertPointOrErrorTy IPOrErr = createCancellationPoint(
        LocationDescription{ElseIP, Loc.DL}, CanceledDirective);
    if (!IPOrErr)
      return IPOrErr;
  }

  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = createRuntimeFunctionCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCancellationPoint(
    const LocationDescription &Loc, omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();
  Builder.SetInsertPoint(UI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = createRuntimeFunctionCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  updateToLocation(Loc);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = createRuntimeFunctionCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}

Error OpenMPIRBuilder::emitCancelationCheckImpl(
    Value *CancelFlag, omp::Directive CanceledDirective) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  auto &FI = FinalizationStack.back();
  Expected<BasicBlock *> FiniBBOrErr = FI.getFiniBB(Builder);
  if (!FiniBBOrErr)
    return FiniBBOrErr.takeError();
  Builder.SetInsertPoint(CancellationBlock);
  Builder.CreateBr(*FiniBBOrErr);

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
  return Error::success();
}

/// Create a wrapper function used to gather the outlined function's argument
/// structure from a shared buffer and to forward it when running in Generic
/// mode.
///
/// The outlined function is expected to receive 2 integer arguments followed
/// by an optional pointer argument to an argument structure holding the rest.
static Function *createTargetParallelWrapper(OpenMPIRBuilder *OMPIRBuilder,
                                             Function &OutlinedFn) {
  size_t NumArgs = OutlinedFn.arg_size();
  assert((NumArgs == 2 || NumArgs == 3) &&
         "expected a 2-3 argument parallel outlined function");
  bool UseArgStruct = NumArgs == 3;

  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  IRBuilder<>::InsertPointGuard IPG(Builder);
  auto *FnTy = FunctionType::get(Builder.getVoidTy(),
                                 {Builder.getInt16Ty(), Builder.getInt32Ty()},
                                 /*isVarArg=*/false);
  auto *WrapperFn =
      Function::Create(FnTy, GlobalValue::InternalLinkage,
                       OutlinedFn.getName() + ".wrapper", OMPIRBuilder->M);

  WrapperFn->addParamAttr(0, Attribute::NoUndef);
  WrapperFn->addParamAttr(0, Attribute::ZExt);
  WrapperFn->addParamAttr(1, Attribute::NoUndef);

  BasicBlock *EntryBB =
      BasicBlock::Create(OMPIRBuilder->M.getContext(), "entry", WrapperFn);
  Builder.SetInsertPoint(EntryBB);

  // Allocation.
  Value *AddrAlloca = Builder.CreateAlloca(Builder.getInt32Ty(),
                                           /*ArraySize=*/nullptr, "addr");
  AddrAlloca = Builder.CreatePointerBitCastOrAddrSpaceCast(
      AddrAlloca, Builder.getPtrTy(/*AddrSpace=*/0),
      AddrAlloca->getName() + ".ascast");

  Value *ZeroAlloca = Builder.CreateAlloca(Builder.getInt32Ty(),
                                           /*ArraySize=*/nullptr, "zero");
  ZeroAlloca = Builder.CreatePointerBitCastOrAddrSpaceCast(
      ZeroAlloca, Builder.getPtrTy(/*AddrSpace=*/0),
      ZeroAlloca->getName() + ".ascast");

  Value *ArgsAlloca = nullptr;
  if (UseArgStruct) {
    ArgsAlloca = Builder.CreateAlloca(Builder.getPtrTy(),
                                      /*ArraySize=*/nullptr, "global_args");
    ArgsAlloca = Builder.CreatePointerBitCastOrAddrSpaceCast(
        ArgsAlloca, Builder.getPtrTy(/*AddrSpace=*/0),
        ArgsAlloca->getName() + ".ascast");
  }

  // Initialization.
  Builder.CreateStore(WrapperFn->getArg(1), AddrAlloca);
  Builder.CreateStore(Builder.getInt32(0), ZeroAlloca);
  if (UseArgStruct) {
    Builder.CreateCall(
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(
            llvm::omp::RuntimeFunction::OMPRTL___kmpc_get_shared_variables),
        {ArgsAlloca});
  }

  SmallVector<Value *, 3> Args{AddrAlloca, ZeroAlloca};

  // Load structArg from global_args.
  if (UseArgStruct) {
    Value *StructArg = Builder.CreateLoad(Builder.getPtrTy(), ArgsAlloca);
    StructArg = Builder.CreateInBoundsGEP(Builder.getPtrTy(), StructArg,
                                          {Builder.getInt64(0)});
    StructArg = Builder.CreateLoad(Builder.getPtrTy(), StructArg, "structArg");
    Args.push_back(StructArg);
  }

  // Call the outlined function holding the parallel body.
  Builder.CreateCall(&OutlinedFn, Args);
  Builder.CreateRetVoid();

  return WrapperFn;
}
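
// The generated wrapper has roughly this shape (illustrative IR):
//   define internal void @outlined.wrapper(i16 noundef zeroext %0,
//                                          i32 noundef %1) {
//   entry:
//     %addr = alloca i32          ; receives the thread id argument
//     %zero = alloca i32          ; bound thread id, always 0
//     %global_args = alloca ptr
//     call void @__kmpc_get_shared_variables(ptr %global_args)
//     ; ... load element 0 of the shared-variable array as %structArg ...
//     call void @outlined(ptr %addr, ptr %zero, ptr %structArg)
//     ret void
//   }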
1573
1574// Callback used to create OpenMP runtime calls to support
1575// omp parallel clause for the device.
1576// We need to use this callback to replace call to the OutlinedFn in OuterFn
1577// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_60)
1579 OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
1580 BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
1581 Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1582 Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
1583 assert(OutlinedFn.arg_size() >= 2 &&
1584 "Expected at least tid and bounded tid as arguments");
1585 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1586
1587 // Add some known attributes.
1588 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1589 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1590 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1591 OutlinedFn.addParamAttr(0, Attribute::NoUndef);
1592 OutlinedFn.addParamAttr(1, Attribute::NoUndef);
1593 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1594
1595 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1596 assert(CI && "Expected call instruction to outlined function");
1597 CI->getParent()->setName("omp_parallel");
1598
1599 Builder.SetInsertPoint(CI);
1600 Type *PtrTy = OMPIRBuilder->VoidPtr;
1601
1602 // Add alloca for kernel args
1603 OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP();
1604 Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
1605 AllocaInst *ArgsAlloca =
1606 Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
1607 Value *Args = ArgsAlloca;
1608 // Add address space cast if array for storing arguments is not allocated
1609 // in address space 0
1610 if (ArgsAlloca->getAddressSpace())
1611 Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
1612 Builder.restoreIP(CurrentIP);
1613
1614 // Store captured vars which are used by kmpc_parallel_60
1615 for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
1616 Value *V = *(CI->arg_begin() + 2 + Idx);
1617 Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
1618 ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
1619 Builder.CreateStore(V, StoreAddress);
1620 }
1621
1622 Value *Cond =
1623 IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
1624 : Builder.getInt32(1);
1625 Value *NumThreadsArg =
1626 NumThreads ? Builder.CreateZExtOrTrunc(NumThreads, OMPIRBuilder->Int32)
1627 : Builder.getInt32(-1);
1628
1629 // If this is not a Generic kernel, we can skip generating the wrapper.
1630 Value *WrapperFn;
1631 if (isGenericKernel(*OuterFn))
1632 WrapperFn = createTargetParallelWrapper(OMPIRBuilder, OutlinedFn);
1633 else
1634 WrapperFn = Constant::getNullValue(PtrTy);
1635
1636 // Build kmpc_parallel_60 call
1637 Value *Parallel60CallArgs[] = {
1638 /* identifier*/ Ident,
1639 /* global thread num*/ ThreadID,
1640 /* if expression */ Cond,
1641 /* number of threads */ NumThreadsArg,
1642 /* Proc bind */ Builder.getInt32(-1),
1643 /* outlined function */ &OutlinedFn,
1644 /* wrapper function */ WrapperFn,
1645 /* arguments of the outlined function */ Args,
1646 /* number of arguments */ Builder.getInt64(NumCapturedVars),
1647 /* strict for number of threads */ Builder.getInt32(0)};
1648
1649 FunctionCallee RTLFn =
1650 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_60);
1651
1652 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, Parallel60CallArgs);
1653
1654 LLVM_DEBUG(dbgs() << "With kmpc_parallel_60 placed: "
1655 << *Builder.GetInsertBlock()->getParent() << "\n");
1656
1657 // Initialize the local TID stack location with the argument value.
1658 Builder.SetInsertPoint(PrivTID);
1659 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1660 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1661 PrivTIDAddr);
1662
1663 // Remove redundant call to the outlined function.
1664 CI->eraseFromParent();
1665
1666 for (Instruction *I : ToBeDeleted) {
1667 I->eraseFromParent();
1668 }
1669}
1670
1671// Callback used to create OpenMP runtime calls to support the
1672// omp parallel clause for the host.
1673// We need this callback to replace the call to OutlinedFn in OuterFn
1674// with a call to the OpenMP host runtime function (__kmpc_fork_call[_if]).
1675static void
1676hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
1677 Function *OuterFn, Value *Ident, Value *IfCondition,
1678 Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1679 const SmallVector<Instruction *, 4> &ToBeDeleted) {
1680 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1681 FunctionCallee RTLFn;
1682 if (IfCondition) {
1683 RTLFn =
1684 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
1685 } else {
1686 RTLFn =
1687 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
1688 }
1689 if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
1690 if (!F->hasMetadata(LLVMContext::MD_callback)) {
1691 LLVMContext &Ctx = F->getContext();
1692 MDBuilder MDB(Ctx);
1693 // Annotate the callback behavior of the __kmpc_fork_call:
1694 // - The callback callee is argument number 2 (microtask).
1695 // - The first two arguments of the callback callee are unknown (-1).
1696 // - All variadic arguments to the __kmpc_fork_call are passed to the
1697 // callback callee.
1698 F->addMetadata(LLVMContext::MD_callback,
1699 *MDNode::get(Ctx, {MDB.createCallbackEncoding(
1700 2, {-1, -1},
1701 /* VarArgsArePassed */ true)}));
1702 }
1703 }
1704 // Add some known attributes.
1705 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1706 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1707 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1708
1709 assert(OutlinedFn.arg_size() >= 2 &&
1710 "Expected at least tid and bounded tid as arguments");
1711 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1712
1713 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1714 CI->getParent()->setName("omp_parallel");
1715 Builder.SetInsertPoint(CI);
1716
1717 // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
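 // For instance (illustrative only), a region capturing two pointers %a and
 // %b lowers to:
 //   call void (ptr, i32, ptr, ...)
 //     @__kmpc_fork_call(ptr @loc, i32 2, ptr @outlined, ptr %a, ptr %b)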
1718 Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
1719 &OutlinedFn};
1720
1721 SmallVector<Value *, 16> RealArgs;
1722 RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
1723 if (IfCondition) {
1724 Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
1725 RealArgs.push_back(Cond);
1726 }
1727 RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
1728
1729 // __kmpc_fork_call_if always expects a void ptr as the last argument.
1730 // If there are no arguments, pass a null pointer.
1731 auto PtrTy = OMPIRBuilder->VoidPtr;
1732 if (IfCondition && NumCapturedVars == 0) {
1733 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1734 RealArgs.push_back(NullPtrValue);
1735 }
1736
1737 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
1738
1739 LLVM_DEBUG(dbgs() << "With fork_call placed: "
1740 << *Builder.GetInsertBlock()->getParent() << "\n");
1741
1742 // Initialize the local TID stack location with the argument value.
1743 Builder.SetInsertPoint(PrivTID);
1744 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1745 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1746 PrivTIDAddr);
1747
1748 // Remove redundant call to the outlined function.
1749 CI->eraseFromParent();
1750
1751 for (Instruction *I : ToBeDeleted) {
1752 I->eraseFromParent();
1753 }
1754}
1755
1756OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
1757 const LocationDescription &Loc, InsertPointTy OuterAllocIP,
1758 ArrayRef<BasicBlock *> OuterDeallocBlocks, BodyGenCallbackTy BodyGenCB,
1759 PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition,
1760 Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable) {
1761 assert(!isConflictIP(Loc.IP, OuterAllocIP) && "IPs must not be ambiguous");
1762
1763 if (!updateToLocation(Loc))
1764 return Loc.IP;
1765
1766 uint32_t SrcLocStrSize;
1767 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1768 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1769 const bool NeedThreadID = NumThreads || Config.isTargetDevice() ||
1770 (ProcBind != OMP_PROC_BIND_default);
1771 Value *ThreadID = NeedThreadID ? getOrCreateThreadID(Ident) : nullptr;
1772 // If we generate code for the target device, we need to allocate the
1773 // struct for aggregate params in the device default alloca address space.
1774 // The OpenMP runtime requires that the params of the extracted functions are
1775 // passed as zero address space pointers. This flag ensures that extracted
1776 // function arguments are declared in the zero address space.
1777 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1778
1779 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1780 // only if we compile for the host side.
1781 if (NumThreads && !Config.isTargetDevice()) {
1782 Value *Args[] = {
1783 Ident, ThreadID,
1784 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1785 createRuntimeFunctionCall(
1786 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1787 }
1788
1789 if (ProcBind != OMP_PROC_BIND_default) {
1790 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1791 Value *Args[] = {
1792 Ident, ThreadID,
1793 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1794 createRuntimeFunctionCall(
1795 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1796 }
1797
1798 BasicBlock *InsertBB = Builder.GetInsertBlock();
1799 Function *OuterFn = InsertBB->getParent();
1800
1801 // Save the outer alloca block because the insertion iterator may get
1802 // invalidated and we still need this later.
1803 BasicBlock *OuterAllocaBlock = OuterAllocIP.getBlock();
1804
1805 // Vector to remember instructions we used only during the modeling but which
1806 // we want to delete at the end.
1807 SmallVector<Instruction *, 4> ToBeDeleted;
1808
1809 // Change the location to the outer alloca insertion point to create and
1810 // initialize the allocas we pass into the parallel region.
1811 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1812 Builder.restoreIP(NewOuter);
1813 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1814 AllocaInst *ZeroAddrAlloca =
1815 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1816 Instruction *TIDAddr = TIDAddrAlloca;
1817 Instruction *ZeroAddr = ZeroAddrAlloca;
1818 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1819 // Add additional casts to enforce pointers in zero address space
1820 TIDAddr = new AddrSpaceCastInst(
1821 TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
1822 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1823 ToBeDeleted.push_back(TIDAddr);
1824 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1825 PointerType::get(M.getContext(), 0),
1826 "zero.addr.ascast");
1827 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1828 ToBeDeleted.push_back(ZeroAddr);
1829 }
1830
1831 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1832 // associated arguments in the outlined function, so we delete them later.
1833 ToBeDeleted.push_back(TIDAddrAlloca);
1834 ToBeDeleted.push_back(ZeroAddrAlloca);
1835
1836 // Create an artificial insertion point that will also ensure the blocks we
1837 // are about to split are not degenerated.
1838 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1839
1840 BasicBlock *EntryBB = UI->getParent();
1841 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1842 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1843 BasicBlock *PRegPreFiniBB =
1844 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1845 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1846
1847 auto FiniCBWrapper = [&](InsertPointTy IP) {
1848 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1849 // target to the region exit block.
1850 if (IP.getBlock()->end() == IP.getPoint()) {
1851 IRBuilder<>::InsertPointGuard IPG(Builder);
1852 Builder.restoreIP(IP);
1853 Instruction *I = Builder.CreateBr(PRegExitBB);
1854 IP = InsertPointTy(I->getParent(), I->getIterator());
1855 }
1856 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1857 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1858 "Unexpected insertion point for finalization call!");
1859 return FiniCB(IP);
1860 };
1861
1862 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1863
1864 // Generate the privatization allocas in the block that will become the entry
1865 // of the outlined function.
1866 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1867 InsertPointTy InnerAllocaIP = Builder.saveIP();
1868
1869 AllocaInst *PrivTIDAddr =
1870 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1871 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1872
1873 // Add some fake uses for OpenMP provided arguments.
1874 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1875 Instruction *ZeroAddrUse =
1876 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1877 ToBeDeleted.push_back(ZeroAddrUse);
1878
1879 // EntryBB
1880 // |
1881 // V
1882 // PRegionEntryBB <- Privatization allocas are placed here.
1883 // |
1884 // V
1885 // PRegionBodyBB <- BodyGen is invoked here.
1886 // |
1887 // V
1888 // PRegPreFiniBB <- The block we will start finalization from.
1889 // |
1890 // V
1891 // PRegionExitBB <- A common exit to simplify block collection.
1892 //
1893
1894 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1895
1896 // Let the caller create the body.
1897 assert(BodyGenCB && "Expected body generation callback!");
1898 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1899 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP, PRegExitBB))
1900 return Err;
1901
1902 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1903
1904 // If OuterFn is a Generic kernel, we need to use device shared memory to
1905 // allocate argument structures. Otherwise, we use stack allocations as usual.
1906 bool UsesDeviceSharedMemory =
1907 Config.isTargetDevice() && isGenericKernel(*OuterFn);
1908 std::unique_ptr<OutlineInfo> OI =
1909 UsesDeviceSharedMemory
1910 ? std::make_unique<DeviceSharedMemOutlineInfo>(*this)
1911 : std::make_unique<OutlineInfo>();
1912
1913 if (Config.isTargetDevice()) {
1914 // Generate OpenMP target specific runtime call
1915 OI->PostOutlineCB = [=, ToBeDeletedVec =
1916 std::move(ToBeDeleted)](Function &OutlinedFn) {
1917 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1918 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1919 ThreadID, ToBeDeletedVec);
1920 };
1921 } else {
1922 // Generate OpenMP host runtime call
1923 OI->PostOutlineCB = [=, ToBeDeletedVec =
1924 std::move(ToBeDeleted)](Function &OutlinedFn) {
1925 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1926 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1927 };
1928 }
1929
1930 OI->FixUpNonEntryAllocas = true;
1931 OI->OuterAllocBB = OuterAllocaBlock;
1932 OI->EntryBB = PRegEntryBB;
1933 OI->ExitBB = PRegExitBB;
1934 OI->OuterDeallocBBs.reserve(OuterDeallocBlocks.size());
1935 copy(OuterDeallocBlocks, std::back_inserter(OI->OuterDeallocBBs));
1936
1937 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1938 SmallVector<BasicBlock *, 32> Blocks;
1939 OI->collectBlocks(ParallelRegionBlockSet, Blocks);
1940
1941 CodeExtractorAnalysisCache CEAC(*OuterFn);
1942 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1943 /* AggregateArgs */ false,
1944 /* BlockFrequencyInfo */ nullptr,
1945 /* BranchProbabilityInfo */ nullptr,
1946 /* AssumptionCache */ nullptr,
1947 /* AllowVarArgs */ true,
1948 /* AllowAlloca */ true,
1949 /* AllocationBlock */ OuterAllocaBlock,
1950 /* DeallocationBlocks */ {},
1951 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1952
1953 // Find inputs to, outputs from the code region.
1954 BasicBlock *CommonExit = nullptr;
1955 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1956 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1957
1958 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1959 /*CollectGlobalInputs=*/true);
1960
1961 Inputs.remove_if([&](Value *I) {
1962 if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
1963 return GV->getValueType() == OpenMPIRBuilder::Ident;
1964
1965 return false;
1966 });
1967
1968 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1969
1970 FunctionCallee TIDRTLFn =
1971 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1972
1973 auto PrivHelper = [&](Value &V) -> Error {
1974 if (&V == TIDAddr || &V == ZeroAddr) {
1975 OI->ExcludeArgsFromAggregate.push_back(&V);
1976 return Error::success();
1977 }
1978
1979 SetVector<Use *> Uses;
1980 for (Use &U : V.uses())
1981 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1982 if (ParallelRegionBlockSet.count(UserI->getParent()))
1983 Uses.insert(&U);
1984
1985 // __kmpc_fork_call expects extra arguments as pointers. If the input
1986 // already has a pointer type, everything is fine. Otherwise, store the
1987 // value onto the stack and load it back inside the to-be-outlined region.
1988 // This ensures that only the pointer is passed to the function.
1989 // FIXME: if there are more than 15 trailing arguments, they must be
1990 // additionally packed in a struct.
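 // E.g., an i32 captured by value gets a ".reloaded" stack slot: the value is
 // stored to it before the call and loaded back inside the region, so only a
 // pointer crosses the outlining boundary.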
1991 Value *Inner = &V;
1992 if (!V.getType()->isPointerTy()) {
1993 IRBuilder<>::InsertPointGuard Guard(Builder);
1994 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1995
1996 Builder.restoreIP(OuterAllocIP);
1997 Value *Ptr;
1998 if (UsesDeviceSharedMemory) {
1999 // Use device shared memory instead, if needed.
2000 Ptr = createOMPAllocShared(OuterAllocIP, V.getType(),
2001 V.getName() + ".reloaded");
2002 for (BasicBlock *DeallocBlock : OuterDeallocBlocks)
2003 createOMPFreeShared(
2004 InsertPointTy(DeallocBlock, DeallocBlock->getFirstInsertionPt()),
2005 Ptr, V.getType());
2006 } else {
2007 Ptr = Builder.CreateAlloca(V.getType(), nullptr,
2008 V.getName() + ".reloaded");
2009 }
2010
2011 // Store to stack at end of the block that currently branches to the entry
2012 // block of the to-be-outlined region.
2013 Builder.SetInsertPoint(InsertBB,
2014 InsertBB->getTerminator()->getIterator());
2015 Builder.CreateStore(&V, Ptr);
2016
2017 // Load back next to allocations in the to-be-outlined region.
2018 Builder.restoreIP(InnerAllocaIP);
2019 Inner = Builder.CreateLoad(V.getType(), Ptr);
2020 }
2021
2022 Value *ReplacementValue = nullptr;
2023 CallInst *CI = dyn_cast<CallInst>(&V);
2024 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
2025 ReplacementValue = PrivTID;
2026 } else {
2027 InsertPointOrErrorTy AfterIP =
2028 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
2029 if (!AfterIP)
2030 return AfterIP.takeError();
2031 Builder.restoreIP(*AfterIP);
2032 InnerAllocaIP = {
2033 InnerAllocaIP.getBlock(),
2034 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
2035
2036 assert(ReplacementValue &&
2037 "Expected copy/create callback to set replacement value!");
2038 if (ReplacementValue == &V)
2039 return Error::success();
2040 }
2041
2042 for (Use *UPtr : Uses)
2043 UPtr->set(ReplacementValue);
2044
2045 return Error::success();
2046 };
2047
2048 // Reset the inner alloca insertion as it will be used for loading the values
2049 // wrapped into pointers before passing them into the to-be-outlined region.
2050 // Configure it to insert immediately after the fake use of zero address so
2051 // that they are available in the generated body and so that the
2052 // OpenMP-related values (thread ID and zero address pointers) remain leading
2053 // in the argument list.
2054 InnerAllocaIP = IRBuilder<>::InsertPoint(
2055 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
2056
2057 // Reset the outer alloca insertion point to the entry of the relevant block
2058 // in case it was invalidated.
2059 OuterAllocIP = IRBuilder<>::InsertPoint(
2060 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
2061
2062 for (Value *Input : Inputs) {
2063 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
2064 if (Error Err = PrivHelper(*Input))
2065 return Err;
2066 }
2067 LLVM_DEBUG({
2068 for (Value *Output : Outputs)
2069 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
2070 });
2071 assert(Outputs.empty() &&
2072 "OpenMP outlining should not produce live-out values!");
2073
2074 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
2075 LLVM_DEBUG({
2076 for (auto *BB : Blocks)
2077 dbgs() << " PBR: " << BB->getName() << "\n";
2078 });
2079
2080 // Adjust the finalization stack, verify the adjustment, and call the
2081 // finalize function a last time to finalize values between the pre-fini
2082 // block and the exit block if we left the parallel region "the normal way".
2083 auto FiniInfo = FinalizationStack.pop_back_val();
2084 (void)FiniInfo;
2085 assert(FiniInfo.DK == OMPD_parallel &&
2086 "Unexpected finalization stack state!");
2087
2088 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
2089
2090 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
2091 Expected<BasicBlock *> FiniBBOrErr = FiniInfo.getFiniBB(Builder);
2092 if (!FiniBBOrErr)
2093 return FiniBBOrErr.takeError();
2094 {
2095 IRBuilder<>::InsertPointGuard IPG(Builder);
2096 Builder.restoreIP(PreFiniIP);
2097 Builder.CreateBr(*FiniBBOrErr);
2098 // There's currently a branch to omp.par.exit. Delete it; we will get there
2099 // via the fini block.
2100 if (Instruction *Term = Builder.GetInsertBlock()->getTerminator())
2101 Term->eraseFromParent();
2102 }
2103
2104 // Register the outlined info.
2105 addOutlineInfo(std::move(OI));
2106
2107 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
2108 UI->eraseFromParent();
2109
2110 return AfterIP;
2111}
2112
2113void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
2114 // Build call void __kmpc_flush(ident_t *loc)
2115 uint32_t SrcLocStrSize;
2116 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2117 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
2118
2119 createRuntimeFunctionCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush),
2120 Args);
2121}
2122
2123void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
2124 if (!updateToLocation(Loc))
2125 return;
2126 emitFlush(Loc);
2127}
2128
2129void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
2130 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
2131 // global_tid);
2132 uint32_t SrcLocStrSize;
2133 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2134 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2135 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
2136
2137 // Ignore return result until untied tasks are supported.
2138 createRuntimeFunctionCall(
2139 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait), Args);
2140}
2141
2142void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
2143 if (!updateToLocation(Loc))
2144 return;
2145 emitTaskwaitImpl(Loc);
2146}
2147
2148void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
2149 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
2150 uint32_t SrcLocStrSize;
2151 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2152 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2153 Constant *I32Null = ConstantInt::getNullValue(Int32);
2154 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
2155
2156 createRuntimeFunctionCall(
2157 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield), Args);
2158}
2159
2160void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
2161 if (!updateToLocation(Loc))
2162 return;
2163 emitTaskyieldImpl(Loc);
2164}
2165
2166void OpenMPIRBuilder::emitTaskDependency(IRBuilderBase &Builder, Value *Entry,
2167 const DependData &Dep) {
2168 // Store the pointer to the variable
2169 Value *Addr = Builder.CreateStructGEP(
2170 DependInfo, Entry,
2171 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
2172 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, SizeTy);
2173 Builder.CreateStore(DepValPtr, Addr);
2174 // Store the size of the variable
2175 Value *Size = Builder.CreateStructGEP(
2176 DependInfo, Entry, static_cast<unsigned int>(RTLDependInfoFields::Len));
2177 Builder.CreateStore(
2178 ConstantInt::get(SizeTy,
2179 M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
2180 Size);
2181 // Store the dependency kind
2182 Value *Flags = Builder.CreateStructGEP(
2183 DependInfo, Entry, static_cast<unsigned int>(RTLDependInfoFields::Flags));
2184 Builder.CreateStore(ConstantInt::get(Builder.getInt8Ty(),
2185 static_cast<unsigned int>(Dep.DepKind)),
2186 Flags);
2187}
2188
2189// Processes the dependencies in Dependencies and does the following
2190// - Allocates space on the stack for an array of DependInfo objects
2191// - Populates each DependInfo object with relevant information about
2192// the corresponding dependence.
2193// - All code is inserted in the entry block of the current function.
2194static Value *emitTaskDependencies(
2195 OpenMPIRBuilder &OMPBuilder,
2196 const SmallVectorImpl<OpenMPIRBuilder::DependData> &Dependencies) {
2197 // Early return if we have no dependencies to process
2198 if (Dependencies.empty())
2199 return nullptr;
2200
2201 // Given a vector of DependData objects, in this function we create an
2202 // array on the stack that holds kmp_depend_info objects corresponding
2203 // to each dependency. This is then passed to the OpenMP runtime.
2204 // For example, if there are 'n' dependencies then the following pseudo
2205 // code is generated. Assume the first dependence is on a variable 'a'
2206 //
2207 // \code{c}
2208 // DepArray = alloc(n x sizeof(kmp_depend_info));
2209 // idx = 0;
2210 // DepArray[idx].base_addr = ptrtoint(&a);
2211 // DepArray[idx].len = 8;
2212 // DepArray[idx].flags = Dep.DepKind; /*(See OMPContants.h for DepKind)*/
2213 // ++idx;
2214 // DepArray[idx].base_addr = ...;
2215 // \endcode
2216
2217 IRBuilderBase &Builder = OMPBuilder.Builder;
2218 Type *DependInfo = OMPBuilder.DependInfo;
2219
2220 Value *DepArray = nullptr;
2221 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
2222 Builder.SetInsertPoint(
2223 OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
2224
2225 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
2226 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
2227
2228 Builder.restoreIP(OldIP);
2229
2230 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
2231 Value *Base =
2232 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
2233 OMPBuilder.emitTaskDependency(Builder, Base, Dep);
2234 }
2235 return DepArray;
2236}
2237
2238/// Create the task duplication function passed to kmpc_taskloop.
2239Expected<Value *> OpenMPIRBuilder::createTaskDuplicationFunction(
2240 Type *PrivatesTy, int32_t PrivatesIndex, TaskDupCallbackTy DupCB) {
2241 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
2242 if (!DupCB)
2243 return ConstantPointerNull::get(
2244 PointerType::get(Builder.getContext(), ProgramAddressSpace));
2245
2246 // From OpenMP Runtime p_task_dup_t:
2247 // Routine optionally generated by the compiler for setting the lastprivate
2248 // flag and calling needed constructors for private/firstprivate objects
2249 // (used to form taskloop tasks from pattern task).
2250 // Parameters: dest task, src task, lastprivate flag.
2251 // typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
2252
2253 auto *VoidPtrTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
2254
2255 FunctionType *DupFuncTy = FunctionType::get(
2256 Builder.getVoidTy(), {VoidPtrTy, VoidPtrTy, Builder.getInt32Ty()},
2257 /*isVarArg=*/false);
2258
2259 Function *DupFunction = Function::Create(DupFuncTy, Function::InternalLinkage,
2260 "omp_taskloop_dup", M);
2261 Value *DestTaskArg = DupFunction->getArg(0);
2262 Value *SrcTaskArg = DupFunction->getArg(1);
2263 Value *LastprivateFlagArg = DupFunction->getArg(2);
2264 DestTaskArg->setName("dest_task");
2265 SrcTaskArg->setName("src_task");
2266 LastprivateFlagArg->setName("lastprivate_flag");
2267
2268 IRBuilderBase::InsertPointGuard Guard(Builder);
2269 Builder.SetInsertPoint(
2270 BasicBlock::Create(Builder.getContext(), "entry", DupFunction));
2271
2272 auto GetTaskContextPtrFromArg = [&](Value *Arg) -> Value * {
2273 Type *TaskWithPrivatesTy =
2274 StructType::get(Builder.getContext(), {Task, PrivatesTy});
2275 Value *TaskPrivates = Builder.CreateGEP(
2276 TaskWithPrivatesTy, Arg, {Builder.getInt32(0), Builder.getInt32(1)});
2277 Value *ContextPtr = Builder.CreateGEP(
2278 PrivatesTy, TaskPrivates,
2279 {Builder.getInt32(0), Builder.getInt32(PrivatesIndex)});
2280 return ContextPtr;
2281 };
2282
2283 Value *DestTaskContextPtr = GetTaskContextPtrFromArg(DestTaskArg);
2284 Value *SrcTaskContextPtr = GetTaskContextPtrFromArg(SrcTaskArg);
2285
2286 DestTaskContextPtr->setName("destPtr");
2287 SrcTaskContextPtr->setName("srcPtr");
2288
2289 InsertPointTy AllocaIP(&DupFunction->getEntryBlock(),
2290 DupFunction->getEntryBlock().begin());
2291 InsertPointTy CodeGenIP = Builder.saveIP();
2292 Expected<IRBuilderBase::InsertPoint> AfterIPOrError =
2293 DupCB(AllocaIP, CodeGenIP, DestTaskContextPtr, SrcTaskContextPtr);
2294 if (!AfterIPOrError)
2295 return AfterIPOrError.takeError();
2296 Builder.restoreIP(*AfterIPOrError);
2297
2298 Builder.CreateRetVoid();
2299
2300 return DupFunction;
2301}
2302
2303OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
2304 const LocationDescription &Loc, InsertPointTy AllocaIP,
2305 ArrayRef<BasicBlock *> DeallocBlocks, BodyGenCallbackTy BodyGenCB,
2306 llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
2307 Value *LBVal, Value *UBVal, Value *StepVal, bool Untied, Value *IfCond,
2308 Value *GrainSize, bool NoGroup, int Sched, Value *Final, bool Mergeable,
2309 Value *Priority, uint64_t NumOfCollapseLoops, TaskDupCallbackTy DupCB,
2310 Value *TaskContextStructPtrVal) {
2311
2312 if (!updateToLocation(Loc))
2313 return InsertPointTy();
2314
2315 uint32_t SrcLocStrSize;
2316 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2317 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2318
2319 BasicBlock *TaskloopExitBB =
2320 splitBB(Builder, /*CreateBranch=*/true, "taskloop.exit");
2321 BasicBlock *TaskloopBodyBB =
2322 splitBB(Builder, /*CreateBranch=*/true, "taskloop.body");
2323 BasicBlock *TaskloopAllocaBB =
2324 splitBB(Builder, /*CreateBranch=*/true, "taskloop.alloca");
2325
2326 InsertPointTy TaskloopAllocaIP =
2327 InsertPointTy(TaskloopAllocaBB, TaskloopAllocaBB->begin());
2328 InsertPointTy TaskloopBodyIP =
2329 InsertPointTy(TaskloopBodyBB, TaskloopBodyBB->begin());
2330
2331 if (Error Err = BodyGenCB(TaskloopAllocaIP, TaskloopBodyIP, TaskloopExitBB))
2332 return Err;
2333
2334 llvm::Expected<llvm::CanonicalLoopInfo *> Result = LoopInfo();
2335 if (!Result) {
2336 return Result.takeError();
2337 }
2338
2339 llvm::CanonicalLoopInfo *CLI = Result.get();
2340 auto OI = std::make_unique<OutlineInfo>();
2341 OI->EntryBB = TaskloopAllocaBB;
2342 OI->OuterAllocBB = AllocaIP.getBlock();
2343 OI->ExitBB = TaskloopExitBB;
2344 OI->OuterDeallocBBs.reserve(DeallocBlocks.size());
2345 copy(DeallocBlocks, std::back_inserter(OI->OuterDeallocBBs));
2346
2347 // Add the thread ID argument.
2348 SmallVector<Instruction *> ToBeDeleted;
2349 // dummy instruction to be used as a fake argument
2350 OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal(
2351 Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, "global.tid", false));
2352 Value *FakeLB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2353 TaskloopAllocaIP, "lb", false, true);
2354 Value *FakeUB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2355 TaskloopAllocaIP, "ub", false, true);
2356 Value *FakeStep = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2357 TaskloopAllocaIP, "step", false, true);
2358 // For Taskloop, we want to force the bounds to be the first 3 inputs in the
2359 // aggregate struct.
2360 OI->Inputs.insert(FakeLB);
2361 OI->Inputs.insert(FakeUB);
2362 OI->Inputs.insert(FakeStep);
2363 if (TaskContextStructPtrVal)
2364 OI->Inputs.insert(TaskContextStructPtrVal);
2365 assert(((TaskContextStructPtrVal && DupCB) ||
2366 (!TaskContextStructPtrVal && !DupCB)) &&
2367 "Task context struct ptr and duplication callback must be both set "
2368 "or both null");
2369
2370 // It isn't safe to run the duplication bodygen callback inside the post
2371 // outlining callback, so this has to be run now, before we know the real
2372 // task shareds structure type.
2373 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
2374 Type *PointerTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
2375 Type *FakeSharedsTy = StructType::get(
2376 Builder.getContext(),
2377 {FakeLB->getType(), FakeUB->getType(), FakeStep->getType(), PointerTy});
2378 Expected<Value *> TaskDupFnOrErr = createTaskDuplicationFunction(
2379 FakeSharedsTy,
2380 /*PrivatesIndex: the pointer after the three indices above*/ 3, DupCB);
2381 if (!TaskDupFnOrErr) {
2382 return TaskDupFnOrErr.takeError();
2383 }
2384 Value *TaskDupFn = *TaskDupFnOrErr;
2385
2386 OI->PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Untied,
2387 TaskloopAllocaBB, CLI, Loc, TaskDupFn, ToBeDeleted,
2388 IfCond, GrainSize, NoGroup, Sched, FakeLB, FakeUB,
2389 FakeStep, FakeSharedsTy, Final, Mergeable, Priority,
2390 NumOfCollapseLoops](Function &OutlinedFn) mutable {
2391 // Replace the Stale CI by appropriate RTL function call.
2392 assert(OutlinedFn.hasOneUse() &&
2393 "there must be a single user for the outlined function");
2394 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2395
2396 // Create casts of the bounds values that can be used when outlining to
2397 // replace the uses of the fake values with the real ones.
2398 BasicBlock *CodeReplBB = StaleCI->getParent();
2399 Builder.SetInsertPoint(CodeReplBB->getFirstInsertionPt());
2400 Value *CastedLBVal =
2401 Builder.CreateIntCast(LBVal, Builder.getInt64Ty(), true, "lb64");
2402 Value *CastedUBVal =
2403 Builder.CreateIntCast(UBVal, Builder.getInt64Ty(), true, "ub64");
2404 Value *CastedStepVal =
2405 Builder.CreateIntCast(StepVal, Builder.getInt64Ty(), true, "step64");
2406
2407 Builder.SetInsertPoint(StaleCI);
2408
2409 // Gather the arguments for emitting the runtime call for
2410 // @__kmpc_omp_task_alloc
2411 Function *TaskAllocFn =
2412 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2413
2414 Value *ThreadID = getOrCreateThreadID(Ident);
2415
2416 if (!NoGroup) {
2417 // Emit runtime call for @__kmpc_taskgroup
2418 Function *TaskgroupFn =
2419 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2420 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2421 }
2422
2423 // `flags` Argument Configuration
2424 // Task is tied if (Flags & 1) == 1.
2425 // Task is untied if (Flags & 1) == 0.
2426 // Task is final if (Flags & 2) == 2.
2427 // Task is not final if (Flags & 2) == 0.
2428 // Task is mergeable if (Flags & 4) == 4.
2429 // Task is not mergeable if (Flags & 4) == 0.
2430 // Task is priority if (Flags & 32) == 32.
2431 // Task is not priority if (Flags & 32) == 0.
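 // E.g., a tied, final, mergeable task with a priority clause yields
 // Flags = 1 | 2 | 4 | 32 = 39 (illustrative).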
2432 Value *Flags = Builder.getInt32(Untied ? 0 : 1);
2433 if (Final)
2434 Flags = Builder.CreateOr(Builder.getInt32(2), Flags);
2435 if (Mergeable)
2436 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2437 if (Priority)
2438 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2439
2440 Value *TaskSize = Builder.getInt64(
2441 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
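 // (divideCeil rounds the bit size up to whole bytes, e.g. a 225-bit type
 // yields 29 bytes)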
2442
2443 AllocaInst *ArgStructAlloca =
2444 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
2445 assert(ArgStructAlloca &&
2446 "Unable to find the alloca instruction corresponding to arguments "
2447 "for extracted function");
2448 std::optional<TypeSize> ArgAllocSize =
2449 ArgStructAlloca->getAllocationSize(M.getDataLayout());
2450 assert(ArgAllocSize &&
2451 "Unable to determine size of arguments for extracted function");
2452 Value *SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
2453
2454 // Emit the @__kmpc_omp_task_alloc runtime call
2455 // The runtime call returns a pointer to an area where the task captured
2456 // variables must be copied before the task is run (TaskData)
2457 CallInst *TaskData = Builder.CreateCall(
2458 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2459 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2460 /*task_func=*/&OutlinedFn});
2461
2462 Value *Shareds = StaleCI->getArgOperand(1);
2463 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2464 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2465 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2466 SharedsSize);
2467 // Get the pointers to the loop lb, ub, and step from the task ptr
2468 // and set up the lower bound, upper bound, and step values.
2469 llvm::Value *Lb = Builder.CreateGEP(
2470 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(0)});
2471
2472 llvm::Value *Ub = Builder.CreateGEP(
2473 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(1)});
2474
2475 llvm::Value *Step = Builder.CreateGEP(
2476 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(2)});
2477 llvm::Value *Loadstep = Builder.CreateLoad(Builder.getInt64Ty(), Step);
2478
2479 // set up the arguments for emitting kmpc_taskloop runtime call
2480 // setting values for ifval, nogroup, sched, grainsize, task_dup
2481 Value *IfCondVal =
2482 IfCond ? Builder.CreateIntCast(IfCond, Builder.getInt32Ty(), true)
2483 : Builder.getInt32(1);
2484 // As __kmpc_taskgroup is called manually in OMPIRBuilder, NoGroupVal should
2485 // always be 1 when calling __kmpc_taskloop to ensure it is not called again
2486 Value *NoGroupVal = Builder.getInt32(1);
2487 Value *SchedVal = Builder.getInt32(Sched);
2488 Value *GrainSizeVal =
2489 GrainSize ? Builder.CreateIntCast(GrainSize, Builder.getInt64Ty(), true)
2490 : Builder.getInt64(0);
2491 Value *TaskDup = TaskDupFn;
2492
2493 Value *Args[] = {Ident, ThreadID, TaskData, IfCondVal, Lb, Ub,
2494 Loadstep, NoGroupVal, SchedVal, GrainSizeVal, TaskDup};
2495
2496 // taskloop runtime call
2497 Function *TaskloopFn =
2498 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskloop);
2499 Builder.CreateCall(TaskloopFn, Args);
2500
2501 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup if
2502 // nogroup is not defined
2503 if (!NoGroup) {
2504 Function *EndTaskgroupFn =
2505 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2506 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2507 }
2508
2509 StaleCI->eraseFromParent();
2510
2511 Builder.SetInsertPoint(TaskloopAllocaBB, TaskloopAllocaBB->begin());
2512
2513 LoadInst *SharedsOutlined =
2514 Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2515 OutlinedFn.getArg(1)->replaceUsesWithIf(
2516 SharedsOutlined,
2517 [SharedsOutlined](Use &U) { return U.getUser() != SharedsOutlined; });
2518
2519 Value *IV = CLI->getIndVar();
2520 Type *IVTy = IV->getType();
2521 Constant *One = ConstantInt::get(Builder.getInt64Ty(), 1);
2522
2523 // When outlining, CodeExtractor will create GEPs to the LowerBound and
2524 // UpperBound. These GEPs can be reused for loading the task's respective
2525 // bounds.
2526 Value *TaskLB = nullptr;
2527 Value *TaskUB = nullptr;
2528 Value *TaskStep = nullptr;
2529 Value *LoadTaskLB = nullptr;
2530 Value *LoadTaskUB = nullptr;
2531 Value *LoadTaskStep = nullptr;
2532 for (Instruction &I : *TaskloopAllocaBB) {
2533 if (I.getOpcode() == Instruction::GetElementPtr) {
2534 GetElementPtrInst &Gep = cast<GetElementPtrInst>(I);
2535 if (ConstantInt *CI = dyn_cast<ConstantInt>(Gep.getOperand(2))) {
2536 switch (CI->getZExtValue()) {
2537 case 0:
2538 TaskLB = &I;
2539 break;
2540 case 1:
2541 TaskUB = &I;
2542 break;
2543 case 2:
2544 TaskStep = &I;
2545 break;
2546 }
2547 }
2548 } else if (I.getOpcode() == Instruction::Load) {
2549 LoadInst &Load = cast<LoadInst>(I);
2550 if (Load.getPointerOperand() == TaskLB) {
2551 assert(TaskLB != nullptr && "Expected value for TaskLB");
2552 LoadTaskLB = &I;
2553 } else if (Load.getPointerOperand() == TaskUB) {
2554 assert(TaskUB != nullptr && "Expected value for TaskUB");
2555 LoadTaskUB = &I;
2556 } else if (Load.getPointerOperand() == TaskStep) {
2557 assert(TaskStep != nullptr && "Expected value for TaskStep");
2558 LoadTaskStep = &I;
2559 }
2560 }
2561 }
2562
2563 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
2564
2565 assert(LoadTaskLB != nullptr && "Expected value for LoadTaskLB");
2566 assert(LoadTaskUB != nullptr && "Expected value for LoadTaskUB");
2567 assert(LoadTaskStep != nullptr && "Expected value for LoadTaskStep");
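 // The bounds are treated as inclusive here; e.g. lb = 0, ub = 9, step = 1
 // gives (9 - 0) / 1 + 1 = 10 iterations.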
2568 Value *TripCountMinusOne = Builder.CreateSDiv(
2569 Builder.CreateSub(LoadTaskUB, LoadTaskLB), LoadTaskStep);
2570 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt");
2571 Value *CastedTripCount = Builder.CreateIntCast(TripCount, IVTy, true);
2572 Value *CastedTaskLB = Builder.CreateIntCast(LoadTaskLB, IVTy, true);
2573 // set the trip count in the CLI
2574 CLI->setTripCount(CastedTripCount);
2575
2576 Builder.SetInsertPoint(CLI->getBody(),
2577 CLI->getBody()->getFirstInsertionPt());
2578
2579 if (NumOfCollapseLoops > 1) {
2580 llvm::SmallVector<User *> UsersToReplace;
2581 // When using the collapse clause, the bounds of the loop have to be
2582 // adjusted to properly represent the iterator of the outer loop.
2583 Value *IVPlusTaskLB = Builder.CreateAdd(
2584 CLI->getIndVar(),
2585 Builder.CreateSub(CastedTaskLB, ConstantInt::get(IVTy, 1)));
2586 // To ensure every Use is correctly captured, we first want to record
2587 // which users to replace the value in, and then replace the value.
2588 for (auto IVUse = CLI->getIndVar()->uses().begin();
2589 IVUse != CLI->getIndVar()->uses().end(); IVUse++) {
2590 User *IVUser = IVUse->getUser();
2591 if (auto *Op = dyn_cast<BinaryOperator>(IVUser)) {
2592 if (Op->getOpcode() == Instruction::URem ||
2593 Op->getOpcode() == Instruction::UDiv) {
2594 UsersToReplace.push_back(IVUser);
2595 }
2596 }
2597 }
2598 for (User *U : UsersToReplace) {
2599 U->replaceUsesOfWith(CLI->getIndVar(), IVPlusTaskLB);
2600 }
2601 } else {
2602 // The canonical loop is generated with a fixed lower bound. We need to
2603 // update the index calculation code to use the task's lower bound. The
2604 // generated code looks like this:
2605 // %omp_loop.iv = phi ...
2606 // ...
2607 // %tmp = mul [type] %omp_loop.iv, step
2608 // %user_index = add [type] tmp, lb
2609 // OpenMPIRBuilder constructs canonical loops to have exactly three uses
2610 // of the normalised induction variable:
2611 // 1. This one: converting the normalised IV to the user IV
2612 // 2. The increment (add)
2613 // 3. The comparison against the trip count (icmp)
2614 // (1) is the only use that is a mul followed by an add so this cannot
2615 // match other IR.
2616 assert(CLI->getIndVar()->getNumUses() == 3 &&
2617 "Canonical loop should have exactly three uses of the ind var");
2618 for (User *IVUser : CLI->getIndVar()->users()) {
2619 if (auto *Mul = dyn_cast<BinaryOperator>(IVUser)) {
2620 if (Mul->getOpcode() == Instruction::Mul) {
2621 for (User *MulUser : Mul->users()) {
2622 if (auto *Add = dyn_cast<BinaryOperator>(MulUser)) {
2623 if (Add->getOpcode() == Instruction::Add) {
2624 Add->setOperand(1, CastedTaskLB);
2625 }
2626 }
2627 }
2628 }
2629 }
2630 }
2631 }
2632
2633 FakeLB->replaceAllUsesWith(CastedLBVal);
2634 FakeUB->replaceAllUsesWith(CastedUBVal);
2635 FakeStep->replaceAllUsesWith(CastedStepVal);
2636 for (Instruction *I : llvm::reverse(ToBeDeleted)) {
2637 I->eraseFromParent();
2638 }
2639 };
2640
2641 addOutlineInfo(std::move(OI));
2642 Builder.SetInsertPoint(TaskloopExitBB, TaskloopExitBB->begin());
2643 return Builder.saveIP();
2644}
2645
2647 Type *IntPtrTy = Type::getIntNTy(
2648 M.getContext(), M.getDataLayout().getPointerSizeInBits());
2649 return llvm::StructType::get(IntPtrTy, IntPtrTy,
2650 llvm::Type::getInt32Ty(M.getContext()));
2651}
2652
2653OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
2654 const LocationDescription &Loc, InsertPointTy AllocaIP,
2655 ArrayRef<BasicBlock *> DeallocBlocks, BodyGenCallbackTy BodyGenCB,
2656 bool Tied, Value *Final, Value *IfCondition,
2657 const DependenciesInfo &Dependencies, const AffinityData &Affinities,
2658 bool Mergeable, Value *EventHandle, Value *Priority) {
2659
2660 if (!updateToLocation(Loc))
2661 return InsertPointTy();
2662
2663 uint32_t SrcLocStrSize;
2664 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2665 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2666 // The current basic block is split into four basic blocks. After outlining,
2667 // they will be mapped as follows:
2668 // ```
2669 // def current_fn() {
2670 // current_basic_block:
2671 // br label %task.exit
2672 // task.exit:
2673 // ; instructions after task
2674 // }
2675 // def outlined_fn() {
2676 // task.alloca:
2677 // br label %task.body
2678 // task.body:
2679 // ret void
2680 // }
2681 // ```
2682 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
2683 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
2684 BasicBlock *TaskAllocaBB =
2685 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
2686
2687 InsertPointTy TaskAllocaIP =
2688 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
2689 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
2690 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP, TaskExitBB))
2691 return Err;
2692
2693 auto OI = std::make_unique<OutlineInfo>();
2694 OI->EntryBB = TaskAllocaBB;
2695 OI->OuterAllocBB = AllocaIP.getBlock();
2696 OI->ExitBB = TaskExitBB;
2697 OI->OuterDeallocBBs.reserve(DeallocBlocks.size());
2698 copy(DeallocBlocks, std::back_inserter(OI->OuterDeallocBBs));
2699
2700 // Add the thread ID argument.
2701 SmallVector<Instruction *, 4> ToBeDeleted;
2702 OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal(
2703 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
2704
2705 OI->PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
2706 Affinities, Mergeable, Priority, EventHandle,
2707 TaskAllocaBB,
2708 ToBeDeleted](Function &OutlinedFn) mutable {
2709 // Replace the Stale CI by appropriate RTL function call.
2710 assert(OutlinedFn.hasOneUse() &&
2711 "there must be a single user for the outlined function");
2712 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2713
2714 // HasShareds is true if any variables are captured in the outlined region,
2715 // false otherwise.
2716 bool HasShareds = StaleCI->arg_size() > 1;
2717 Builder.SetInsertPoint(StaleCI);
2718
2719 // Gather the arguments for emitting the runtime call for
2720 // @__kmpc_omp_task_alloc
2721 Function *TaskAllocFn =
2722 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2723
2724 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
2725 // call.
2726 Value *ThreadID = getOrCreateThreadID(Ident);
2727
2728 // Argument - `flags`
2729 // Task is tied iff (Flags & 1) == 1.
2730 // Task is untied iff (Flags & 1) == 0.
2731 // Task is final iff (Flags & 2) == 2.
2732 // Task is not final iff (Flags & 2) == 0.
2733 // Task is mergeable iff (Flags & 4) == 4.
2734 // Task is not mergeable iff (Flags & 4) == 0.
2735 // Task is priority iff (Flags & 32) == 32.
2736 // Task is not priority iff (Flags & 32) == 0.
2737 // TODO: Handle the other flags.
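 // E.g., an untied task with only a priority clause ends up with
 // Flags = 0 | 32 = 32 (illustrative).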
2738 Value *Flags = Builder.getInt32(Tied);
2739 if (Final) {
2740 Value *FinalFlag =
2741 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2742 Flags = Builder.CreateOr(FinalFlag, Flags);
2743 }
2744
2745 if (Mergeable)
2746 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2747 if (Priority)
2748 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2749
2750 // Argument - `sizeof_kmp_task_t` (TaskSize)
2751 // TaskSize refers to the size in bytes of the kmp_task_t data structure
2752 // including private vars accessed in the task.
2753 // TODO: add kmp_task_t_with_privates (privates)
2754 Value *TaskSize = Builder.getInt64(
2755 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2756
2757 // Argument - `sizeof_shareds` (SharedsSize)
2758 // SharedsSize refers to the shareds array size in the kmp_task_t data
2759 // structure.
2760 Value *SharedsSize = Builder.getInt64(0);
2761 if (HasShareds) {
2762 AllocaInst *ArgStructAlloca =
2763 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
2764 assert(ArgStructAlloca &&
2765 "Unable to find the alloca instruction corresponding to arguments "
2766 "for extracted function");
2767 std::optional<TypeSize> ArgAllocSize =
2768 ArgStructAlloca->getAllocationSize(M.getDataLayout());
2769 assert(ArgAllocSize &&
2770 "Unable to determine size of arguments for extracted function");
2771 SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
2772 }
2773 // Emit the @__kmpc_omp_task_alloc runtime call
2774 // The runtime call returns a pointer to an area where the task captured
2775 // variables must be copied before the task is run (TaskData)
2776 CallInst *TaskData = createRuntimeFunctionCall(
2777 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2778 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2779 /*task_func=*/&OutlinedFn});
2780
2781 if (Affinities.Count && Affinities.Info) {
2782 Function *RegAffFn = getOrCreateRuntimeFunctionPtr(
2783 OMPRTL___kmpc_omp_reg_task_with_affinity);
2784
2785 createRuntimeFunctionCall(RegAffFn, {Ident, ThreadID, TaskData,
2786 Affinities.Count, Affinities.Info});
2787 }
2788
2789 // Emit detach clause initialization.
2790 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2791 // task_descriptor);
2792 if (EventHandle) {
2793 Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
2794 OMPRTL___kmpc_task_allow_completion_event);
2795 llvm::Value *EventVal =
2796 createRuntimeFunctionCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2797 llvm::Value *EventHandleAddr =
2798 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2799 Builder.getPtrTy(0));
2800 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2801 Builder.CreateStore(EventVal, EventHandleAddr);
2802 }
2803 // Copy the arguments for the outlined function.
2804 if (HasShareds) {
2805 Value *Shareds = StaleCI->getArgOperand(1);
2806 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2807 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2808 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2809 SharedsSize);
2810 }
2811
2812 if (Priority) {
2813 //
2814 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2815 // we populate the priority information into the "kmp_task_t" here
2816 //
2817 // The struct "kmp_task_t" definition is available in kmp.h
2818 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2819 // data2 is used for priority
2820 //
2821 Type *Int32Ty = Builder.getInt32Ty();
2822 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2823 // kmp_task_t* => { ptr }
2824 Type *TaskPtr = StructType::get(VoidPtr);
2825 Value *TaskGEP =
2826 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2827 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2828 Type *TaskStructType = StructType::get(
2829 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2830 Value *PriorityData = Builder.CreateInBoundsGEP(
2831 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2832 // kmp_cmplrdata_t => { ptr, ptr }
2833 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2834 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2835 PriorityData, {Zero, Zero});
2836 Builder.CreateStore(Priority, CmplrData);
2837 }
2838
2839 Value *DepArray = nullptr;
2840 Value *NumDeps = nullptr;
2841 if (Dependencies.DepArray) {
2842 DepArray = Dependencies.DepArray;
2843 NumDeps = Dependencies.NumDeps;
2844 } else if (!Dependencies.Deps.empty()) {
2845 DepArray = emitTaskDependencies(*this, Dependencies.Deps);
2846 NumDeps = Builder.getInt32(Dependencies.Deps.size());
2847 }
2848
2849 // In the presence of the `if` clause, the following IR is generated:
2850 // ...
2851 // %data = call @__kmpc_omp_task_alloc(...)
2852 // br i1 %if_condition, label %then, label %else
2853 // then:
2854 // call @__kmpc_omp_task(...)
2855 // br label %exit
2856 // else:
2857 // ;; Wait for resolution of dependencies, if any, before
2858 // ;; beginning the task
2859 // call @__kmpc_omp_wait_deps(...)
2860 // call @__kmpc_omp_task_begin_if0(...)
2861 // call @outlined_fn(...)
2862 // call @__kmpc_omp_task_complete_if0(...)
2863 // br label %exit
2864 // exit:
2865 // ...
2866 if (IfCondition) {
2867 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2868 // terminator.
2869 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2870 Instruction *IfTerminator =
2871 Builder.GetInsertPoint()->getParent()->getTerminator();
2872 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2873 Builder.SetInsertPoint(IfTerminator);
2874 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2875 &ElseTI);
2876 Builder.SetInsertPoint(ElseTI);
2877
2878 if (DepArray) {
2879 Function *TaskWaitFn =
2880 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2881 createRuntimeFunctionCall(
2882 TaskWaitFn,
2883 {Ident, ThreadID, NumDeps, DepArray,
2884 ConstantInt::get(Builder.getInt32Ty(), 0),
2885 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
2886 }
2887 Function *TaskBeginFn =
2888 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2889 Function *TaskCompleteFn =
2890 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2891 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2892 CallInst *CI = nullptr;
2893 if (HasShareds)
2894 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID, TaskData});
2895 else
2896 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID});
2897 CI->setDebugLoc(StaleCI->getDebugLoc());
2898 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2899 Builder.SetInsertPoint(ThenTI);
2900 }
2901
2902 if (DepArray) {
2903 Function *TaskFn =
2904 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2905 createRuntimeFunctionCall(
2906 TaskFn,
2907 {Ident, ThreadID, TaskData, NumDeps, DepArray,
2908 ConstantInt::get(Builder.getInt32Ty(), 0),
2909 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
2910
2911 } else {
2912 // Emit the @__kmpc_omp_task runtime call to spawn the task
2913 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2914 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
2915 }
2916
2917 StaleCI->eraseFromParent();
2918
2919 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2920 if (HasShareds) {
2921 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2922 OutlinedFn.getArg(1)->replaceUsesWithIf(
2923 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2924 }
2925
2926 for (Instruction *I : llvm::reverse(ToBeDeleted))
2927 I->eraseFromParent();
2928 };
2929
2930 addOutlineInfo(std::move(OI));
2931 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2932
2933 return Builder.saveIP();
2934}
2935
2936OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskgroup(
2937 const LocationDescription &Loc, InsertPointTy AllocaIP,
2938 ArrayRef<BasicBlock *> DeallocBlocks, BodyGenCallbackTy BodyGenCB) {
2939 if (!updateToLocation(Loc))
2940 return InsertPointTy();
2941
2942 uint32_t SrcLocStrSize;
2943 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2944 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2945 Value *ThreadID = getOrCreateThreadID(Ident);
2946
2947 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2948 Function *TaskgroupFn =
2949 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2950 createRuntimeFunctionCall(TaskgroupFn, {Ident, ThreadID});
2951
2952 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2953 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP(), DeallocBlocks))
2954 return Err;
2955
2956 Builder.SetInsertPoint(TaskgroupExitBB);
2957 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2958 Function *EndTaskgroupFn =
2959 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2960 createRuntimeFunctionCall(EndTaskgroupFn, {Ident, ThreadID});
2961
2962 return Builder.saveIP();
2963}
2964
2965OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
2966 const LocationDescription &Loc, InsertPointTy AllocaIP,
2967 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2968 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2969 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2970
2971 if (!updateToLocation(Loc))
2972 return Loc.IP;
2973
2974 FinalizationStack.push_back({FiniCB, OMPD_sections, IsCancellable});
2975
2976 // Each section is emitted as a switch case
2977 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2978 // -> OMP.createSection() which generates the IR for each section
2979 // Iterate through all sections and emit a switch construct:
2980 // switch (IV) {
2981 // case 0:
2982 // <SectionStmt[0]>;
2983 // break;
2984 // ...
2985 // case <NumSection> - 1:
2986 // <SectionStmt[<NumSection> - 1]>;
2987 // break;
2988 // }
2989 // ...
2990 // section_loop.after:
2991 // <FiniCB>;
2992 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2993 Builder.restoreIP(CodeGenIP);
2994 BasicBlock *Continue =
2995 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2996 Function *CurFn = Continue->getParent();
2997 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2998
2999 unsigned CaseNumber = 0;
3000 for (auto SectionCB : SectionCBs) {
3001 BasicBlock *CaseBB = BasicBlock::Create(
3002 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
3003 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
3004 Builder.SetInsertPoint(CaseBB);
3005 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
3006 if (Error Err =
3007 SectionCB(InsertPointTy(),
3008 {CaseEndBr->getParent(), CaseEndBr->getIterator()}, {}))
3009 return Err;
3010 CaseNumber++;
3011 }
3012 // remove the existing terminator from body BB since there can be no
3013 // terminators after switch/case
3014 return Error::success();
3015 };
3016 // Loop body ends here
3017 // LowerBound, UpperBound, and Stride for createCanonicalLoop
3018 Type *I32Ty = Type::getInt32Ty(M.getContext());
3019 Value *LB = ConstantInt::get(I32Ty, 0);
3020 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
3021 Value *ST = ConstantInt::get(I32Ty, 1);
3022 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
3023 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
3024 if (!LoopInfo)
3025 return LoopInfo.takeError();
3026
3027 InsertPointOrErrorTy WsloopIP =
3028 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
3029 WorksharingLoopType::ForStaticLoop, !IsNowait);
3030 if (!WsloopIP)
3031 return WsloopIP.takeError();
3032 InsertPointTy AfterIP = *WsloopIP;
3033
3034 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
3035 assert(LoopFini && "Bad structure of static workshare loop finalization");
3036
3037 // Apply the finalization callback in LoopAfterBB
3038 auto FiniInfo = FinalizationStack.pop_back_val();
3039 assert(FiniInfo.DK == OMPD_sections &&
3040 "Unexpected finalization stack state!");
3041 if (Error Err = FiniInfo.mergeFiniBB(Builder, LoopFini))
3042 return Err;
3043
3044 return AfterIP;
3045}
3046
3049 BodyGenCallbackTy BodyGenCB,
3050 FinalizeCallbackTy FiniCB) {
3051 if (!updateToLocation(Loc))
3052 return Loc.IP;
3053
3054 auto FiniCBWrapper = [&](InsertPointTy IP) {
3055 if (IP.getBlock()->end() != IP.getPoint())
3056 return FiniCB(IP);
3057 // This must be done otherwise any nested constructs using FinalizeOMPRegion
3058 // will fail because that function requires the Finalization Basic Block to
3059 // have a terminator, which is already removed by EmitOMPRegionBody.
3060 // IP is currently at the cancellation block.
3061 // We need to backtrack to the condition block to fetch
3062 // the exit block and create a branch from the cancellation
3063 // block to the exit block.
3064 IRBuilder<>::InsertPointGuard IPG(Builder);
3065 Builder.restoreIP(IP);
3066 auto *CaseBB = Loc.IP.getBlock();
3067 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
3068 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
3069 Instruction *I = Builder.CreateBr(ExitBB);
3070 IP = InsertPointTy(I->getParent(), I->getIterator());
3071 return FiniCB(IP);
3072 };
3073
3074 Directive OMPD = Directive::OMPD_sections;
3075 // Since we are using Finalization Callback here, HasFinalize
3076 // and IsCancellable have to be true
3077 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
3078 /*Conditional*/ false, /*hasFinalize*/ true,
3079 /*IsCancellable*/ true);
3080}
3081
3082static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
3083 BasicBlock::iterator IT(I);
3084 IT++;
3085 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
3086}
3087
3088Value *OpenMPIRBuilder::getGPUThreadID() {
3089 return createRuntimeFunctionCall(
3090 getOrCreateRuntimeFunction(M,
3091 OMPRTL___kmpc_get_hardware_thread_id_in_block),
3092 {});
3093}
3094
3095Value *OpenMPIRBuilder::getGPUWarpSize() {
3096 return createRuntimeFunctionCall(
3097 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
3098}
3099
3100Value *OpenMPIRBuilder::getNVPTXWarpID() {
3101 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
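 // E.g., with the common warp size of 32, LaneIDBits is 5, so the warp ID is
 // tid >> 5 (illustrative).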
3102 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
3103}
3104
3105Value *OpenMPIRBuilder::getNVPTXLaneID() {
3106 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
3107 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
3108 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
3109 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
3110 "nvptx_lane_id");
3111}
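// For the common warp size of 32, LaneIDBits == 5 and LaneIDMask == 0x1f, so
// the two helpers above compute, in IR terms, roughly:
//
//   %nvptx_warp_id = ashr i32 %tid, 5    ; tid / 32
//   %nvptx_lane_id = and  i32 %tid, 31   ; tid % 32
//
// e.g. a hardware thread id of 70 yields warp id 2 and lane id 6.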
3112
3113Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
3114 Type *ToType) {
3115 Type *FromType = From->getType();
3116 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
3117 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
3118 assert(FromSize > 0 && "From size must be greater than zero");
3119 assert(ToSize > 0 && "To size must be greater than zero");
3120 if (FromType == ToType)
3121 return From;
3122 if (FromSize == ToSize)
3123 return Builder.CreateBitCast(From, ToType);
3124 if (ToType->isIntegerTy() && FromType->isIntegerTy())
3125 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
3126 InsertPointTy SaveIP = Builder.saveIP();
3127 Builder.restoreIP(AllocaIP);
3128 Value *CastItem = Builder.CreateAlloca(ToType);
3129 Builder.restoreIP(SaveIP);
3130
3131 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
3132 CastItem, Builder.getPtrTy(0));
3133 Builder.CreateStore(From, ValCastItem);
3134 return Builder.CreateLoad(ToType, CastItem);
3135}
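// castValueToType picks the cheapest legal conversion: same-size types are
// bitcast (e.g. float -> i32), integer-to-integer mismatches use a signed
// integer cast (e.g. i8 -> i32 via sext), and anything else is routed through
// a stack slot, conceptually:
//
//   %tmp = alloca <to-type>
//   store <from-type> %from, ptr %tmp
//   %res = load <to-type>, ptr %tmp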
3136
3137Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
3138 Value *Element,
3139 Type *ElementType,
3140 Value *Offset) {
3141 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
3142 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
3143
3144 // Cast all types to 32- or 64-bit values before calling shuffle routines.
3145 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
3146 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
3147 Value *WarpSize =
3148 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
3149 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
3150 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
3151 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
3152 Value *WarpSizeCast =
3153 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
3154 Value *ShuffleCall =
3155 createRuntimeFunctionCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
3156 return castValueToType(AllocaIP, ShuffleCall, CastTy);
3157}
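// For a 4-byte (or smaller) element the helper above emits roughly the
// following IR; wider elements take the analogous @__kmpc_shuffle_int64 path:
//
//   %warp_size = call i32 @__kmpc_get_warp_size()
//   %ws16 = trunc i32 %warp_size to i16
//   %res = call i32 @__kmpc_shuffle_int32(i32 %elem, i16 %offset, i16 %ws16)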
3158
3159void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
3160 Value *DstAddr, Type *ElemType,
3161 Value *Offset, Type *ReductionArrayTy,
3162 bool IsByRefElem) {
3163 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
3164 // Create the loop over the big sized data.
3165 // ptr = (void*)Elem;
3166 // ptrEnd = (void*) Elem + 1;
3167 // Step = 8;
3168 // while (ptr + Step < ptrEnd)
3169 // shuffle((int64_t)*ptr);
3170 // Step = 4;
3171 // while (ptr + Step < ptrEnd)
3172 // shuffle((int32_t)*ptr);
3173 // ...
3174 Type *IndexTy = Builder.getIndexTy(
3175 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3176 Value *ElemPtr = DstAddr;
3177 Value *Ptr = SrcAddr;
3178 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
3179 if (Size < IntSize)
3180 continue;
3181 Type *IntType = Builder.getIntNTy(IntSize * 8);
3182 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3183 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
3184 Value *SrcAddrGEP =
3185 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
3186 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3187 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
3188
3189 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3190 if ((Size / IntSize) > 1) {
3191 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
3192 SrcAddrGEP, Builder.getPtrTy());
3193 BasicBlock *PreCondBB =
3194 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
3195 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
3196 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
3197 BasicBlock *CurrentBB = Builder.GetInsertBlock();
3198 emitBlock(PreCondBB, CurFunc);
3199 PHINode *PhiSrc =
3200 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
3201 PhiSrc->addIncoming(Ptr, CurrentBB);
3202 PHINode *PhiDest =
3203 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
3204 PhiDest->addIncoming(ElemPtr, CurrentBB);
3205 Ptr = PhiSrc;
3206 ElemPtr = PhiDest;
3207 Value *PtrDiff = Builder.CreatePtrDiff(
3208 Builder.getInt8Ty(), PtrEnd,
3209 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
3210 Builder.CreateCondBr(
3211 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
3212 ExitBB);
3213 emitBlock(ThenBB, CurFunc);
3214 Value *Res = createRuntimeShuffleFunction(
3215 AllocaIP,
3216 Builder.CreateAlignedLoad(
3217 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
3218 IntType, Offset);
3219 Builder.CreateAlignedStore(Res, ElemPtr,
3220 M.getDataLayout().getPrefTypeAlign(ElemType));
3221 Value *LocalPtr =
3222 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
3223 Value *LocalElemPtr =
3224 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
3225 PhiSrc->addIncoming(LocalPtr, ThenBB);
3226 PhiDest->addIncoming(LocalElemPtr, ThenBB);
3227 emitBranch(PreCondBB);
3228 emitBlock(ExitBB, CurFunc);
3229 } else {
3230 Value *Res = createRuntimeShuffleFunction(
3231 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
3232 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
3233 Res->getType()->getScalarSizeInBits())
3234 Res = Builder.CreateTrunc(Res, ElemType);
3235 Builder.CreateStore(Res, ElemPtr);
3236 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
3237 ElemPtr =
3238 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
3239 }
3240 Size = Size % IntSize;
3241 }
3242}
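// Worked example: for a 12-byte element the loop above first runs with
// IntSize == 8 (12 / 8 == 1, so the single-shuffle path fires once and Size
// becomes 12 % 8 == 4) and then with IntSize == 4 for the remaining bytes.
// Only elements spanning more than one chunk of the current width take the
// PHI-based loop, which can shuffle an arbitrary number of chunks.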
3243
3244Error OpenMPIRBuilder::emitReductionListCopy(
3245 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
3246 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
3247 ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions) {
3248 Type *IndexTy = Builder.getIndexTy(
3249 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3250 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
3251
3252 // Iterate, element by element, through the source Reduce list and
3253 // make a copy.
3254 for (auto En : enumerate(ReductionInfos)) {
3255 const ReductionInfo &RI = En.value();
3256 Value *SrcElementAddr = nullptr;
3257 AllocaInst *DestAlloca = nullptr;
3258 Value *DestElementAddr = nullptr;
3259 Value *DestElementPtrAddr = nullptr;
3260 // Should we shuffle in an element from a remote lane?
3261 bool ShuffleInElement = false;
3262 // Set to true to update the pointer in the dest Reduce list to a
3263 // newly created element.
3264 bool UpdateDestListPtr = false;
3265
3266 // Step 1.1: Get the address for the src element in the Reduce list.
3267 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
3268 ReductionArrayTy, SrcBase,
3269 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3270 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
3271
3272 // Step 1.2: Create a temporary to store the element in the destination
3273 // Reduce list.
3274 DestElementPtrAddr = Builder.CreateInBoundsGEP(
3275 ReductionArrayTy, DestBase,
3276 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3277 bool IsByRefElem = (!IsByRef.empty() && IsByRef[En.index()]);
3278 switch (Action) {
3279 case CopyAction::RemoteLaneToThread: {
3280 InsertPointTy CurIP = Builder.saveIP();
3281 Builder.restoreIP(AllocaIP);
3282
3283 Type *DestAllocaType =
3284 IsByRefElem ? RI.ByRefAllocatedType : RI.ElementType;
3285 DestAlloca = Builder.CreateAlloca(DestAllocaType, nullptr,
3286 ".omp.reduction.element");
3287 DestAlloca->setAlignment(
3288 M.getDataLayout().getPrefTypeAlign(DestAllocaType));
3289 DestElementAddr = DestAlloca;
3290 DestElementAddr =
3291 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
3292 DestElementAddr->getName() + ".ascast");
3293 Builder.restoreIP(CurIP);
3294 ShuffleInElement = true;
3295 UpdateDestListPtr = true;
3296 break;
3297 }
3298 case CopyAction::ThreadCopy: {
3299 DestElementAddr =
3300 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
3301 break;
3302 }
3303 }
3304
3305 // Now that all active lanes have read the element in the
3306 // Reduce list, shuffle over the value from the remote lane.
3307 if (ShuffleInElement) {
3308 Type *ShuffleType = RI.ElementType;
3309 Value *ShuffleSrcAddr = SrcElementAddr;
3310 Value *ShuffleDestAddr = DestElementAddr;
3311 AllocaInst *LocalStorage = nullptr;
3312
3313 if (IsByRefElem) {
3314 assert(RI.ByRefElementType && "Expected by-ref element type to be set");
3315 assert(RI.ByRefAllocatedType &&
3316 "Expected by-ref allocated type to be set");
3317 // For by-ref reductions, we need to copy from the remote lane the
3318 // actual value of the partial reduction computed by that remote lane,
3319 // rather than, for example, a pointer to that data or, even worse, a
3320 // pointer to the descriptor of the by-ref reduction element.
3321 ShuffleType = RI.ByRefElementType;
3322
3323 if (RI.DataPtrPtrGen) {
3324 // Descriptor-based by-ref: extract data pointer from descriptor.
3325 InsertPointOrErrorTy GenResult = RI.DataPtrPtrGen(
3326 Builder.saveIP(), ShuffleSrcAddr, ShuffleSrcAddr);
3327
3328 if (!GenResult)
3329 return GenResult.takeError();
3330
3331 ShuffleSrcAddr =
3332 Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr);
3333
3334 {
3335 InsertPointTy OldIP = Builder.saveIP();
3336 Builder.restoreIP(AllocaIP);
3337
3338 LocalStorage = Builder.CreateAlloca(ShuffleType);
3339 Builder.restoreIP(OldIP);
3340 ShuffleDestAddr = LocalStorage;
3341 }
3342 } else {
3343 // Non-descriptor by-ref: the pointer already references data
3344 // directly. Shuffle into the destination alloca.
3345 ShuffleDestAddr = DestElementAddr;
3346 }
3347 }
3348
3349 shuffleAndStore(AllocaIP, ShuffleSrcAddr, ShuffleDestAddr, ShuffleType,
3350 RemoteLaneOffset, ReductionArrayTy, IsByRefElem);
3351
3352 if (IsByRefElem && RI.DataPtrPtrGen) {
3353 // Copy descriptor from source and update base_ptr to shuffled data
3354 Value *DestDescriptorAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3355 DestAlloca, Builder.getPtrTy(), ".ascast");
3356
3357 InsertPointOrErrorTy GenResult = generateReductionDescriptor(
3358 DestDescriptorAddr, LocalStorage, SrcElementAddr,
3359 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
3360
3361 if (!GenResult)
3362 return GenResult.takeError();
3363 }
3364 } else {
3365 switch (RI.EvaluationKind) {
3366 case EvalKind::Scalar: {
3367 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
3368 // Store the source element value to the dest element address.
3369 Builder.CreateStore(Elem, DestElementAddr);
3370 break;
3371 }
3372 case EvalKind::Complex: {
3373 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3374 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
3375 Value *SrcReal = Builder.CreateLoad(
3376 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3377 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3378 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
3379 Value *SrcImg = Builder.CreateLoad(
3380 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3381
3382 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3383 RI.ElementType, DestElementAddr, 0, 0, ".realp");
3384 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3385 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
3386 Builder.CreateStore(SrcReal, DestRealPtr);
3387 Builder.CreateStore(SrcImg, DestImgPtr);
3388 break;
3389 }
3390 case EvalKind::Aggregate: {
3391 Value *SizeVal = Builder.getInt64(
3392 M.getDataLayout().getTypeStoreSize(RI.ElementType));
3393 Builder.CreateMemCpy(
3394 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3395 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3396 SizeVal, false);
3397 break;
3398 }
3399 }
3400 }
3401
3402 // Step 3.1: Modify reference in dest Reduce list as needed.
3403 // Modifying the reference in Reduce list to point to the newly
3404 // created element. The element is live in the current function
3405 // scope and that of functions it invokes (i.e., reduce_function).
3406 // RemoteReduceData[i] = (void*)&RemoteElem
3407 if (UpdateDestListPtr) {
3408 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3409 DestElementAddr, Builder.getPtrTy(),
3410 DestElementAddr->getName() + ".ascast");
3411 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
3412 }
3413 }
3414
3415 return Error::success();
3416}
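// Conceptually, the two copy actions reduce to:
//
//   RemoteReduceData[i] = (void *)&RemoteElem;  // RemoteLaneToThread: make a
//                                               // private alloca, shuffle the
//                                               // remote value into it, and
//                                               // repoint the dest list.
//   *DestReduceData[i] = *SrcReduceData[i];     // ThreadCopy: plain copy into
//                                               // the existing dest element.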
3417
3418Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
3419 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
3420 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3421 InsertPointTy SavedIP = Builder.saveIP();
3422 LLVMContext &Ctx = M.getContext();
3423 FunctionType *FuncTy = FunctionType::get(
3424 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
3425 /* IsVarArg */ false);
3426 Function *WcFunc =
3427 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3428 "_omp_reduction_inter_warp_copy_func", &M);
3429 WcFunc->setAttributes(FuncAttrs);
3430 WcFunc->addParamAttr(0, Attribute::NoUndef);
3431 WcFunc->addParamAttr(1, Attribute::NoUndef);
3432 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
3433 Builder.SetInsertPoint(EntryBB);
3434
3435 // ReduceList: thread local Reduce list.
3436 // At the stage of the computation when this function is called, partially
3437 // aggregated values reside in the first lane of every active warp.
3438 Argument *ReduceListArg = WcFunc->getArg(0);
3439 // NumWarps: number of warps active in the parallel region. This could
3440 // be smaller than 32 (max warps in a CTA) for partial block reduction.
3441 Argument *NumWarpsArg = WcFunc->getArg(1);
3442
3443 // This array is used as a medium to transfer, one reduce element at a time,
3444 // the data from the first lane of every warp to lanes in the first warp
3445 // in order to perform the final step of a reduction in a parallel region
3446 // (reduction across warps). The array is placed in NVPTX __shared__ memory
3447 // for reduced latency, as well as to have a distinct copy for concurrently
3448 // executing target regions. The array is declared with weak linkage so
3449 // that a single copy is shared across compilation units.
3450 StringRef TransferMediumName =
3451 "__openmp_nvptx_data_transfer_temporary_storage";
3452 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
3453 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
3454 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
3455 if (!TransferMedium) {
3456 TransferMedium = new GlobalVariable(
3457 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
3458 UndefValue::get(ArrayTy), TransferMediumName,
3459 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
3460 /*AddressSpace=*/3);
3461 }
3462
3463 // Get the CUDA thread id of the current OpenMP thread on the GPU.
3464 Value *GPUThreadID = getGPUThreadID();
3465 // nvptx_lane_id = nvptx_id % warpsize
3466 Value *LaneID = getNVPTXLaneID();
3467 // nvptx_warp_id = nvptx_id / warpsize
3468 Value *WarpID = getNVPTXWarpID();
3469
3470 InsertPointTy AllocaIP =
3471 InsertPointTy(Builder.GetInsertBlock(),
3472 Builder.GetInsertBlock()->getFirstInsertionPt());
3473 Type *Arg0Type = ReduceListArg->getType();
3474 Type *Arg1Type = NumWarpsArg->getType();
3475 Builder.restoreIP(AllocaIP);
3476 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
3477 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
3478 AllocaInst *NumWarpsAlloca =
3479 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
3480 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3481 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
3482 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3483 NumWarpsAlloca, Builder.getPtrTy(0),
3484 NumWarpsAlloca->getName() + ".ascast");
3485 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3486 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
3487 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
3488 InsertPointTy CodeGenIP =
3489 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
3490 Builder.restoreIP(CodeGenIP);
3491
3492 Value *ReduceList =
3493 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
3494
3495 for (auto En : enumerate(ReductionInfos)) {
3496 //
3497 // Warp master copies reduce element to transfer medium in __shared__
3498 // memory.
3499 //
3500 const ReductionInfo &RI = En.value();
3501 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
3502 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(
3503 IsByRefElem ? RI.ByRefElementType : RI.ElementType);
3504 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
3505 Type *CType = Builder.getIntNTy(TySize * 8);
3506
3507 unsigned NumIters = RealTySize / TySize;
3508 if (NumIters == 0)
3509 continue;
3510 Value *Cnt = nullptr;
3511 Value *CntAddr = nullptr;
3512 BasicBlock *PrecondBB = nullptr;
3513 BasicBlock *ExitBB = nullptr;
3514 if (NumIters > 1) {
3515 CodeGenIP = Builder.saveIP();
3516 Builder.restoreIP(AllocaIP);
3517 CntAddr =
3518 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
3519
3520 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
3521 CntAddr->getName() + ".ascast");
3522 Builder.restoreIP(CodeGenIP);
3523 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
3524 CntAddr,
3525 /*Volatile=*/false);
3526 PrecondBB = BasicBlock::Create(Ctx, "precond");
3527 ExitBB = BasicBlock::Create(Ctx, "exit");
3528 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
3529 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
3530 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
3531 /*Volatile=*/false);
3532 Value *Cmp = Builder.CreateICmpULT(
3533 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
3534 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
3535 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
3536 }
3537
3538 // kmpc_barrier.
3539 InsertPointOrErrorTy BarrierIP1 =
3540 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3541 omp::Directive::OMPD_unknown,
3542 /* ForceSimpleCall */ false,
3543 /* CheckCancelFlag */ true);
3544 if (!BarrierIP1)
3545 return BarrierIP1.takeError();
3546 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3547 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3548 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3549
3550 // if (lane_id == 0)
3551 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
3552 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3553 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3554
3555 // Reduce element = LocalReduceList[i]
3556 auto *RedListArrayTy =
3557 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3558 Type *IndexTy = Builder.getIndexTy(
3559 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3560 Value *ElemPtrPtr =
3561 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3562 {ConstantInt::get(IndexTy, 0),
3563 ConstantInt::get(IndexTy, En.index())});
3564 // elemptr = ((CopyType*)(elemptrptr)) + I
3565 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3566
3567 if (IsByRefElem && RI.DataPtrPtrGen) {
3568 InsertPointOrErrorTy GenRes =
3569 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3570
3571 if (!GenRes)
3572 return GenRes.takeError();
3573
3574 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3575 }
3576
3577 if (NumIters > 1)
3578 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
3579
3580 // Get pointer to location in transfer medium.
3581 // MediumPtr = &medium[warp_id]
3582 Value *MediumPtr = Builder.CreateInBoundsGEP(
3583 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
3584 // elem = *elemptr
3585 //*MediumPtr = elem
3586 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
3587 // Store the source element value to the dest element address.
3588 Builder.CreateStore(Elem, MediumPtr,
3589 /*IsVolatile*/ true);
3590 Builder.CreateBr(MergeBB);
3591
3592 // else
3593 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3594 Builder.CreateBr(MergeBB);
3595
3596 // endif
3597 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3598 InsertPointOrErrorTy BarrierIP2 =
3599 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3600 omp::Directive::OMPD_unknown,
3601 /* ForceSimpleCall */ false,
3602 /* CheckCancelFlag */ true);
3603 if (!BarrierIP2)
3604 return BarrierIP2.takeError();
3605
3606 // Warp 0 copies reduce element from transfer medium
3607 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
3608 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
3609 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
3610
3611 Value *NumWarpsVal =
3612 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
3613 // Up to 32 threads in warp 0 are active.
3614 Value *IsActiveThread =
3615 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
3616 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3617
3618 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
3619
3620 // SrcMediumPtr = &medium[tid]
3621 // SrcMediumVal = *SrcMediumPtr
3622 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
3623 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
3624 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
3625 Value *TargetElemPtrPtr =
3626 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3627 {ConstantInt::get(IndexTy, 0),
3628 ConstantInt::get(IndexTy, En.index())});
3629 Value *TargetElemPtrVal =
3630 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
3631 Value *TargetElemPtr = TargetElemPtrVal;
3632
3633 if (IsByRefElem && RI.DataPtrPtrGen) {
3634 InsertPointOrErrorTy GenRes =
3635 RI.DataPtrPtrGen(Builder.saveIP(), TargetElemPtr, TargetElemPtr);
3636
3637 if (!GenRes)
3638 return GenRes.takeError();
3639
3640 TargetElemPtr = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtr);
3641 }
3642
3643 if (NumIters > 1)
3644 TargetElemPtr =
3645 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
3646
3647 // *TargetElemPtr = SrcMediumVal;
3648 Value *SrcMediumValue =
3649 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
3650 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
3651 Builder.CreateBr(W0MergeBB);
3652
3653 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
3654 Builder.CreateBr(W0MergeBB);
3655
3656 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
3657
3658 if (NumIters > 1) {
3659 Cnt = Builder.CreateNSWAdd(
3660 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
3661 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
3662
3663 auto *CurFn = Builder.GetInsertBlock()->getParent();
3664 emitBranch(PrecondBB);
3665 emitBlock(ExitBB, CurFn);
3666 }
3667 RealTySize %= TySize;
3668 }
3669 }
3670
3671 Builder.CreateRetVoid();
3672 Builder.restoreIP(SavedIP);
3673
3674 return WcFunc;
3675}
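// The generated function is roughly equivalent to the following pseudo-C,
// executed by every thread of the team:
//
//   void inter_warp_copy(void **reduce_list, int num_warps) {
//     for (each reduce element, in chunks of at most 4 bytes) {
//       barrier();
//       if (lane_id == 0)            // warp master publishes its chunk
//         medium[warp_id] = *elem;
//       barrier();
//       if (thread_id < num_warps)   // warp 0 gathers all published chunks
//         *elem = medium[thread_id];
//     }
//   }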
3676
3677Expected<Function *> OpenMPIRBuilder::emitShuffleAndReduceFunction(
3678 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3679 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3680 LLVMContext &Ctx = M.getContext();
3681 FunctionType *FuncTy =
3682 FunctionType::get(Builder.getVoidTy(),
3683 {Builder.getPtrTy(), Builder.getInt16Ty(),
3684 Builder.getInt16Ty(), Builder.getInt16Ty()},
3685 /* IsVarArg */ false);
3686 Function *SarFunc =
3687 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3688 "_omp_reduction_shuffle_and_reduce_func", &M);
3689 SarFunc->setAttributes(FuncAttrs);
3690 SarFunc->addParamAttr(0, Attribute::NoUndef);
3691 SarFunc->addParamAttr(1, Attribute::NoUndef);
3692 SarFunc->addParamAttr(2, Attribute::NoUndef);
3693 SarFunc->addParamAttr(3, Attribute::NoUndef);
3694 SarFunc->addParamAttr(1, Attribute::SExt);
3695 SarFunc->addParamAttr(2, Attribute::SExt);
3696 SarFunc->addParamAttr(3, Attribute::SExt);
3697 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
3698 Builder.SetInsertPoint(EntryBB);
3699
3700 // Thread local Reduce list used to host the values of data to be reduced.
3701 Argument *ReduceListArg = SarFunc->getArg(0);
3702 // Current lane id; could be logical.
3703 Argument *LaneIDArg = SarFunc->getArg(1);
3704 // Offset of the remote source lane relative to the current lane.
3705 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
3706 // Algorithm version. This is expected to be known at compile time.
3707 Argument *AlgoVerArg = SarFunc->getArg(3);
3708
3709 Type *ReduceListArgType = ReduceListArg->getType();
3710 Type *LaneIDArgType = LaneIDArg->getType();
3711 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
3712 Value *ReduceListAlloca = Builder.CreateAlloca(
3713 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
3714 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3715 LaneIDArg->getName() + ".addr");
3716 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
3717 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
3718 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3719 AlgoVerArg->getName() + ".addr");
3720 ArrayType *RedListArrayTy =
3721 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3722
3723 // Create a local thread-private variable to host the Reduce list
3724 // from a remote lane.
3725 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
3726 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
3727
3728 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3729 ReduceListAlloca, ReduceListArgType,
3730 ReduceListAlloca->getName() + ".ascast");
3731 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3732 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
3733 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3734 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
3735 RemoteLaneOffsetAlloca->getName() + ".ascast");
3736 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3737 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
3738 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3739 RemoteReductionListAlloca, Builder.getPtrTy(),
3740 RemoteReductionListAlloca->getName() + ".ascast");
3741
3742 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3743 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
3744 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
3745 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
3746
3747 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
3748 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
3749 Value *RemoteLaneOffset =
3750 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
3751 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
3752
3753 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
3754
3755 // This loop iterates through the list of reduce elements and copies,
3756 // element by element, from a remote lane in the warp to RemoteReduceList,
3757 // hosted on the thread's stack.
3758 Error EmitRedLsCpRes = emitReductionListCopy(
3759 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
3760 ReduceList, RemoteListAddrCast, IsByRef,
3761 {RemoteLaneOffset, nullptr, nullptr});
3762
3763 if (EmitRedLsCpRes)
3764 return EmitRedLsCpRes;
3765
3766 // The actions to be performed on the Remote Reduce list are dependent
3767 // on the algorithm version.
3768 //
3769 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3770 // LaneId % 2 == 0 && Offset > 0):
3771 // do the reduction value aggregation
3772 //
3773 // The thread local variable Reduce list is mutated in place to host the
3774 // reduced data, which is the aggregated value produced from local and
3775 // remote lanes.
3776 //
3777 // Note that AlgoVer is expected to be a constant integer known at compile
3778 // time.
3779 // When AlgoVer==0, the first conjunction evaluates to true, making
3780 // the entire predicate true at compile time.
3781 // When AlgoVer==1, only the second part of the second conjunction needs
3782 // to be evaluated at runtime; the other conjunctions fold to false at
3783 // compile time.
3784 // When AlgoVer==2, only the second part of the third conjunction needs
3785 // to be evaluated at runtime; the other conjunctions fold to false at
3786 // compile time.
3787 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
3788 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3789 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
3790 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
3791 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
3792 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
3793 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
3794 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
3795 Value *RemoteOffsetComp =
3796 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
3797 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3798 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3799 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3800
3801 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3802 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3803 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3804
3805 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3806 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3807 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3808 ReduceList, Builder.getPtrTy());
3809 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3810 RemoteListAddrCast, Builder.getPtrTy());
3811 createRuntimeFunctionCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3812 ->addFnAttr(Attribute::NoUnwind);
3813 Builder.CreateBr(MergeBB);
3814
3815 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3816 Builder.CreateBr(MergeBB);
3817
3818 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3819
3820 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3821 // Reduce list.
3822 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3823 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3824 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3825
3826 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3827 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3828 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3829 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3830
3831 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3832
3833 EmitRedLsCpRes = emitReductionListCopy(
3834 AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, ReductionInfos,
3835 RemoteListAddrCast, ReduceList, IsByRef);
3836
3837 if (EmitRedLsCpRes)
3838 return EmitRedLsCpRes;
3839
3840 Builder.CreateBr(CpyMergeBB);
3841
3842 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3843 Builder.CreateBr(CpyMergeBB);
3844
3845 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3846
3847 Builder.CreateRetVoid();
3848
3849 return SarFunc;
3850}
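// With AlgoVer == 1 (the halving scheme) the runtime invokes the generated
// function log2(warp_size) times with decreasing offsets, e.g. 16, 8, 4, 2, 1
// for a 32-lane warp: lanes below the offset reduce the shuffled-in remote
// value, while the remaining lanes copy the remote list instead so that
// partial results keep funneling toward lane 0.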
3851
3852OpenMPIRBuilder::InsertPointOrErrorTy
3853OpenMPIRBuilder::generateReductionDescriptor(
3854 Value *DescriptorAddr, Value *DataPtr, Value *SrcDescriptorAddr,
3855 Type *DescriptorType,
3856 function_ref<InsertPointOrErrorTy(InsertPointTy, Value *, Value *&)>
3857 DataPtrPtrGen) {
3858
3859 // Copy the source descriptor to preserve all metadata (rank, extents,
3860 // strides, etc.)
3861 Value *DescriptorSize =
3862 Builder.getInt64(M.getDataLayout().getTypeStoreSize(DescriptorType));
3863 Builder.CreateMemCpy(
3864 DescriptorAddr, M.getDataLayout().getPrefTypeAlign(DescriptorType),
3865 SrcDescriptorAddr, M.getDataLayout().getPrefTypeAlign(DescriptorType),
3866 DescriptorSize);
3867
3868 // Update the base pointer field to point to the local shuffled data
3869 Value *DataPtrField;
3870 InsertPointOrErrorTy GenResult =
3871 DataPtrPtrGen(Builder.saveIP(), DescriptorAddr, DataPtrField);
3872
3873 if (!GenResult)
3874 return GenResult.takeError();
3875
3876 Builder.CreateStore(Builder.CreatePointerBitCastOrAddrSpaceCast(
3877 DataPtr, Builder.getPtrTy(), ".ascast"),
3878 DataPtrField);
3879
3880 return Builder.saveIP();
3881}
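// For a Fortran-style array descriptor this amounts to a struct copy plus a
// single pointer store, conceptually:
//
//   memcpy(DescriptorAddr, SrcDescriptorAddr, sizeof(descriptor));
//   DescriptorAddr->base_addr = DataPtr;  // field located via DataPtrPtrGen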
3882
3883Expected<Function *> OpenMPIRBuilder::emitListToGlobalCopyFunction(
3884 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3885 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3886 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3887 LLVMContext &Ctx = M.getContext();
3888 FunctionType *FuncTy = FunctionType::get(
3889 Builder.getVoidTy(),
3890 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3891 /* IsVarArg */ false);
3892 Function *LtGCFunc =
3893 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3894 "_omp_reduction_list_to_global_copy_func", &M);
3895 LtGCFunc->setAttributes(FuncAttrs);
3896 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3897 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3898 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3899
3900 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3901 Builder.SetInsertPoint(EntryBlock);
3902
3903 // Buffer: global reduction buffer.
3904 Argument *BufferArg = LtGCFunc->getArg(0);
3905 // Idx: index of the buffer.
3906 Argument *IdxArg = LtGCFunc->getArg(1);
3907 // ReduceList: thread local Reduce list.
3908 Argument *ReduceListArg = LtGCFunc->getArg(2);
3909
3910 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3911 BufferArg->getName() + ".addr");
3912 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3913 IdxArg->getName() + ".addr");
3914 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3915 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3916 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3917 BufferArgAlloca, Builder.getPtrTy(),
3918 BufferArgAlloca->getName() + ".ascast");
3919 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3920 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3921 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3922 ReduceListArgAlloca, Builder.getPtrTy(),
3923 ReduceListArgAlloca->getName() + ".ascast");
3924
3925 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3926 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3927 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3928
3929 Value *LocalReduceList =
3930 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3931 Value *BufferArgVal =
3932 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3933 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3934 Type *IndexTy = Builder.getIndexTy(
3935 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3936 for (auto En : enumerate(ReductionInfos)) {
3937 const ReductionInfo &RI = En.value();
3938 auto *RedListArrayTy =
3939 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3940 // Reduce element = LocalReduceList[i]
3941 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3942 RedListArrayTy, LocalReduceList,
3943 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3944 // elemptr = ((CopyType*)(elemptrptr)) + I
3945 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3946
3947 // Global = Buffer.VD[Idx];
3948 Value *BufferVD =
3949 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3950 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3951 ReductionsBufferTy, BufferVD, 0, En.index());
3952
3953 switch (RI.EvaluationKind) {
3954 case EvalKind::Scalar: {
3955 Value *TargetElement;
3956
3957 if (IsByRef.empty() || !IsByRef[En.index()]) {
3958 TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3959 } else {
3960 if (RI.DataPtrPtrGen) {
3961 InsertPointOrErrorTy GenResult =
3962 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3963
3964 if (!GenResult)
3965 return GenResult.takeError();
3966
3967 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3968 }
3969 TargetElement = Builder.CreateLoad(RI.ByRefElementType, ElemPtr);
3970 }
3971
3972 Builder.CreateStore(TargetElement, GlobVal);
3973 break;
3974 }
3975 case EvalKind::Complex: {
3976 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3977 RI.ElementType, ElemPtr, 0, 0, ".realp");
3978 Value *SrcReal = Builder.CreateLoad(
3979 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3980 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3981 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3982 Value *SrcImg = Builder.CreateLoad(
3983 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3984
3985 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3986 RI.ElementType, GlobVal, 0, 0, ".realp");
3987 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3988 RI.ElementType, GlobVal, 0, 1, ".imagp");
3989 Builder.CreateStore(SrcReal, DestRealPtr);
3990 Builder.CreateStore(SrcImg, DestImgPtr);
3991 break;
3992 }
3993 case EvalKind::Aggregate: {
3994 Value *SizeVal =
3995 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3996 Builder.CreateMemCpy(
3997 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3998 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3999 break;
4000 }
4001 }
4002 }
4003
4004 Builder.CreateRetVoid();
4005 Builder.restoreIP(OldIP);
4006 return LtGCFunc;
4007}
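// The generated copy function is conceptually
//
//   void list_to_global_copy(void *buffer, int idx, void **reduce_list) {
//     for (unsigned I = 0; I < NumReductions; ++I)
//       buffer[idx].elem_I = *reduce_list[I];
//   }
//
// where the by-ref case first chases the descriptor's data pointer before
// performing the scalar load.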
4008
4009Expected<Function *> OpenMPIRBuilder::emitListToGlobalReduceFunction(
4010 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
4011 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
4012 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
4013 LLVMContext &Ctx = M.getContext();
4014 FunctionType *FuncTy = FunctionType::get(
4015 Builder.getVoidTy(),
4016 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
4017 /* IsVarArg */ false);
4018 Function *LtGRFunc =
4019 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
4020 "_omp_reduction_list_to_global_reduce_func", &M);
4021 LtGRFunc->setAttributes(FuncAttrs);
4022 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
4023 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
4024 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
4025
4026 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
4027 Builder.SetInsertPoint(EntryBlock);
4028
4029 // Buffer: global reduction buffer.
4030 Argument *BufferArg = LtGRFunc->getArg(0);
4031 // Idx: index of the buffer.
4032 Argument *IdxArg = LtGRFunc->getArg(1);
4033 // ReduceList: thread local Reduce list.
4034 Argument *ReduceListArg = LtGRFunc->getArg(2);
4035
4036 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
4037 BufferArg->getName() + ".addr");
4038 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
4039 IdxArg->getName() + ".addr");
4040 Value *ReduceListArgAlloca = Builder.CreateAlloca(
4041 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
4042 auto *RedListArrayTy =
4043 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4044
4045 // 1. Build a list of reduction variables.
4046 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4047 Value *LocalReduceList =
4048 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
4049
4050 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
4051
4052 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4053 BufferArgAlloca, Builder.getPtrTy(),
4054 BufferArgAlloca->getName() + ".ascast");
4055 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4056 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
4057 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4058 ReduceListArgAlloca, Builder.getPtrTy(),
4059 ReduceListArgAlloca->getName() + ".ascast");
4060 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4061 LocalReduceList, Builder.getPtrTy(),
4062 LocalReduceList->getName() + ".ascast");
4063
4064 Builder.CreateStore(BufferArg, BufferArgAddrCast);
4065 Builder.CreateStore(IdxArg, IdxArgAddrCast);
4066 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
4067
4068 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
4069 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
4070 Type *IndexTy = Builder.getIndexTy(
4071 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4072 for (auto En : enumerate(ReductionInfos)) {
4073 const ReductionInfo &RI = En.value();
4074
4075 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
4076 RedListArrayTy, LocalReduceListAddrCast,
4077 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4078 Value *BufferVD =
4079 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
4080 // Global = Buffer.VD[Idx];
4081 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
4082 ReductionsBufferTy, BufferVD, 0, En.index());
4083
4084 if (!IsByRef.empty() && IsByRef[En.index()] && RI.DataPtrPtrGen) {
4085 InsertPointTy OldIP = Builder.saveIP();
4086 Builder.restoreIP(AllocaIP);
4087
4088 Value *ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
4089 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
4090 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
4091
4092 Builder.restoreIP(OldIP);
4093
4094 // Get source descriptor from the reduce list argument
4095 Value *ReduceList =
4096 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4097 Value *SrcElementPtrPtr =
4098 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
4099 {ConstantInt::get(IndexTy, 0),
4100 ConstantInt::get(IndexTy, En.index())});
4101 Value *SrcDescriptorAddr =
4102 Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrPtr);
4103
4104 // Copy descriptor from source and update base_ptr to global buffer data
4105 InsertPointOrErrorTy GenResult =
4106 generateReductionDescriptor(ByRefAlloc, GlobValPtr, SrcDescriptorAddr,
4107 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
4108
4109 if (!GenResult)
4110 return GenResult.takeError();
4111
4112 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
4113 } else {
4114 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
4115 }
4116 }
4117
4118 // Call reduce_function(GlobalReduceList, ReduceList)
4119 Value *ReduceList =
4120 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4121 createRuntimeFunctionCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
4122 ->addFnAttr(Attribute::NoUnwind);
4123 Builder.CreateRetVoid();
4124 Builder.restoreIP(OldIP);
4125 return LtGRFunc;
4126}
4127
4128Expected<Function *> OpenMPIRBuilder::emitGlobalToListCopyFunction(
4129 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
4130 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
4131 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
4132 LLVMContext &Ctx = M.getContext();
4133 FunctionType *FuncTy = FunctionType::get(
4134 Builder.getVoidTy(),
4135 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
4136 /* IsVarArg */ false);
4137 Function *GtLCFunc =
4138 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
4139 "_omp_reduction_global_to_list_copy_func", &M);
4140 GtLCFunc->setAttributes(FuncAttrs);
4141 GtLCFunc->addParamAttr(0, Attribute::NoUndef);
4142 GtLCFunc->addParamAttr(1, Attribute::NoUndef);
4143 GtLCFunc->addParamAttr(2, Attribute::NoUndef);
4144
4145 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLCFunc);
4146 Builder.SetInsertPoint(EntryBlock);
4147
4148 // Buffer: global reduction buffer.
4149 Argument *BufferArg = GtLCFunc->getArg(0);
4150 // Idx: index of the buffer.
4151 Argument *IdxArg = GtLCFunc->getArg(1);
4152 // ReduceList: thread local Reduce list.
4153 Argument *ReduceListArg = GtLCFunc->getArg(2);
4154
4155 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
4156 BufferArg->getName() + ".addr");
4157 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
4158 IdxArg->getName() + ".addr");
4159 Value *ReduceListArgAlloca = Builder.CreateAlloca(
4160 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
4161 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4162 BufferArgAlloca, Builder.getPtrTy(),
4163 BufferArgAlloca->getName() + ".ascast");
4164 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4165 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
4166 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4167 ReduceListArgAlloca, Builder.getPtrTy(),
4168 ReduceListArgAlloca->getName() + ".ascast");
4169 Builder.CreateStore(BufferArg, BufferArgAddrCast);
4170 Builder.CreateStore(IdxArg, IdxArgAddrCast);
4171 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
4172
4173 Value *LocalReduceList =
4174 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4175 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
4176 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
4177 Type *IndexTy = Builder.getIndexTy(
4178 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4179 for (auto En : enumerate(ReductionInfos)) {
4180 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
4181 auto *RedListArrayTy =
4182 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4183 // Reduce element = LocalReduceList[i]
4184 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
4185 RedListArrayTy, LocalReduceList,
4186 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4187 // elemptr = ((CopyType*)(elemptrptr)) + I
4188 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
4189 // Global = Buffer.VD[Idx];
4190 Value *BufferVD =
4191 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
4192 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
4193 ReductionsBufferTy, BufferVD, 0, En.index());
4194
4195 switch (RI.EvaluationKind) {
4196 case EvalKind::Scalar: {
4197 Type *ElemType = RI.ElementType;
4198
4199 if (!IsByRef.empty() && IsByRef[En.index()]) {
4200 ElemType = RI.ByRefElementType;
4201 if (RI.DataPtrPtrGen) {
4202 InsertPointOrErrorTy GenResult =
4203 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
4204
4205 if (!GenResult)
4206 return GenResult.takeError();
4207
4208 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
4209 }
4210 }
4211
4212 Value *TargetElement = Builder.CreateLoad(ElemType, GlobValPtr);
4213 Builder.CreateStore(TargetElement, ElemPtr);
4214 break;
4215 }
4216 case EvalKind::Complex: {
4217 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
4218 RI.ElementType, GlobValPtr, 0, 0, ".realp");
4219 Value *SrcReal = Builder.CreateLoad(
4220 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
4221 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
4222 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
4223 Value *SrcImg = Builder.CreateLoad(
4224 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
4225
4226 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
4227 RI.ElementType, ElemPtr, 0, 0, ".realp");
4228 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
4229 RI.ElementType, ElemPtr, 0, 1, ".imagp");
4230 Builder.CreateStore(SrcReal, DestRealPtr);
4231 Builder.CreateStore(SrcImg, DestImgPtr);
4232 break;
4233 }
4234 case EvalKind::Aggregate: {
4235 Value *SizeVal =
4236 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
4237 Builder.CreateMemCpy(
4238 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
4239 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
4240 SizeVal, false);
4241 break;
4242 }
4243 }
4244 }
4245
4246 Builder.CreateRetVoid();
4247 Builder.restoreIP(OldIP);
4248 return GtLCFunc;
4249}
4250
4251Expected<Function *> OpenMPIRBuilder::emitGlobalToListReduceFunction(
4252 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
4253 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
4254 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
4255 LLVMContext &Ctx = M.getContext();
4256 auto *FuncTy = FunctionType::get(
4257 Builder.getVoidTy(),
4258 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
4259 /* IsVarArg */ false);
4260 Function *GtLRFunc =
4261 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
4262 "_omp_reduction_global_to_list_reduce_func", &M);
4263 GtLRFunc->setAttributes(FuncAttrs);
4264 GtLRFunc->addParamAttr(0, Attribute::NoUndef);
4265 GtLRFunc->addParamAttr(1, Attribute::NoUndef);
4266 GtLRFunc->addParamAttr(2, Attribute::NoUndef);
4267
4268 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLRFunc);
4269 Builder.SetInsertPoint(EntryBlock);
4270
4271 // Buffer: global reduction buffer.
4272 Argument *BufferArg = GtLRFunc->getArg(0);
4273 // Idx: index of the buffer.
4274 Argument *IdxArg = GtLRFunc->getArg(1);
4275 // ReduceList: thread local Reduce list.
4276 Argument *ReduceListArg = GtLRFunc->getArg(2);
4277
4278 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
4279 BufferArg->getName() + ".addr");
4280 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
4281 IdxArg->getName() + ".addr");
4282 Value *ReduceListArgAlloca = Builder.CreateAlloca(
4283 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
4284 ArrayType *RedListArrayTy =
4285 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4286
4287 // 1. Build a list of reduction variables.
4288 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4289 Value *LocalReduceList =
4290 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
4291
4292 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
4293
4294 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4295 BufferArgAlloca, Builder.getPtrTy(),
4296 BufferArgAlloca->getName() + ".ascast");
4297 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4298 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
4299 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4300 ReduceListArgAlloca, Builder.getPtrTy(),
4301 ReduceListArgAlloca->getName() + ".ascast");
4302 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
4303 LocalReduceList, Builder.getPtrTy(),
4304 LocalReduceList->getName() + ".ascast");
4305
4306 Builder.CreateStore(BufferArg, BufferArgAddrCast);
4307 Builder.CreateStore(IdxArg, IdxArgAddrCast);
4308 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
4309
4310 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
4311 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
4312 Type *IndexTy = Builder.getIndexTy(
4313 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4314 for (auto En : enumerate(ReductionInfos)) {
4315 const ReductionInfo &RI = En.value();
4316
4317 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
4318 RedListArrayTy, ReductionList,
4319 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4320 // Global = Buffer.VD[Idx];
4321 Value *BufferVD =
4322 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
4323 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
4324 ReductionsBufferTy, BufferVD, 0, En.index());
4325
4326 if (!IsByRef.empty() && IsByRef[En.index()] && RI.DataPtrPtrGen) {
4327 InsertPointTy OldIP = Builder.saveIP();
4328 Builder.restoreIP(AllocaIP);
4329
4330 Value *ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
4331 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
4332 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
4333
4334 Builder.restoreIP(OldIP);
4335
4336 // Get source descriptor from the reduce list
4337 Value *ReduceListVal =
4338 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4339 Value *SrcElementPtrPtr =
4340 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceListVal,
4341 {ConstantInt::get(IndexTy, 0),
4342 ConstantInt::get(IndexTy, En.index())});
4343 Value *SrcDescriptorAddr =
4344 Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrPtr);
4345
4346 // Copy descriptor from source and update base_ptr to global buffer data
4347 InsertPointOrErrorTy GenResult =
4348 generateReductionDescriptor(ByRefAlloc, GlobValPtr, SrcDescriptorAddr,
4349 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
4350 if (!GenResult)
4351 return GenResult.takeError();
4352
4353 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
4354 } else {
4355 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
4356 }
4357 }
4358
4359 // Call reduce_function(ReduceList, GlobalReduceList)
4360 Value *ReduceList =
4361 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4362 createRuntimeFunctionCall(ReduceFn, {ReduceList, ReductionList})
4363 ->addFnAttr(Attribute::NoUnwind);
4364 Builder.CreateRetVoid();
4365 Builder.restoreIP(OldIP);
4366 return GtLRFunc;
4367}
4368
4369std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
4370 std::string Suffix =
4371 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
4372 return (Name + Suffix).str();
4373}
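// For a reducer named "max" this produces a name along the lines of
// "max.omp.reduction.reduction_func"; the exact separators depend on the
// OpenMPIRBuilderConfig in use.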
4374
4375Expected<Function *> OpenMPIRBuilder::createReductionFunction(
4376 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
4377 ArrayRef<bool> IsByRef, ReductionGenCBKind ReductionGenCBKind,
4378 AttributeList FuncAttrs) {
4379 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
4380 {Builder.getPtrTy(), Builder.getPtrTy()},
4381 /* IsVarArg */ false);
4382 std::string Name = getReductionFuncName(ReducerName);
4383 Function *ReductionFunc =
4384 Function::Create(FuncTy, GlobalValue::InternalLinkage, Name, &M);
4385 ReductionFunc->setAttributes(FuncAttrs);
4386 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
4387 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
4388 BasicBlock *EntryBB =
4389 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
4390 Builder.SetInsertPoint(EntryBB);
4391
4392 // Need to alloca memory here and deal with the pointers before getting
4393 // LHS/RHS pointers out
4394 Value *LHSArrayPtr = nullptr;
4395 Value *RHSArrayPtr = nullptr;
4396 Argument *Arg0 = ReductionFunc->getArg(0);
4397 Argument *Arg1 = ReductionFunc->getArg(1);
4398 Type *Arg0Type = Arg0->getType();
4399 Type *Arg1Type = Arg1->getType();
4400
4401 Value *LHSAlloca =
4402 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
4403 Value *RHSAlloca =
4404 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
4405 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4406 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
4407 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4408 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
4409 Builder.CreateStore(Arg0, LHSAddrCast);
4410 Builder.CreateStore(Arg1, RHSAddrCast);
4411 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
4412 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
4413
4414 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4415 Type *IndexTy = Builder.getIndexTy(
4416 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4417 SmallVector<Value *> LHSPtrs, RHSPtrs;
4418 for (auto En : enumerate(ReductionInfos)) {
4419 const ReductionInfo &RI = En.value();
4420 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
4421 RedArrayTy, RHSArrayPtr,
4422 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4423 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
4424 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4425 RHSI8Ptr, RI.PrivateVariable->getType(),
4426 RHSI8Ptr->getName() + ".ascast");
4427
4428 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
4429 RedArrayTy, LHSArrayPtr,
4430 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4431 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
4432 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4433 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
4434
4435 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
4436 LHSPtrs.emplace_back(LHSPtr);
4437 RHSPtrs.emplace_back(RHSPtr);
4438 } else {
4439 Value *LHS = LHSPtr;
4440 Value *RHS = RHSPtr;
4441
4442 if (!IsByRef.empty() && !IsByRef[En.index()]) {
4443 LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
4444 RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
4445 }
4446
4447 Value *Reduced;
4448 InsertPointOrErrorTy AfterIP =
4449 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
4450 if (!AfterIP)
4451 return AfterIP.takeError();
4452 if (!Builder.GetInsertBlock())
4453 return ReductionFunc;
4454
4455 Builder.restoreIP(*AfterIP);
4456
4457 if (!IsByRef.empty() && !IsByRef[En.index()])
4458 Builder.CreateStore(Reduced, LHSPtr);
4459 }
4460 }
4461
4462 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
4463 for (auto En : enumerate(ReductionInfos)) {
4464 unsigned Index = En.index();
4465 const ReductionInfo &RI = En.value();
4466 Value *LHSFixupPtr, *RHSFixupPtr;
4467 Builder.restoreIP(RI.ReductionGenClang(
4468 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
4469
4470 // Fix the callback code generated to use the correct Values for the LHS
4471 // and RHS
4472 LHSFixupPtr->replaceUsesWithIf(
4473 LHSPtrs[Index], [ReductionFunc](const Use &U) {
4474 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4475 ReductionFunc;
4476 });
4477 RHSFixupPtr->replaceUsesWithIf(
4478 RHSPtrs[Index], [ReductionFunc](const Use &U) {
4479 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4480 ReductionFunc;
4481 });
4482 }
4483 }
4484 Builder.CreateRetVoid();
4485 // When compiling with `-O0`, `alloca`s emitted in non-entry blocks are not
4486 // hoisted to the entry block (this is done for higher opt levels by later
4487 // passes in the pipeline). This has caused issues because non-entry
4488 // `alloca`s force the function to use dynamic stack allocations and we
4489 // might run out of scratch memory.
4490 hoistNonEntryAllocasToEntryBlock(ReductionFunc);
4491
4492 return ReductionFunc;
4493}
4494
4495static void
4496checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
4497 bool IsGPU) {
4498 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
4499 (void)RI;
4500 assert(RI.Variable && "expected non-null variable");
4501 assert(RI.PrivateVariable && "expected non-null private variable");
4502 assert((RI.ReductionGen || RI.ReductionGenClang) &&
4503 "expected non-null reduction generator callback");
4504 if (!IsGPU) {
4505 assert(
4506 RI.Variable->getType() == RI.PrivateVariable->getType() &&
4507 "expected variables and their private equivalents to have the same "
4508 "type");
4509 }
4510 assert(RI.Variable->getType()->isPointerTy() &&
4511 "expected variables to be pointers");
4512 }
4513}
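// As a sketch of the contract checked above: a ReductionGen callback for a
// float sum, matching how such callbacks are invoked in this file (names here
// are illustrative):
//
//   auto SumGenCB = [&](OpenMPIRBuilder::InsertPointTy IP, Value *LHS,
//                       Value *RHS, Value *&Res)
//       -> OpenMPIRBuilder::InsertPointOrErrorTy {
//     Builder.restoreIP(IP);
//     Res = Builder.CreateFAdd(LHS, RHS, "red.add");
//     return Builder.saveIP();
//   };
//
// Variable and PrivateVariable must be pointers to the original and
// thread-private copies, matching the pointer checks above.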
4514
4515OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
4516 const LocationDescription &Loc, InsertPointTy AllocaIP,
4517 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
4518 ArrayRef<bool> IsByRef, bool IsNoWait, bool IsTeamsReduction,
4519 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
4520 unsigned ReductionBufNum, Value *SrcLocInfo) {
4521 if (!updateToLocation(Loc))
4522 return InsertPointTy();
4523 Builder.restoreIP(CodeGenIP);
4524 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
4525 LLVMContext &Ctx = M.getContext();
4526
4527 // Source location for the ident struct
4528 if (!SrcLocInfo) {
4529 uint32_t SrcLocStrSize;
4530 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4531 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4532 }
4533
4534 if (ReductionInfos.size() == 0)
4535 return Builder.saveIP();
4536
4537 BasicBlock *ContinuationBlock = nullptr;
4538 if (ReductionGenCBKind != ReductionGenCBKind::Clang) {
4539 // Copied code from createReductions
4540 BasicBlock *InsertBlock = Loc.IP.getBlock();
4541 ContinuationBlock =
4542 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4543 InsertBlock->getTerminator()->eraseFromParent();
4544 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
4545 }
4546
4547 Function *CurFunc = Builder.GetInsertBlock()->getParent();
4548 AttributeList FuncAttrs;
4549 AttrBuilder AttrBldr(Ctx);
4550 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
4551 AttrBldr.addAttribute(Attr);
4552 AttrBldr.removeAttribute(Attribute::OptimizeNone);
4553 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
4554
4555 CodeGenIP = Builder.saveIP();
4556 Expected<Function *> ReductionResult = createReductionFunction(
4557 Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, IsByRef,
4558 ReductionGenCBKind, FuncAttrs);
4559 if (!ReductionResult)
4560 return ReductionResult.takeError();
4561 Function *ReductionFunc = *ReductionResult;
4562 Builder.restoreIP(CodeGenIP);
4563
4564 // Set the grid value in the config needed for lowering later on
4565 if (GridValue.has_value())
4566 Config.setGridValue(GridValue.value());
4567 else
4568 Config.setGridValue(getGridValue(T, ReductionFunc));
4569
4570 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
4571 // RedList, shuffle_reduce_func, interwarp_copy_func);
4572 // or
4573 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
4574 Value *Res;
4575
4576 // 1. Build a list of reduction variables.
4577 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4578 auto Size = ReductionInfos.size();
4579 Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS());
4580 Type *FuncPtrTy =
4581 Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace());
4582 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
4583 CodeGenIP = Builder.saveIP();
4584 Builder.restoreIP(AllocaIP);
4585 Value *ReductionListAlloca =
4586 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
4587 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
4588 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
4589 Builder.restoreIP(CodeGenIP);
4590 Type *IndexTy = Builder.getIndexTy(
4591 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4592 for (auto En : enumerate(ReductionInfos)) {
4593 const ReductionInfo &RI = En.value();
4594 Value *ElemPtr = Builder.CreateInBoundsGEP(
4595 RedArrayTy, ReductionList,
4596 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4597
4598 Value *PrivateVar = RI.PrivateVariable;
4599 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
4600 if (IsByRefElem)
4601 PrivateVar = Builder.CreateLoad(RI.ElementType, PrivateVar);
4602
4603 Value *CastElem =
4604 Builder.CreatePointerBitCastOrAddrSpaceCast(PrivateVar, PtrTy);
4605 Builder.CreateStore(CastElem, ElemPtr);
4606 }
4607 CodeGenIP = Builder.saveIP();
4608 Expected<Function *> SarFunc = emitShuffleAndReduceFunction(
4609 ReductionInfos, ReductionFunc, FuncAttrs, IsByRef);
4610
4611 if (!SarFunc)
4612 return SarFunc.takeError();
4613
4614 Expected<Function *> CopyResult =
4615 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs, IsByRef);
4616 if (!CopyResult)
4617 return CopyResult.takeError();
4618 Function *WcFunc = *CopyResult;
4619 Builder.restoreIP(CodeGenIP);
4620
4621 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
4622
4623 // NOTE: ReductionDataSize is passed as the reduce_data_size
4624 // argument to __kmpc_nvptx_{parallel,teams}_reduce_nowait_v2, but
4625 // the runtime implementations do not currently use it. The teams
4626 // runtime reads ReductionDataSize from KernelEnvironmentTy instead
4627 // (set separately via TargetKernelDefaultAttrs). It is computed
4628 // here conservatively as max(element sizes) * N rather than the
4629 // exact sum, which over-calculates the size for mixed reduction
4630 // types but is harmless given the argument is unused.
4631 // TODO: Consider dropping this computation if the runtime API is
4632 // ever revised to remove the unused parameter.
4633 unsigned MaxDataSize = 0;
4634 SmallVector<Type *> ReductionTypeArgs;
4635 for (auto En : enumerate(ReductionInfos)) {
4636 // Use ByRefElementType for by-ref reductions so that MaxDataSize matches
4637 // the actual data size stored in the global reduction buffer, consistent
4638 // with the ReductionsBufferTy struct used for GEP offsets below.
4639 Type *RedTypeArg = (!IsByRef.empty() && IsByRef[En.index()])
4640 ? En.value().ByRefElementType
4641 : En.value().ElementType;
4642 auto Size = M.getDataLayout().getTypeStoreSize(RedTypeArg);
4643 if (Size > MaxDataSize)
4644 MaxDataSize = Size;
4645 ReductionTypeArgs.emplace_back(RedTypeArg);
4646 }
4647 Value *ReductionDataSize =
4648 Builder.getInt64(MaxDataSize * ReductionInfos.size());
4649 if (!IsTeamsReduction) {
4650 Value *SarFuncCast =
4651 Builder.CreatePointerBitCastOrAddrSpaceCast(*SarFunc, FuncPtrTy);
4652 Value *WcFuncCast =
4653 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy);
4654 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
4655 WcFuncCast};
4656 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
4657 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
4658 Res = createRuntimeFunctionCall(Pv2Ptr, Args);
4659 } else {
4660 CodeGenIP = Builder.saveIP();
4661 StructType *ReductionsBufferTy = StructType::create(
4662 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
4663 Function *RedFixedBufferFn = getOrCreateRuntimeFunctionPtr(
4664 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
4665
4666 Expected<Function *> LtGCFunc = emitListToGlobalCopyFunction(
4667 ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
4668 if (!LtGCFunc)
4669 return LtGCFunc.takeError();
4670
4671 Expected<Function *> LtGRFunc = emitListToGlobalReduceFunction(
4672 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
4673 if (!LtGRFunc)
4674 return LtGRFunc.takeError();
4675
4676 Expected<Function *> GtLCFunc = emitGlobalToListCopyFunction(
4677 ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
4678 if (!GtLCFunc)
4679 return GtLCFunc.takeError();
4680
4681 Expected<Function *> GtLRFunc = emitGlobalToListReduceFunction(
4682 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
4683 if (!GtLRFunc)
4684 return GtLRFunc.takeError();
4685
4686 Builder.restoreIP(CodeGenIP);
4687
4688 Value *KernelTeamsReductionPtr = createRuntimeFunctionCall(
4689 RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
4690
4691 Value *Args3[] = {SrcLocInfo,
4692 KernelTeamsReductionPtr,
4693 Builder.getInt32(ReductionBufNum),
4694 ReductionDataSize,
4695 RL,
4696 *SarFunc,
4697 WcFunc,
4698 *LtGCFunc,
4699 *LtGRFunc,
4700 *GtLCFunc,
4701 *GtLRFunc};
4702
4703 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
4704 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
4705 Res = createRuntimeFunctionCall(TeamsReduceFn, Args3);
4706 }
4707
4708 // 5. Build if (res == 1)
4709 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
4710 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
4711 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
4712 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
4713
4714 // 6. Build then branch: where we have reduced values in the master
4715 // thread in each team.
4716 // __kmpc_end_reduce{_nowait}(<gtid>);
4717 // break;
4718 emitBlock(ThenBB, CurFunc);
4719
4720 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
4721 for (auto En : enumerate(ReductionInfos)) {
4722 const ReductionInfo &RI = En.value();
4723 Type *ValueType = RI.ElementType;
4724 Value *RedValue = RI.Variable;
4725 Value *RHS =
4726 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
4727 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
4727
4729 Value *LHSPtr, *RHSPtr;
4730 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
4731 &LHSPtr, &RHSPtr, CurFunc));
4732
4733 // Fix the callback code generated to use the correct Values for the LHS
4734 // and RHS
4735 LHSPtr->replaceUsesWithIf(RedValue, [ReductionFunc](const Use &U) {
4736 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4737 ReductionFunc;
4738 });
4739 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
4740 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4741 ReductionFunc;
4742 });
4743 } else {
4744 if (IsByRef.empty() || !IsByRef[En.index()]) {
4745 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4746 "red.value." + Twine(En.index()));
4747 }
4748 Value *PrivateRedValue = Builder.CreateLoad(
4749 ValueType, RHS, "red.private.value" + Twine(En.index()));
4750 Value *Reduced;
4751 InsertPointOrErrorTy AfterIP =
4752 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4753 if (!AfterIP)
4754 return AfterIP.takeError();
4755 Builder.restoreIP(*AfterIP);
4756
4757 if (!IsByRef.empty() && !IsByRef[En.index()])
4758 Builder.CreateStore(Reduced, RI.Variable);
4759 }
4760 }
4761 emitBlock(ExitBB, CurFunc);
4762 if (ContinuationBlock) {
4763 Builder.CreateBr(ContinuationBlock);
4764 Builder.SetInsertPoint(ContinuationBlock);
4765 }
4766 Config.setEmitLLVMUsed();
4767
4768 return Builder.saveIP();
4769}
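// A sketch of the IR shape produced above for the non-teams case (names,
// casts, and address spaces elided):
//
//   %res = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(
//       ptr %ident, i64 %reduce_data_size, ptr %red_list,
//       ptr @<shuffle_and_reduce>, ptr @<inter_warp_copy>)
//   %cmp = icmp eq i32 %res, 1
//   br i1 %cmp, label %.omp.reduction.then, label %.omp.reduction.done
//
// The "then" block folds the combined value back into the original variables
// before falling through to the done/finalize block.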
4770
4771static Function *getFreshReductionFunc(Module &M) {
4772 Type *VoidTy = Type::getVoidTy(M.getContext());
4773 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
4774 auto *FuncTy =
4775 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
4776 return Function::Create(FuncTy, GlobalValue::InternalLinkage,
4777 ".omp.reduction.func", &M);
4778}
4779
4780static Error populateReductionFunction(
4781 Function *ReductionFunc,
4782 ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
4783 IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
4784 Module *Module = ReductionFunc->getParent();
4785 BasicBlock *ReductionFuncBlock =
4786 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
4787 Builder.SetInsertPoint(ReductionFuncBlock);
4788 Value *LHSArrayPtr = nullptr;
4789 Value *RHSArrayPtr = nullptr;
4790 if (IsGPU) {
4791 // We need to allocate memory here and handle the pointers before
4792 // extracting the LHS/RHS pointers.
4793 //
4794 Argument *Arg0 = ReductionFunc->getArg(0);
4795 Argument *Arg1 = ReductionFunc->getArg(1);
4796 Type *Arg0Type = Arg0->getType();
4797 Type *Arg1Type = Arg1->getType();
4798
4799 Value *LHSAlloca =
4800 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
4801 Value *RHSAlloca =
4802 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
4803 Value *LHSAddrCast =
4804 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
4805 Value *RHSAddrCast =
4806 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
4807 Builder.CreateStore(Arg0, LHSAddrCast);
4808 Builder.CreateStore(Arg1, RHSAddrCast);
4809 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
4810 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
4811 } else {
4812 LHSArrayPtr = ReductionFunc->getArg(0);
4813 RHSArrayPtr = ReductionFunc->getArg(1);
4814 }
4815
4816 unsigned NumReductions = ReductionInfos.size();
4817 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4818
4819 for (auto En : enumerate(ReductionInfos)) {
4820 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
4821 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4822 RedArrayTy, LHSArrayPtr, 0, En.index());
4823 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
4824 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4825 LHSI8Ptr, RI.Variable->getType());
4826 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
4827 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4828 RedArrayTy, RHSArrayPtr, 0, En.index());
4829 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
4830 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4831 RHSI8Ptr, RI.PrivateVariable->getType());
4832 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
4833 Value *Reduced;
4834 OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4835 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
4836 if (!AfterIP)
4837 return AfterIP.takeError();
4838
4839 Builder.restoreIP(*AfterIP);
4840 // TODO: Consider flagging an error.
4841 if (!Builder.GetInsertBlock())
4842 return Error::success();
4843
4844 // The store is inside the reduction region when using by-ref.
4845 if (!IsByRef[En.index()])
4846 Builder.CreateStore(Reduced, LHSPtr);
4847 }
4848 Builder.CreateRetVoid();
4849 return Error::success();
4850}
4851
4852OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions(
4853 const LocationDescription &Loc, InsertPointTy AllocaIP,
4854 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
4855 bool IsNoWait, bool IsTeamsReduction) {
4856 assert(ReductionInfos.size() == IsByRef.size());
4857 if (Config.isGPU())
4858 return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
4859 IsByRef, IsNoWait, IsTeamsReduction);
4860
4861 checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
4862
4863 if (!updateToLocation(Loc))
4864 return InsertPointTy();
4865
4866 if (ReductionInfos.size() == 0)
4867 return Builder.saveIP();
4868
4869 BasicBlock *InsertBlock = Loc.IP.getBlock();
4870 BasicBlock *ContinuationBlock =
4871 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4872 InsertBlock->getTerminator()->eraseFromParent();
4873
4874 // Create and populate array of type-erased pointers to private reduction
4875 // values.
4876 unsigned NumReductions = ReductionInfos.size();
4877 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4878 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
4879 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
4880
4881 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
4882
4883 for (auto En : enumerate(ReductionInfos)) {
4884 unsigned Index = En.index();
4885 const ReductionInfo &RI = En.value();
4886 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
4887 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
4888 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
4889 }
4890
4891 // Emit a call to the runtime function that orchestrates the reduction.
4892 // Declare the reduction function in the process.
4893 Type *IndexTy = Builder.getIndexTy(
4894 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4895 Function *Func = Builder.GetInsertBlock()->getParent();
4896 Module *Module = Func->getParent();
4897 uint32_t SrcLocStrSize;
4898 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4899 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
4900 return RI.AtomicReductionGen;
4901 });
4902 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
4903 CanGenerateAtomic
4904 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
4905 : IdentFlag(0));
4906 Value *ThreadId = getOrCreateThreadID(Ident);
4907 Constant *NumVariables = Builder.getInt32(NumReductions);
4908 const DataLayout &DL = Module->getDataLayout();
4909 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
4910 Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
4911 Function *ReductionFunc = getFreshReductionFunc(*Module);
4912 Value *Lock = getOMPCriticalRegionLock(".reduction");
4913 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
4914 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
4915 : RuntimeFunction::OMPRTL___kmpc_reduce);
4916 CallInst *ReduceCall =
4917 createRuntimeFunctionCall(ReduceFunc,
4918 {Ident, ThreadId, NumVariables, RedArraySize,
4919 RedArray, ReductionFunc, Lock},
4920 "reduce");
4921
4922 // Create final reduction entry blocks for the atomic and non-atomic case.
4923 // Emit IR that dispatches control flow to one of the blocks based on the
4924 // reduction supporting the atomic mode.
4925 BasicBlock *NonAtomicRedBlock =
4926 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
4927 BasicBlock *AtomicRedBlock =
4928 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
4929 SwitchInst *Switch =
4930 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
4931 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
4932 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
4933
4934 // Populate the non-atomic reduction using the elementwise reduction function.
4935 // This loads the elements from the global and private variables and reduces
4936 // them before storing back the result to the global variable.
4937 Builder.SetInsertPoint(NonAtomicRedBlock);
4938 for (auto En : enumerate(ReductionInfos)) {
4939 const ReductionInfo &RI = En.value();
4940 Type *ValueType = RI.ElementType;
4941 // We have one less load for the by-ref case because that load is now
4942 // inside the reduction region.
4943 Value *RedValue = RI.Variable;
4944 if (!IsByRef[En.index()]) {
4945 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4946 "red.value." + Twine(En.index()));
4947 }
4948 Value *PrivateRedValue =
4949 Builder.CreateLoad(ValueType, RI.PrivateVariable,
4950 "red.private.value." + Twine(En.index()));
4951 Value *Reduced;
4952 InsertPointOrErrorTy AfterIP =
4953 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4954 if (!AfterIP)
4955 return AfterIP.takeError();
4956 Builder.restoreIP(*AfterIP);
4957
4958 if (!Builder.GetInsertBlock())
4959 return InsertPointTy();
4960 // for by-ref case, the load is inside of the reduction region
4961 if (!IsByRef[En.index()])
4962 Builder.CreateStore(Reduced, RI.Variable);
4963 }
4964 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
4965 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
4966 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
4967 createRuntimeFunctionCall(EndReduceFunc, {Ident, ThreadId, Lock});
4968 Builder.CreateBr(ContinuationBlock);
4969
4970 // Populate the atomic reduction using the atomic elementwise reduction
4971 // function. There are no loads/stores here because they will be happening
4972 // inside the atomic elementwise reduction.
4973 Builder.SetInsertPoint(AtomicRedBlock);
4974 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
4975 for (const ReductionInfo &RI : ReductionInfos) {
4976 InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
4977 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
4978 if (!AfterIP)
4979 return AfterIP.takeError();
4980 Builder.restoreIP(*AfterIP);
4981 if (!Builder.GetInsertBlock())
4982 return InsertPointTy();
4983 }
4984 Builder.CreateBr(ContinuationBlock);
4985 } else {
4986 Builder.CreateUnreachable();
4987 }
4988
4989 // Populate the outlined reduction function using the elementwise reduction
4990 // function. Partial values are extracted from the type-erased array of
4991 // pointers to private variables.
4992 Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
4993 IsByRef, /*isGPU=*/false);
4994 if (Err)
4995 return Err;
4996
4997 if (!Builder.GetInsertBlock())
4998 return InsertPointTy();
4999
5000 Builder.SetInsertPoint(ContinuationBlock);
5001 return Builder.saveIP();
5002}
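// A minimal usage sketch (assuming an OpenMPIRBuilder `OMPBuilder`, a valid
// `Loc`/`AllocaIP`, and a single by-value reduction described by `RI`):
//
//   SmallVector<OpenMPIRBuilder::ReductionInfo> Infos = {RI};
//   SmallVector<bool> ByRef = {false};
//   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
//       OMPBuilder.createReductions(Loc, AllocaIP, Infos, ByRef,
//                                   /*IsNoWait=*/false,
//                                   /*IsTeamsReduction=*/false);
//   if (!AfterIP)
//     return AfterIP.takeError();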
5003
5004OpenMPIRBuilder::InsertPointOrErrorTy
5005OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
5006 BodyGenCallbackTy BodyGenCB,
5007 FinalizeCallbackTy FiniCB) {
5008 if (!updateToLocation(Loc))
5009 return Loc.IP;
5010
5011 Directive OMPD = Directive::OMPD_master;
5012 uint32_t SrcLocStrSize;
5013 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5014 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5015 Value *ThreadId = getOrCreateThreadID(Ident);
5016 Value *Args[] = {Ident, ThreadId};
5017
5018 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
5019 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
5020
5021 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
5022 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
5023
5024 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5025 /*Conditional*/ true, /*hasFinalize*/ true);
5026}
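// Because the region is emitted with Conditional=true, the generated code has
// the usual runtime-guarded shape (a sketch; block names are illustrative):
//
//   %r = call i32 @__kmpc_master(ptr %ident, i32 %tid)
//   %c = icmp ne i32 %r, 0
//   br i1 %c, label %region.body, label %region.end
//   ; region.body: <BodyGenCB output>, then call void @__kmpc_end_master(...)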
5027
5028OpenMPIRBuilder::InsertPointOrErrorTy
5029OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
5030 BodyGenCallbackTy BodyGenCB,
5031 FinalizeCallbackTy FiniCB, Value *Filter) {
5032 if (!updateToLocation(Loc))
5033 return Loc.IP;
5034
5035 Directive OMPD = Directive::OMPD_masked;
5036 uint32_t SrcLocStrSize;
5037 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5038 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5039 Value *ThreadId = getOrCreateThreadID(Ident);
5040 Value *Args[] = {Ident, ThreadId, Filter};
5041 Value *ArgsEnd[] = {Ident, ThreadId};
5042
5043 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
5044 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
5045
5046 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
5047 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, ArgsEnd);
5048
5049 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5050 /*Conditional*/ true, /*hasFinalize*/ true);
5051}
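// createMasked differs from createMaster only in the entry call: the extra
// `Filter` operand selects which thread id executes the region (the scan
// helpers below pass Builder.getInt32(0) to restrict it to thread 0), while
// __kmpc_end_masked takes no filter, hence the separate ArgsEnd list.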
5052
5053static llvm::CallInst *emitNoUnwindRuntimeCall(IRBuilder<> &Builder,
5054 llvm::FunctionCallee Callee,
5055 ArrayRef<llvm::Value *> Args,
5056 const llvm::Twine &Name) {
5057 llvm::CallInst *Call = Builder.CreateCall(
5058 Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
5059 Call->setDoesNotThrow();
5060 return Call;
5061}
5062
5063// Expects the input basic block to be dominated by BeforeScanBB. Once the
5064// scan directive is encountered, the code after it should be dominated by
5065// AfterScanBB. The scan directive splits the code sequence into an input
5066// phase and a scan phase. Depending on whether the inclusive or exclusive
5067// clause is used on the scan directive, and on whether the input loop or the
5068// scan loop is being lowered, it adds jumps to the input and scan phases.
5069// The first scan loop is the input loop and the second is the scan loop. The
5070// generated code currently handles only inclusive scans.
5071OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createScan(
5072 const LocationDescription &Loc, InsertPointTy AllocaIP,
5073 ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
5074 bool IsInclusive, ScanInfo *ScanRedInfo) {
5075 if (ScanRedInfo->OMPFirstScanLoop) {
5076 llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
5077 ScanVarsType, ScanRedInfo);
5078 if (Err)
5079 return Err;
5080 }
5081 if (!updateToLocation(Loc))
5082 return Loc.IP;
5083
5084 llvm::Value *IV = ScanRedInfo->IV;
5085
5086 if (ScanRedInfo->OMPFirstScanLoop) {
5087 // Emit buffer[i] = red; at the end of the input phase.
5088 for (size_t i = 0; i < ScanVars.size(); i++) {
5089 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
5090 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
5091 Type *DestTy = ScanVarsType[i];
5092 Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
5093 Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
5094
5095 Builder.CreateStore(Src, Val);
5096 }
5097 }
5098 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
5099 emitBlock(ScanRedInfo->OMPScanDispatch,
5100 Builder.GetInsertBlock()->getParent());
5101
5102 if (!ScanRedInfo->OMPFirstScanLoop) {
5103 IV = ScanRedInfo->IV;
5104 // Emit red = buffer[i]; at the entrance to the scan phase.
5105 // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
5106 for (size_t i = 0; i < ScanVars.size(); i++) {
5107 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
5108 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
5109 Type *DestTy = ScanVarsType[i];
5110 Value *SrcPtr =
5111 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
5112 Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
5113 Builder.CreateStore(Src, ScanVars[i]);
5114 }
5115 }
5116
5117 // TODO: Update it to CreateBr and remove dead blocks
5118 llvm::Value *CmpI = Builder.getInt1(true);
5119 if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
5120 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
5121 ScanRedInfo->OMPAfterScanBlock);
5122 } else {
5123 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
5124 ScanRedInfo->OMPBeforeScanBlock);
5125 }
5126 emitBlock(ScanRedInfo->OMPAfterScanBlock,
5127 Builder.GetInsertBlock()->getParent());
5128 Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
5129 return Builder.saveIP();
5130}
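// For reference, this corresponds to an inscan reduction of the following
// shape (illustrative user code, inclusive case):
//
//   #pragma omp ... reduction(inscan, +: x)
//   for (...) {
//     x += a[i];                    // input phase (first generated loop)
//     #pragma omp scan inclusive(x)
//     b[i] = x;                     // scan phase (second generated loop)
//   }
//
// The input loop stores buffer[i] = x at the end of its input phase; the scan
// loop reloads x = buffer[i] before running the scan phase, as emitted above.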
5131
5132Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
5133 InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
5134 ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
5135
5136 Builder.restoreIP(AllocaIP);
5137 // Create the shared pointer at alloca IP.
5138 for (size_t i = 0; i < ScanVars.size(); i++) {
5139 llvm::Value *BuffPtr =
5140 Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
5141 (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
5142 }
5143
5144 // Allocate temporary buffer by master thread
5145 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
5146 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
5147 Builder.restoreIP(CodeGenIP);
5148 Value *AllocSpan =
5149 Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
5150 for (size_t i = 0; i < ScanVars.size(); i++) {
5151 Type *IntPtrTy = Builder.getInt32Ty();
5152 Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
5153 Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
5154 Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
5155 AllocSpan, nullptr, "arr");
5156 Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
5157 }
5158 return Error::success();
5159 };
5160 // TODO: Perform finalization actions for variables. This has to be
5161 // called for variables which have destructors/finalizers.
5162 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
5163
5164 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
5165 llvm::Value *FilterVal = Builder.getInt32(0);
5166 InsertPointOrErrorTy AfterIP =
5167 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
5168
5169 if (!AfterIP)
5170 return AfterIP.takeError();
5171 Builder.restoreIP(*AfterIP);
5172 BasicBlock *InputBB = Builder.GetInsertBlock();
5173 if (InputBB->hasTerminator())
5174 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
5175 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
5176 if (!AfterIP)
5177 return AfterIP.takeError();
5178 Builder.restoreIP(*AfterIP);
5179
5180 return Error::success();
5181}
5182
5183Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
5184 ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
5185 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
5186 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
5187 Builder.restoreIP(CodeGenIP);
5188 for (ReductionInfo RedInfo : ReductionInfos) {
5189 Value *PrivateVar = RedInfo.PrivateVariable;
5190 Value *OrigVar = RedInfo.Variable;
5191 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
5192 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
5193
5194 Type *SrcTy = RedInfo.ElementType;
5195 Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
5196 "arrayOffset");
5197 Value *Src = Builder.CreateLoad(SrcTy, Val);
5198
5199 Builder.CreateStore(Src, OrigVar);
5200 Builder.CreateFree(Buff);
5201 }
5202 return Error::success();
5203 };
5204 // TODO: Perform finalization actions for variables. This has to be
5205 // called for variables which have destructors/finalizers.
5206 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
5207
5208 if (Instruction *TI = ScanRedInfo->OMPScanFinish->getTerminatorOrNull())
5209 Builder.SetInsertPoint(TI);
5210 else
5211 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
5212
5213 llvm::Value *FilterVal = Builder.getInt32(0);
5214 InsertPointOrErrorTy AfterIP =
5215 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
5216
5217 if (!AfterIP)
5218 return AfterIP.takeError();
5219 Builder.restoreIP(*AfterIP);
5220 BasicBlock *InputBB = Builder.GetInsertBlock();
5221 if (InputBB->hasTerminator())
5222 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
5223 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
5224 if (!AfterIP)
5225 return AfterIP.takeError();
5226 Builder.restoreIP(*AfterIP);
5227 return Error::success();
5228}
5229
5230OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitScanReduction(
5231 const LocationDescription &Loc,
5232 ArrayRef<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos,
5233 ScanInfo *ScanRedInfo) {
5234
5235 if (!updateToLocation(Loc))
5236 return Loc.IP;
5237 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
5238 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
5239 Builder.restoreIP(CodeGenIP);
5240 Function *CurFn = Builder.GetInsertBlock()->getParent();
5241 // for (int k = 0; k <= ceil(log2(n)); ++k)
5242 llvm::BasicBlock *LoopBB =
5243 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
5244 llvm::BasicBlock *ExitBB =
5245 splitBB(Builder, false, "omp.outer.log.scan.exit");
5246 llvm::Function *F = llvm::Intrinsic::getOrInsertDeclaration(
5247 Builder.GetInsertBlock()->getModule(),
5248 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
5249 llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
5250 llvm::Value *Arg =
5251 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
5252 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
5253 F = llvm::Intrinsic::getOrInsertDeclaration(
5254 Builder.GetInsertBlock()->getModule(),
5255 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
5256 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
5257 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
5258 llvm::Value *NMin1 = Builder.CreateNUWSub(
5259 ScanRedInfo->Span,
5260 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
5261 Builder.SetInsertPoint(InputBB);
5262 Builder.CreateBr(LoopBB);
5263 emitBlock(LoopBB, CurFn);
5264 Builder.SetInsertPoint(LoopBB);
5265
5266 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5267 // size pow2k = 1;
5268 PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5269 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
5270 InputBB);
5271 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
5272 InputBB);
5273 // for (size i = n - 1; i >= 2 ^ k; --i)
5274 // tmp[i] op= tmp[i-pow2k];
5275 llvm::BasicBlock *InnerLoopBB =
5276 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
5277 llvm::BasicBlock *InnerExitBB =
5278 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
5279 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
5280 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
5281 emitBlock(InnerLoopBB, CurFn);
5282 Builder.SetInsertPoint(InnerLoopBB);
5283 PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5284 IVal->addIncoming(NMin1, LoopBB);
5285 for (ReductionInfo RedInfo : ReductionInfos) {
5286 Value *ReductionVal = RedInfo.PrivateVariable;
5287 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
5288 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
5289 Type *DestTy = RedInfo.ElementType;
5290 Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
5291 Value *LHSPtr =
5292 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
5293 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
5294 Value *RHSPtr =
5295 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
5296 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
5297 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
5298 llvm::Value *Result;
5299 InsertPointOrErrorTy AfterIP =
5300 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
5301 if (!AfterIP)
5302 return AfterIP.takeError();
5303 Builder.CreateStore(Result, LHSPtr);
5304 }
5305 llvm::Value *NextIVal = Builder.CreateNUWSub(
5306 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
5307 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
5308 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
5309 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
5310 emitBlock(InnerExitBB, CurFn);
5311 llvm::Value *Next = Builder.CreateNUWAdd(
5312 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
5313 Counter->addIncoming(Next, Builder.GetInsertBlock());
5314 // pow2k <<= 1;
5315 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
5316 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
5317 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
5318 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
5319 Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
5320 return Error::success();
5321 };
5322
5323 // TODO: Perform finalization actions for variables. This has to be
5324 // called for variables which have destructors/finalizers.
5325 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
5326
5327 llvm::Value *FilterVal = Builder.getInt32(0);
5328 InsertPointOrErrorTy AfterIP =
5329 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
5330
5331 if (!AfterIP)
5332 return AfterIP.takeError();
5333 Builder.restoreIP(*AfterIP);
5334 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
5335
5336 if (!AfterIP)
5337 return AfterIP.takeError();
5338 Builder.restoreIP(*AfterIP);
5339 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
5340 if (Err)
5341 return Err;
5342
5343 return AfterIP;
5344}
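// The masked region above implements the classic O(n log n) in-place
// inclusive scan: in step k, every element i >= 2^k combines with element
// i - 2^k. Worked example with + over the buffer [3, 1, 7, 0]:
//   k=0 (pow2k=1): [3, 4, 8, 7]
//   k=1 (pow2k=2): [3, 4, 11, 11]
// leaving the inclusive prefix sums [3, 4, 11, 11] after ceil(log2(4)) = 2
// steps.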
5345
5346Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
5347 llvm::function_ref<Error()> InputLoopGen,
5348 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
5349 ScanInfo *ScanRedInfo) {
5350
5351 {
5352 // Emit loop with input phase:
5353 // for (i: 0..<num_iters>) {
5354 // <input phase>;
5355 // buffer[i] = red;
5356 // }
5357 ScanRedInfo->OMPFirstScanLoop = true;
5358 Error Err = InputLoopGen();
5359 if (Err)
5360 return Err;
5361 }
5362 {
5363 // Emit loop with scan phase:
5364 // for (i: 0..<num_iters>) {
5365 // red = buffer[i];
5366 // <scan phase>;
5367 // }
5368 ScanRedInfo->OMPFirstScanLoop = false;
5369 Error Err = ScanLoopGen(Builder.saveIP());
5370 if (Err)
5371 return Err;
5372 }
5373 return Error::success();
5374}
5375
5376void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
5377 Function *Fun = Builder.GetInsertBlock()->getParent();
5378 ScanRedInfo->OMPScanDispatch =
5379 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
5380 ScanRedInfo->OMPAfterScanBlock =
5381 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
5382 ScanRedInfo->OMPBeforeScanBlock =
5383 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
5384 ScanRedInfo->OMPScanLoopExit =
5385 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
5386}
5387CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
5388 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
5389 BasicBlock *PostInsertBefore, const Twine &Name) {
5390 Module *M = F->getParent();
5391 LLVMContext &Ctx = M->getContext();
5392 Type *IndVarTy = TripCount->getType();
5393
5394 // Create the basic block structure.
5395 BasicBlock *Preheader =
5396 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
5397 BasicBlock *Header =
5398 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
5399 BasicBlock *Cond =
5400 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
5401 BasicBlock *Body =
5402 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
5403 BasicBlock *Latch =
5404 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
5405 BasicBlock *Exit =
5406 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
5407 BasicBlock *After =
5408 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
5409
5410 // Use specified DebugLoc for new instructions.
5411 Builder.SetCurrentDebugLocation(DL);
5412
5413 Builder.SetInsertPoint(Preheader);
5414 Builder.CreateBr(Header);
5415
5416 Builder.SetInsertPoint(Header);
5417 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
5418 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
5419 Builder.CreateBr(Cond);
5420
5421 Builder.SetInsertPoint(Cond);
5422 Value *Cmp =
5423 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
5424 Builder.CreateCondBr(Cmp, Body, Exit);
5425
5426 Builder.SetInsertPoint(Body);
5427 Builder.CreateBr(Latch);
5428
5429 Builder.SetInsertPoint(Latch);
5430 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
5431 "omp_" + Name + ".next", /*HasNUW=*/true);
5432 Builder.CreateBr(Header);
5433 IndVarPHI->addIncoming(Next, Latch);
5434
5435 Builder.SetInsertPoint(Exit);
5436 Builder.CreateBr(After);
5437
5438 // Remember and return the canonical control flow.
5439 LoopInfos.emplace_front();
5440 CanonicalLoopInfo *CL = &LoopInfos.front();
5441
5442 CL->Header = Header;
5443 CL->Cond = Cond;
5444 CL->Latch = Latch;
5445 CL->Exit = Exit;
5446
5447#ifndef NDEBUG
5448 CL->assertOK();
5449#endif
5450 return CL;
5451}
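// The emitted skeleton has the following shape (user code is inserted into
// `body` later; `%iv` is the PHI created in `header`):
//
//   preheader: br header
//   header:    %iv = phi [0, preheader], [%iv.next, latch]; br cond
//   cond:      br (%iv u< %tripcount), body, exit
//   body:      br latch
//   latch:     %iv.next = add nuw %iv, 1; br header
//   exit:      br after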
5452
5453Expected<CanonicalLoopInfo *>
5454OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
5455 LoopBodyGenCallbackTy BodyGenCB,
5456 Value *TripCount, const Twine &Name) {
5457 BasicBlock *BB = Loc.IP.getBlock();
5458 BasicBlock *NextBB = BB->getNextNode();
5459
5460 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
5461 NextBB, NextBB, Name);
5462 BasicBlock *After = CL->getAfter();
5463
5464 // If location is not set, don't connect the loop.
5465 if (updateToLocation(Loc)) {
5466 // Split the loop at the insertion point: Branch to the preheader and move
5467 // every following instruction to after the loop (the After BB). Also, the
5468 // new successor is the loop's after block.
5469 spliceBB(Builder, After, /*CreateBranch=*/false);
5470 Builder.CreateBr(CL->getPreheader());
5471 }
5472
5473 // Emit the body content. We do it after connecting the loop to the CFG so
5474 // that the callback does not encounter degenerate BBs.
5475 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
5476 return Err;
5477
5478#ifndef NDEBUG
5479 CL->assertOK();
5480#endif
5481 return CL;
5482}
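// A minimal usage sketch (assuming `OMPBuilder`, a valid `Loc`, and an i32
// trip count `N`); the callback receives the body insertion point and the
// induction variable in [0, N):
//
//   auto BodyCB = [&](OpenMPIRBuilder::InsertPointTy IP,
//                     Value *IV) -> Error {
//     Builder.restoreIP(IP);
//     // ... emit one iteration's body using IV ...
//     return Error::success();
//   };
//   Expected<CanonicalLoopInfo *> CLI =
//       OMPBuilder.createCanonicalLoop(Loc, BodyCB, N);
//   if (!CLI)
//     return CLI.takeError();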
5483
5484Expected<ScanInfo *> OpenMPIRBuilder::scanInfoInitialize() {
5485 ScanInfos.emplace_front();
5486 ScanInfo *Result = &ScanInfos.front();
5487 return Result;
5488}
5489
5490Expected<SmallVector<llvm::CanonicalLoopInfo *>>
5491OpenMPIRBuilder::createCanonicalScanLoops(
5492 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
5493 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
5494 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
5495 LocationDescription ComputeLoc =
5496 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5497 updateToLocation(ComputeLoc);
5498
5499 SmallVector<llvm::CanonicalLoopInfo *> Result;
5500
5501 Value *TripCount = calculateCanonicalLoopTripCount(
5502 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
5503 ScanRedInfo->Span = TripCount;
5504 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
5505 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
5506
5507 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
5508 Builder.restoreIP(CodeGenIP);
5509 ScanRedInfo->IV = IV;
5510 createScanBBs(ScanRedInfo);
5511 BasicBlock *InputBlock = Builder.GetInsertBlock();
5512 Instruction *Terminator = InputBlock->getTerminator();
5513 assert(Terminator->getNumSuccessors() == 1);
5514 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
5515 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
5516 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
5517 Builder.GetInsertBlock()->getParent());
5518 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
5519 emitBlock(ScanRedInfo->OMPScanLoopExit,
5520 Builder.GetInsertBlock()->getParent());
5521 Builder.CreateBr(ContinueBlock);
5522 Builder.SetInsertPoint(
5523 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
5524 return BodyGenCB(Builder.saveIP(), IV);
5525 };
5526
5527 const auto &&InputLoopGen = [&]() -> Error {
5528 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
5529 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
5530 ComputeIP, Name, true, ScanRedInfo);
5531 if (!LoopInfo)
5532 return LoopInfo.takeError();
5533 Result.push_back(*LoopInfo);
5534 Builder.restoreIP((*LoopInfo)->getAfterIP());
5535 return Error::success();
5536 };
5537 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
5538 Expected<CanonicalLoopInfo *> LoopInfo =
5539 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
5540 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
5541 if (!LoopInfo)
5542 return LoopInfo.takeError();
5543 Result.push_back(*LoopInfo);
5544 Builder.restoreIP((*LoopInfo)->getAfterIP());
5545 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
5546 return Error::success();
5547 };
5548 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
5549 if (Err)
5550 return Err;
5551 return Result;
5552}
5553
5554Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
5555 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
5556 bool IsSigned, bool InclusiveStop, const Twine &Name) {
5557
5558 // Consider the following difficulties (assuming 8-bit signed integers):
5559 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
5560 // DO I = 1, 100, 50
5561 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
5562 // DO I = 100, 0, -128
5563
5564 // Start, Stop and Step must be of the same integer type.
5565 auto *IndVarTy = cast<IntegerType>(Start->getType());
5566 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
5567 assert(IndVarTy == Step->getType() && "Step type mismatch");
5568
5569 updateToLocation(Loc);
5570
5571 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
5572 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
5573
5574 // Like Step, but always positive.
5575 Value *Incr = Step;
5576
5577 // Distance between Start and Stop; always positive.
5578 Value *Span;
5579
5580 // Condition indicating whether no iterations are executed at all, e.g.
5581 // because UB < LB.
5582 Value *ZeroCmp;
5583
5584 if (IsSigned) {
5585 // Ensure that increment is positive. If not, negate and invert LB and UB.
5586 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
5587 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
5588 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
5589 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
5590 Span = Builder.CreateSub(UB, LB, "", false, true);
5591 ZeroCmp = Builder.CreateICmp(
5592 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
5593 } else {
5594 Span = Builder.CreateSub(Stop, Start, "", true);
5595 ZeroCmp = Builder.CreateICmp(
5596 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
5597 }
5598
5599 Value *CountIfLooping;
5600 if (InclusiveStop) {
5601 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
5602 } else {
5603 // Avoid incrementing past stop since it could overflow.
5604 Value *CountIfTwo = Builder.CreateAdd(
5605 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
5606 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
5607 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
5608 }
5609
5610 return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
5611 "omp_" + Name + ".tripcount");
5612}
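// Worked example for the signed, inclusive case `DO I = 1, 100, 50`
// (Start=1, Stop=100, Step=50): Incr=50, LB=1, UB=100, so Span=99 and
// ZeroCmp=(100 < 1)=false; CountIfLooping = 99 udiv 50 + 1 = 2, i.e. the
// canonical loop runs twice (I = 1 and I = 51; 101 would pass Stop).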
5613
5614Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
5615 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
5616 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
5617 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
5618 ScanInfo *ScanRedInfo) {
5619 LocationDescription ComputeLoc =
5620 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5621
5622 Value *TripCount = calculateCanonicalLoopTripCount(
5623 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
5624
5625 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
5626 Builder.restoreIP(CodeGenIP);
5627 Value *Span = Builder.CreateMul(IV, Step);
5628 Value *IndVar = Builder.CreateAdd(Span, Start);
5629 if (InScan)
5630 ScanRedInfo->IV = IndVar;
5631 return BodyGenCB(Builder.saveIP(), IndVar);
5632 };
5633 LocationDescription LoopLoc =
5634 ComputeIP.isSet()
5635 ? Loc
5636 : LocationDescription(Builder.saveIP(),
5637 Builder.getCurrentDebugLocation());
5638 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
5639}
5640
5641// Returns an LLVM function to call for initializing loop bounds using OpenMP
5642// static scheduling for composite `distribute parallel for` depending on
5643// `type`. Only i32 and i64 are supported by the runtime. Always interpret
5644// integers as unsigned similarly to CanonicalLoopInfo.
5645static FunctionCallee
5646getKmpcDistForStaticInitForType(Type *Ty, Module &M,
5647 OpenMPIRBuilder &OMPBuilder) {
5648 unsigned Bitwidth = Ty->getIntegerBitWidth();
5649 if (Bitwidth == 32)
5650 return OMPBuilder.getOrCreateRuntimeFunction(
5651 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
5652 if (Bitwidth == 64)
5653 return OMPBuilder.getOrCreateRuntimeFunction(
5654 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
5655 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5656}
5657
5658// Returns an LLVM function to call for initializing loop bounds using OpenMP
5659// static scheduling depending on `type`. Only i32 and i64 are supported by the
5660// runtime. Always interpret integers as unsigned similarly to
5661// CanonicalLoopInfo.
5662static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
5663 OpenMPIRBuilder &OMPBuilder) {
5664 unsigned Bitwidth = Ty->getIntegerBitWidth();
5665 if (Bitwidth == 32)
5666 return OMPBuilder.getOrCreateRuntimeFunction(
5667 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
5668 if (Bitwidth == 64)
5669 return OMPBuilder.getOrCreateRuntimeFunction(
5670 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
5671 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5672}
5673
5674OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
5675 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5676 WorksharingLoopType LoopType, bool NeedsBarrier, bool HasDistSchedule,
5677 OMPScheduleType DistScheduleSchedType) {
5678 assert(CLI->isValid() && "Requires a valid canonical loop");
5679 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5680 "Require dedicated allocate IP");
5681
5682 // Set up the source location value for OpenMP runtime.
5683 Builder.restoreIP(CLI->getPreheaderIP());
5684 Builder.SetCurrentDebugLocation(DL);
5685
5686 uint32_t SrcLocStrSize;
5687 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5688 IdentFlag Flag = IdentFlag(0);
5689 switch (LoopType) {
5690 case WorksharingLoopType::ForStaticLoop:
5691 Flag = OMP_IDENT_FLAG_WORK_LOOP;
5692 break;
5693 case WorksharingLoopType::DistributeStaticLoop:
5694 Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE;
5695 break;
5696 case WorksharingLoopType::DistributeForStaticLoop:
5697 Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE | OMP_IDENT_FLAG_WORK_LOOP;
5698 break;
5699 }
5700 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize, Flag);
5701
5702 // Declare useful OpenMP runtime functions.
5703 Value *IV = CLI->getIndVar();
5704 Type *IVTy = IV->getType();
5705 FunctionCallee StaticInit =
5706 LoopType == WorksharingLoopType::DistributeForStaticLoop
5707 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
5708 : getKmpcForStaticInitForType(IVTy, M, *this);
5709 FunctionCallee StaticFini =
5710 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5711
5712 // Allocate space for computed loop bounds as expected by the "init" function.
5713 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5714
5715 Type *I32Type = Type::getInt32Ty(M.getContext());
5716 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5717 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5718 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5719 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5720 CLI->setLastIter(PLastIter);
5721
5722 // At the end of the preheader, prepare for calling the "init" function by
5723 // storing the current loop bounds into the allocated space. A canonical loop
5724 // always iterates from 0 to trip-count with step 1. Note that "init" expects
5725 // and produces an inclusive upper bound.
5726 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5727 Constant *Zero = ConstantInt::get(IVTy, 0);
5728 Constant *One = ConstantInt::get(IVTy, 1);
5729 Builder.CreateStore(Zero, PLowerBound);
5730 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
5731 Builder.CreateStore(UpperBound, PUpperBound);
5732 Builder.CreateStore(One, PStride);
5733
5734 Value *ThreadNum =
5735 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
5736
5737 OMPScheduleType SchedType =
5738 (LoopType == WorksharingLoopType::DistributeStaticLoop)
5739 ? OMPScheduleType::OrderedDistribute
5740 : OMPScheduleType::UnorderedStatic;
5741 Constant *SchedulingType =
5742 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5743
5744 // Call the "init" function and update the trip count of the loop with the
5745 // value it produced.
5746 auto BuildInitCall = [LoopType, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5747 PUpperBound, IVTy, PStride, One, Zero, StaticInit,
5748 this](Value *SchedulingType, auto &Builder) {
5749 SmallVector<Value *, 10> Args({SrcLoc, ThreadNum, SchedulingType, PLastIter,
5750 PLowerBound, PUpperBound});
5751 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5752 Value *PDistUpperBound =
5753 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
5754 Args.push_back(PDistUpperBound);
5755 }
5756 Args.append({PStride, One, Zero});
5757 createRuntimeFunctionCall(StaticInit, Args);
5758 };
5759 BuildInitCall(SchedulingType, Builder);
5760 if (HasDistSchedule &&
5761 LoopType != WorksharingLoopType::DistributeStaticLoop) {
5762 Constant *DistScheduleSchedType = ConstantInt::get(
5763 I32Type, static_cast<int>(omp::OMPScheduleType::OrderedDistribute));
5764 // Emit a second init function call for the dist_schedule clause on the
5765 // distribute construct. This should only be done, however, if a
5766 // worksharing loop is nested within a distribute construct.
5767 BuildInitCall(DistScheduleSchedType, Builder);
5768 }
5769 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
5770 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
5771 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
5772 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
5773 CLI->setTripCount(TripCount);
5774
5775 // Update all uses of the induction variable except the one in the condition
5776 // block that compares it with the actual upper bound, and the increment in
5777 // the latch block.
5778
5779 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
5780 Builder.SetInsertPoint(CLI->getBody(),
5781 CLI->getBody()->getFirstInsertionPt());
5782 Builder.SetCurrentDebugLocation(DL);
5783 return Builder.CreateAdd(OldIV, LowerBound);
5784 });
5785
5786 // In the "exit" block, call the "fini" function.
5787 Builder.SetInsertPoint(CLI->getExit(),
5788 CLI->getExit()->getTerminator()->getIterator());
5789 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5790
5791 // Add the barrier if requested.
5792 if (NeedsBarrier) {
5793 InsertPointOrErrorTy BarrierIP =
5794 createBarrier(LocationDescription(Builder.saveIP(), DL),
5795 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5796 /* CheckCancelFlag */ false);
5797 if (!BarrierIP)
5798 return BarrierIP.takeError();
5799 }
5800
5801 InsertPointTy AfterIP = CLI->getAfterIP();
5802 CLI->invalidate();
5803
5804 return AfterIP;
5805}
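// Net effect for a plain worksharing loop (a sketch of the emitted runtime
// call sequence around the canonical loop; 34 is kmp_sch_static, and the
// exact constant depends on the schedule chosen above):
//
//   call void @__kmpc_for_static_init_4u(ptr %loc, i32 %tid, i32 34,
//       ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound,
//       ptr %p.stride, i32 1, i32 0)
//   ; body runs with the IV remapped to %lowerbound + %iv
//   call void @__kmpc_for_static_fini(ptr %loc, i32 %tid)
//   ; optional implicit barrier if NeedsBarrier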
5806
5807static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup,
5808 LoopInfo &LI);
5809static void addLoopMetadata(CanonicalLoopInfo *Loop,
5810 ArrayRef<Metadata *> Properties);
5811
5812static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI,
5813 LLVMContext &Ctx, Loop *Loop,
5814 LoopInfo &LoopInfo,
5815 SmallVector<Metadata *> &LoopMDList) {
5816 SmallSet<BasicBlock *, 8> Reachable;
5817
5818 // Get the basic blocks from the loop in which memref instructions
5819 // can be found.
5820 // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
5821 // preferably without running any passes.
5822 for (BasicBlock *Block : Loop->getBlocks()) {
5823 if (Block == CLI->getCond() || Block == CLI->getHeader())
5824 continue;
5825 Reachable.insert(Block);
5826 }
5827
5828 // Add access group metadata to memory-access instructions.
5829 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5830 for (BasicBlock *BB : Reachable)
5831 addAccessGroupMetadata(BB, AccessGroup, LoopInfo);
5832 // TODO: If the loop has existing parallel access metadata, have
5833 // to combine two lists.
5834 LoopMDList.push_back(MDNode::get(
5835 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5836}
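// The resulting annotations look roughly like (a sketch):
//
//   %v = load i32, ptr %p, !llvm.access.group !1
//   br label %header, !llvm.loop !2
//   !1 = distinct !{}
//   !2 = distinct !{!2, !3}
//   !3 = !{!"llvm.loop.parallel_accesses", !1}
//
// telling later passes that the grouped accesses carry no loop-carried
// dependences.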
5837
5839OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
5840 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5841 bool NeedsBarrier, Value *ChunkSize, OMPScheduleType SchedType,
5842 Value *DistScheduleChunkSize, OMPScheduleType DistScheduleSchedType) {
5843 assert(CLI->isValid() && "Requires a valid canonical loop");
5844 assert((ChunkSize || DistScheduleChunkSize) && "Chunk size is required");
5845
5846 LLVMContext &Ctx = CLI->getFunction()->getContext();
5847 Value *IV = CLI->getIndVar();
5848 Value *OrigTripCount = CLI->getTripCount();
5849 Type *IVTy = IV->getType();
5850 assert(IVTy->getIntegerBitWidth() <= 64 &&
5851 "Max supported tripcount bitwidth is 64 bits");
5852 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
5853 : Type::getInt64Ty(Ctx);
5854 Type *I32Type = Type::getInt32Ty(M.getContext());
5855 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
5856 Constant *One = ConstantInt::get(InternalIVTy, 1);
5857
5858 Function *F = CLI->getFunction();
5859 // Blocks must have terminators.
5860 // FIXME: Don't run analyses on incomplete/invalid IR.
5861 SmallVector<UnreachableInst *, 8> UIs;
5862 for (BasicBlock &BB : *F)
5863 if (!BB.hasTerminator())
5864 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
5865 FunctionAnalysisManager FAM;
5866 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5867 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5868 LoopAnalysis LIA;
5869 LoopInfo &&LI = LIA.run(*F, FAM);
5870 for (Instruction *I : UIs)
5871 I->eraseFromParent();
5872 Loop *L = LI.getLoopFor(CLI->getHeader());
5873 SmallVector<Metadata *> LoopMDList;
5874 if (ChunkSize || DistScheduleChunkSize)
5875 applyParallelAccessesMetadata(CLI, Ctx, L, LI, LoopMDList);
5876 addLoopMetadata(CLI, LoopMDList);
5877
5878 // Declare useful OpenMP runtime functions.
5879 FunctionCallee StaticInit =
5880 getKmpcForStaticInitForType(InternalIVTy, M, *this);
5881 FunctionCallee StaticFini =
5882 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5883
5884 // Allocate space for computed loop bounds as expected by the "init" function.
5885 Builder.restoreIP(AllocaIP);
5886 Builder.SetCurrentDebugLocation(DL);
5887 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5888 Value *PLowerBound =
5889 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
5890 Value *PUpperBound =
5891 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
5892 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
5893 CLI->setLastIter(PLastIter);
5894
5895 // Set up the source location value for the OpenMP runtime.
5896 Builder.restoreIP(CLI->getPreheaderIP());
5897 Builder.SetCurrentDebugLocation(DL);
5898
5899 // TODO: Detect overflow in ubsan or max-out with current tripcount.
5900 Value *CastedChunkSize = Builder.CreateZExtOrTrunc(
5901 ChunkSize ? ChunkSize : Zero, InternalIVTy, "chunksize");
5902 Value *CastedDistScheduleChunkSize = Builder.CreateZExtOrTrunc(
5903 DistScheduleChunkSize ? DistScheduleChunkSize : Zero, InternalIVTy,
5904 "distschedulechunksize");
5905 Value *CastedTripCount =
5906 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
5907
5908 Constant *SchedulingType =
5909 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5910 Constant *DistSchedulingType =
5911 ConstantInt::get(I32Type, static_cast<int>(DistScheduleSchedType));
5912 Builder.CreateStore(Zero, PLowerBound);
5913 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
5914 Value *IsTripCountZero = Builder.CreateICmpEQ(CastedTripCount, Zero);
5915 Value *UpperBound =
5916 Builder.CreateSelect(IsTripCountZero, Zero, OrigUpperBound);
5917 Builder.CreateStore(UpperBound, PUpperBound);
5918 Builder.CreateStore(One, PStride);
5919
5920 // Call the "init" function and update the trip count of the loop with the
5921 // value it produced.
5922 uint32_t SrcLocStrSize;
5923 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5924 IdentFlag Flag = OMP_IDENT_FLAG_WORK_LOOP;
5925 if (DistScheduleSchedType != OMPScheduleType::None) {
5926 Flag |= OMP_IDENT_FLAG_WORK_DISTRIBUTE;
5927 }
5928 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize, Flag);
5929 Value *ThreadNum =
5930 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
5931 auto BuildInitCall = [StaticInit, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5932 PUpperBound, PStride, One,
5933 this](Value *SchedulingType, Value *ChunkSize,
5934 auto &Builder) {
5935 createRuntimeFunctionCall(
5936 StaticInit, {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
5937 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
5938 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
5939 /*pstride=*/PStride, /*incr=*/One,
5940 /*chunk=*/ChunkSize});
5941 };
5942 BuildInitCall(SchedulingType, CastedChunkSize, Builder);
5943 if (DistScheduleSchedType != OMPScheduleType::None &&
5944 SchedType != OMPScheduleType::OrderedDistributeChunked &&
5945 SchedType != OMPScheduleType::OrderedDistribute) {
5946 // Emit a second init call for the dist_schedule clause on the distribute
5947 // construct. This should only be done, however, if a worksharing loop is
5948 // nested within a distribute construct.
5949 BuildInitCall(DistSchedulingType, CastedDistScheduleChunkSize, Builder);
5950 }
5951
5952 // Load values written by the "init" function.
5953 Value *FirstChunkStart =
5954 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
5955 Value *FirstChunkStop =
5956 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
5957 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
5958 Value *ChunkRange =
5959 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
5960 Value *NextChunkStride =
5961 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
5962
5963 // Create outer "dispatch" loop for enumerating the chunks.
5964 BasicBlock *DispatchEnter = splitBB(Builder, true);
5965 Value *DispatchCounter;
5966
5967 // It is safe to assume this didn't return an error because the callback
5968 // passed into createCanonicalLoop is the only possible error source, and it
5969 // always returns success.
5970 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
5971 {Builder.saveIP(), DL},
5972 [&](InsertPointTy BodyIP, Value *Counter) {
5973 DispatchCounter = Counter;
5974 return Error::success();
5975 },
5976 FirstChunkStart, CastedTripCount, NextChunkStride,
5977 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
5978 "dispatch"));
5979
5980 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
5981 // not have to preserve the canonical invariant.
5982 BasicBlock *DispatchBody = DispatchCLI->getBody();
5983 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
5984 BasicBlock *DispatchExit = DispatchCLI->getExit();
5985 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
5986 DispatchCLI->invalidate();
5987
5988 // Rewire the original loop to become the chunk loop inside the dispatch loop.
5989 redirectTo(DispatchAfter, CLI->getAfter(), DL);
5990 redirectTo(CLI->getExit(), DispatchLatch, DL);
5991 redirectTo(DispatchBody, DispatchEnter, DL);
5992
5993 // Prepare the prolog of the chunk loop.
5994 Builder.restoreIP(CLI->getPreheaderIP());
5995 Builder.SetCurrentDebugLocation(DL);
5996
5997 // Compute the number of iterations of the chunk loop.
5998 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5999 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
6000 Value *IsLastChunk =
6001 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
6002 Value *CountUntilOrigTripCount =
6003 Builder.CreateSub(CastedTripCount, DispatchCounter);
6004 Value *ChunkTripCount = Builder.CreateSelect(
6005 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
6006 Value *BackcastedChunkTC =
6007 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
6008 CLI->setTripCount(BackcastedChunkTC);
6009
6010 // Update all uses of the induction variable except the one in the condition
6011 // block that compares it with the actual upper bound, and the increment in
6012 // the latch block.
6013 Value *BackcastedDispatchCounter =
6014 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
6015 CLI->mapIndVar([&](Instruction *) -> Value * {
6016 Builder.restoreIP(CLI->getBodyIP());
6017 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
6018 });
6019
6020 // In the "exit" block, call the "fini" function.
6021 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
6022 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
6023
6024 // Add the barrier if requested.
6025 if (NeedsBarrier) {
6026 InsertPointOrErrorTy AfterIP =
6027 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
6028 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
6029 if (!AfterIP)
6030 return AfterIP.takeError();
6031 }
6032
6033#ifndef NDEBUG
6034 // Even though we currently do not support applying additional methods to it,
6035 // the chunk loop should remain a canonical loop.
6036 CLI->assertOK();
6037#endif
6038
6039 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
6040}
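// Informal sketch of the control flow emitted above (block names differ in
// the actual IR):
//   preheader:   __kmpc_for_static_init_{4u,8u}(loc, tid, schedtype, &lastiter,
//                                               &lb, &ub, &stride, 1, chunk)
//   dispatch:    for (cnt = lb; cnt < tripcount; cnt += stride)
//     chunk:       for (iv = 0; iv < min(chunkrange, tripcount - cnt); ++iv)
//                    body(cnt + iv)
//   exit:        __kmpc_for_static_fini(loc, tid)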
6041
6042// Returns an LLVM function to call for executing an OpenMP static worksharing
6043// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
6044// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
6045static FunctionCallee
6046 getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
6047 WorksharingLoopType LoopType) {
6048 unsigned Bitwidth = Ty->getIntegerBitWidth();
6049 Module &M = OMPBuilder->M;
6050 switch (LoopType) {
6051 case WorksharingLoopType::ForStaticLoop:
6052 if (Bitwidth == 32)
6053 return OMPBuilder->getOrCreateRuntimeFunction(
6054 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
6055 if (Bitwidth == 64)
6056 return OMPBuilder->getOrCreateRuntimeFunction(
6057 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
6058 break;
6059 case WorksharingLoopType::DistributeStaticLoop:
6060 if (Bitwidth == 32)
6061 return OMPBuilder->getOrCreateRuntimeFunction(
6062 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
6063 if (Bitwidth == 64)
6064 return OMPBuilder->getOrCreateRuntimeFunction(
6065 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
6066 break;
6067 case WorksharingLoopType::DistributeForStaticLoop:
6068 if (Bitwidth == 32)
6069 return OMPBuilder->getOrCreateRuntimeFunction(
6070 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
6071 if (Bitwidth == 64)
6072 return OMPBuilder->getOrCreateRuntimeFunction(
6073 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
6074 break;
6075 }
6076 if (Bitwidth != 32 && Bitwidth != 64) {
6077 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
6078 }
6079 llvm_unreachable("Unknown type of OpenMP worksharing loop");
6080}
6081
6082 // Inserts a call to the proper OpenMP device RTL function which handles
6083 // loop worksharing.
6084 static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
6085 WorksharingLoopType LoopType,
6086 BasicBlock *InsertBlock, Value *Ident,
6087 Value *LoopBodyArg, Value *TripCount,
6088 Function &LoopBodyFn, bool NoLoop) {
6089 Type *TripCountTy = TripCount->getType();
6090 Module &M = OMPBuilder->M;
6091 IRBuilder<> &Builder = OMPBuilder->Builder;
6092 FunctionCallee RTLFn =
6093 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
6094 SmallVector<Value *, 8> RealArgs;
6095 RealArgs.push_back(Ident);
6096 RealArgs.push_back(&LoopBodyFn);
6097 RealArgs.push_back(LoopBodyArg);
6098 RealArgs.push_back(TripCount);
6099 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
6100 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
6101 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
6102 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
6103 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
6104 return;
6105 }
6106 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
6107 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
6108 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
6109 Value *NumThreads = OMPBuilder->createRuntimeFunctionCall(RTLNumThreads, {});
6110
6111 RealArgs.push_back(
6112 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
6113 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
6114 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
6115 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
6116 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop));
6117 } else {
6118 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
6119 }
6120
6121 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
6122}
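// For a plain "for" worksharing loop with a 32-bit trip count, the emitted
// call looks roughly like this (a sketch; the authoritative runtime
// signatures live in OMPKinds.def):
//   %nt = call i32 @omp_get_num_threads()
//   call void @__kmpc_for_static_loop_4u(ptr %ident, ptr @outlined.body,
//                                        ptr %body.args, i32 %tripcount,
//                                        i32 %nt, i32 0, i8 0)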
6123
6124 static void workshareLoopTargetCallback(
6125 OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
6126 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
6127 WorksharingLoopType LoopType, bool NoLoop) {
6128 IRBuilder<> &Builder = OMPIRBuilder->Builder;
6129 BasicBlock *Preheader = CLI->getPreheader();
6130 Value *TripCount = CLI->getTripCount();
6131
6132 // After loop body outlining, the loop body contains only the setup of the
6133 // loop body argument structure and the call to the outlined loop body
6134 // function. First, we need to move the setup of the loop body arguments
6135 // into the loop preheader.
6136 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
6137 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
6138
6139 // The next step is to remove the whole loop; we do not need it anymore.
6140 // That's why we make an unconditional branch from the loop preheader to the
6141 // loop exit block.
6142 Builder.restoreIP({Preheader, Preheader->end()});
6143 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
6144 Preheader->getTerminator()->eraseFromParent();
6145 Builder.CreateBr(CLI->getExit());
6146
6147 // Delete dead loop blocks
6148 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
6149 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
6150 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
6151 CleanUpInfo.EntryBB = CLI->getHeader();
6152 CleanUpInfo.ExitBB = CLI->getExit();
6153 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
6154 DeleteDeadBlocks(BlocksToBeRemoved);
6155
6156 // Find the instruction which corresponds to the loop body argument
6157 // structure and remove the call to the loop body function.
6158 Value *LoopBodyArg;
6159 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
6160 assert(OutlinedFnUser &&
6161 "Expected unique undroppable user of outlined function");
6162 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
6163 assert(OutlinedFnCallInstruction && "Expected outlined function call");
6164 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
6165 "Expected outlined function call to be located in loop preheader");
6166 // Check in case no argument structure has been passed.
6167 if (OutlinedFnCallInstruction->arg_size() > 1)
6168 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
6169 else
6170 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
6171 OutlinedFnCallInstruction->eraseFromParent();
6172
6173 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
6174 LoopBodyArg, TripCount, OutlinedFn, NoLoop);
6175
6176 for (auto &ToBeDeletedItem : ToBeDeleted)
6177 ToBeDeletedItem->eraseFromParent();
6178 CLI->invalidate();
6179}
6180
6181OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
6182 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
6183 WorksharingLoopType LoopType, bool NoLoop) {
6184 uint32_t SrcLocStrSize;
6185 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
6186 IdentFlag Flag = IdentFlag(0);
6187 switch (LoopType) {
6188 case WorksharingLoopType::ForStaticLoop:
6189 Flag = OMP_IDENT_FLAG_WORK_LOOP;
6190 break;
6191 case WorksharingLoopType::DistributeStaticLoop:
6192 Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE;
6193 break;
6194 case WorksharingLoopType::DistributeForStaticLoop:
6195 Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE | OMP_IDENT_FLAG_WORK_LOOP;
6196 break;
6197 }
6198 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize, Flag);
6199
6200 auto OI = std::make_unique<OutlineInfo>();
6201 OI->OuterAllocaBB = CLI->getPreheader();
6202 Function *OuterFn = CLI->getPreheader()->getParent();
6203
6204 // Instructions which need to be deleted at the end of code generation
6205 SmallVector<Instruction *, 4> ToBeDeleted;
6206
6207 OI->OuterAllocaBB = AllocaIP.getBlock();
6208
6209 // Mark the loop body as the region which needs to be extracted
6210 OI->EntryBB = CLI->getBody();
6211 OI->ExitBB = CLI->getLatch()->splitBasicBlockBefore(CLI->getLatch()->begin(),
6212 "omp.prelatch");
6213
6214 // Prepare loop body for extraction
6215 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
6216
6217 // Insert a new loop counter variable which will be used only in the loop
6218 // body.
6219 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
6220 Instruction *NewLoopCntLoad =
6221 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
6222 // New loop counter instructions are redundant in the loop preheader when
6223 // code generation for the workshare loop is finished. That's why we mark
6224 // them as ready for deletion.
6225 ToBeDeleted.push_back(NewLoopCntLoad);
6226 ToBeDeleted.push_back(NewLoopCnt);
6227
6228 // Analyse loop body region. Find all input variables which are used inside
6229 // loop body region.
6230 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
6231 SmallVector<BasicBlock *, 32> Blocks;
6232 OI->collectBlocks(ParallelRegionBlockSet, Blocks);
6233
6234 CodeExtractorAnalysisCache CEAC(*OuterFn);
6235 CodeExtractor Extractor(Blocks,
6236 /* DominatorTree */ nullptr,
6237 /* AggregateArgs */ true,
6238 /* BlockFrequencyInfo */ nullptr,
6239 /* BranchProbabilityInfo */ nullptr,
6240 /* AssumptionCache */ nullptr,
6241 /* AllowVarArgs */ true,
6242 /* AllowAlloca */ true,
6243 /* AllocationBlock */ CLI->getPreheader(),
6244 /* DeallocationBlocks */ {},
6245 /* Suffix */ ".omp_wsloop",
6246 /* AggrArgsIn0AddrSpace */ true);
6247
6248 BasicBlock *CommonExit = nullptr;
6249 SetVector<Value *> SinkingCands, HoistingCands;
6250
6251 // Find allocas outside the loop body region which are used inside loop
6252 // body
6253 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
6254
6255 // We need to model the loop body region as the function f(cnt, loop_arg).
6256 // That's why we replace the loop induction variable with the new counter,
6257 // which will be one of the loop body function arguments.
6257 // which will be one of loop body function argument
6258 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
6259 CLI->getIndVar()->user_end());
6260 for (auto Use : Users) {
6261 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
6262 if (ParallelRegionBlockSet.count(Inst->getParent())) {
6263 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
6264 }
6265 }
6266 }
6267 // Make sure that the loop counter variable is not merged into the loop body
6268 // function argument structure and that it is passed as a separate variable.
6269 OI->ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
6270
6271 // The PostOutline callback is invoked when the loop body function has been
6272 // outlined and the loop body replaced by a call to the outlined function.
6273 // We need to add a call to the OpenMP device RTL in the loop preheader.
6274 // The OpenMP device RTL function will handle the loop control logic.
6275 //
6276 OI->PostOutlineCB = [=, ToBeDeletedVec =
6277 std::move(ToBeDeleted)](Function &OutlinedFn) {
6278 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
6279 LoopType, NoLoop);
6280 };
6281 addOutlineInfo(std::move(OI));
6282 return CLI->getAfterIP();
6283}
6284
6285 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
6286 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
6287 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
6288 bool HasSimdModifier, bool HasMonotonicModifier,
6289 bool HasNonmonotonicModifier, bool HasOrderedClause,
6290 WorksharingLoopType LoopType, bool NoLoop, bool HasDistSchedule,
6291 Value *DistScheduleChunkSize) {
6292 if (Config.isTargetDevice())
6293 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
6294 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
6295 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
6296 HasNonmonotonicModifier, HasOrderedClause, DistScheduleChunkSize);
6297
6298 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
6299 OMPScheduleType::ModifierOrdered;
6300 OMPScheduleType DistScheduleSchedType = OMPScheduleType::None;
6301 if (HasDistSchedule) {
6302 DistScheduleSchedType = DistScheduleChunkSize
6303 ? OMPScheduleType::OrderedDistributeChunked
6304 : OMPScheduleType::OrderedDistribute;
6305 }
6306 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
6307 case OMPScheduleType::BaseStatic:
6308 case OMPScheduleType::BaseDistribute:
6309 assert((!ChunkSize || !DistScheduleChunkSize) &&
6310 "No chunk size with static-chunked schedule");
6311 if (IsOrdered && !HasDistSchedule)
6312 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6313 NeedsBarrier, ChunkSize);
6314 // FIXME: Monotonicity ignored?
6315 if (DistScheduleChunkSize)
6316 return applyStaticChunkedWorkshareLoop(
6317 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
6318 DistScheduleChunkSize, DistScheduleSchedType);
6319 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier,
6320 HasDistSchedule);
6321
6322 case OMPScheduleType::BaseStaticChunked:
6323 case OMPScheduleType::BaseDistributeChunked:
6324 if (IsOrdered && !HasDistSchedule)
6325 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6326 NeedsBarrier, ChunkSize);
6327 // FIXME: Monotonicity ignored?
6328 return applyStaticChunkedWorkshareLoop(
6329 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
6330 DistScheduleChunkSize, DistScheduleSchedType);
6331
6332 case OMPScheduleType::BaseRuntime:
6333 case OMPScheduleType::BaseAuto:
6334 case OMPScheduleType::BaseGreedy:
6335 case OMPScheduleType::BaseBalanced:
6336 case OMPScheduleType::BaseSteal:
6337 case OMPScheduleType::BaseRuntimeSimd:
6338 assert(!ChunkSize &&
6339 "schedule type does not support user-defined chunk sizes");
6340 [[fallthrough]];
6341 case OMPScheduleType::BaseGuidedSimd:
6342 case OMPScheduleType::BaseDynamicChunked:
6343 case OMPScheduleType::BaseGuidedChunked:
6344 case OMPScheduleType::BaseGuidedIterativeChunked:
6345 case OMPScheduleType::BaseGuidedAnalyticalChunked:
6346 case OMPScheduleType::BaseStaticBalancedChunked:
6347 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6348 NeedsBarrier, ChunkSize);
6349
6350 default:
6351 llvm_unreachable("Unknown/unimplemented schedule kind");
6352 }
6353}
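// Informal dispatch summary: schedule(static) lowers through
// applyStaticWorkshareLoop, schedule(static, c) and chunked dist_schedule
// through applyStaticChunkedWorkshareLoop, and the dynamic, guided, auto and
// runtime schedules through applyDynamicWorkshareLoop; on target devices all
// cases take the applyWorkshareLoopTarget path above instead.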
6354
6355/// Returns an LLVM function to call for initializing loop bounds using OpenMP
6356/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
6357/// the runtime. Always interpret integers as unsigned similarly to
6358/// CanonicalLoopInfo.
6359static FunctionCallee
6360 getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
6361 unsigned Bitwidth = Ty->getIntegerBitWidth();
6362 if (Bitwidth == 32)
6363 return OMPBuilder.getOrCreateRuntimeFunction(
6364 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
6365 if (Bitwidth == 64)
6366 return OMPBuilder.getOrCreateRuntimeFunction(
6367 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
6368 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6369}
6370
6371/// Returns an LLVM function to call for updating the next loop using OpenMP
6372/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
6373/// the runtime. Always interpret integers as unsigned similarly to
6374/// CanonicalLoopInfo.
6375static FunctionCallee
6376 getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
6377 unsigned Bitwidth = Ty->getIntegerBitWidth();
6378 if (Bitwidth == 32)
6379 return OMPBuilder.getOrCreateRuntimeFunction(
6380 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
6381 if (Bitwidth == 64)
6382 return OMPBuilder.getOrCreateRuntimeFunction(
6383 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
6384 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6385}
6386
6387 /// Returns an LLVM function to call for finalizing the dynamic loop,
6388 /// depending on `type`. Only i32 and i64 are supported by the runtime. Always
6389/// interpret integers as unsigned similarly to CanonicalLoopInfo.
6390static FunctionCallee
6391 getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
6392 unsigned Bitwidth = Ty->getIntegerBitWidth();
6393 if (Bitwidth == 32)
6394 return OMPBuilder.getOrCreateRuntimeFunction(
6395 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
6396 if (Bitwidth == 64)
6397 return OMPBuilder.getOrCreateRuntimeFunction(
6398 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
6399 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6400}
6401
6402 OpenMPIRBuilder::InsertPointOrErrorTy
6403 OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
6404 InsertPointTy AllocaIP,
6405 OMPScheduleType SchedType,
6406 bool NeedsBarrier, Value *Chunk) {
6407 assert(CLI->isValid() && "Requires a valid canonical loop");
6408 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
6409 "Require dedicated allocate IP");
6411 "Require valid schedule type");
6412
6413 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
6414 OMPScheduleType::ModifierOrdered;
6415
6416 // Set up the source location value for OpenMP runtime.
6417 Builder.SetCurrentDebugLocation(DL);
6418
6419 uint32_t SrcLocStrSize;
6420 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
6421 Value *SrcLoc =
6422 getOrCreateIdent(SrcLocStr, SrcLocStrSize, OMP_IDENT_FLAG_WORK_LOOP);
6423
6424 // Declare useful OpenMP runtime functions.
6425 Value *IV = CLI->getIndVar();
6426 Type *IVTy = IV->getType();
6427 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
6428 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
6429
6430 // Allocate space for computed loop bounds as expected by the "init" function.
6431 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
6432 Type *I32Type = Type::getInt32Ty(M.getContext());
6433 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
6434 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
6435 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
6436 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
6437 CLI->setLastIter(PLastIter);
6438
6439 // At the end of the preheader, prepare for calling the "init" function by
6440 // storing the current loop bounds into the allocated space. A canonical loop
6441 // always iterates from 0 to trip-count with step 1. Note that "init" expects
6442 // and produces an inclusive upper bound.
6443 BasicBlock *PreHeader = CLI->getPreheader();
6444 Builder.SetInsertPoint(PreHeader->getTerminator());
6445 Constant *One = ConstantInt::get(IVTy, 1);
6446 Builder.CreateStore(One, PLowerBound);
6447 Value *UpperBound = CLI->getTripCount();
6448 Builder.CreateStore(UpperBound, PUpperBound);
6449 Builder.CreateStore(One, PStride);
6450
6451 BasicBlock *Header = CLI->getHeader();
6452 BasicBlock *Exit = CLI->getExit();
6453 BasicBlock *Cond = CLI->getCond();
6454 BasicBlock *Latch = CLI->getLatch();
6455 InsertPointTy AfterIP = CLI->getAfterIP();
6456
6457 // The CLI will be "broken" in the code below, as the loop is no longer
6458 // a valid canonical loop.
6459
6460 if (!Chunk)
6461 Chunk = One;
6462
6463 Value *ThreadNum =
6464 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
6465
6466 Constant *SchedulingType =
6467 ConstantInt::get(I32Type, static_cast<int>(SchedType));
6468
6469 // Call the "init" function.
6470 createRuntimeFunctionCall(DynamicInit, {SrcLoc, ThreadNum, SchedulingType,
6471 /* LowerBound */ One, UpperBound,
6472 /* step */ One, Chunk});
6473
6474 // An outer loop around the existing one.
6475 BasicBlock *OuterCond = BasicBlock::Create(
6476 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
6477 PreHeader->getParent());
6478 // This needs to be 32-bit always, so can't use the IVTy Zero above.
6479 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
6480 Value *Res = createRuntimeFunctionCall(
6481 DynamicNext,
6482 {SrcLoc, ThreadNum, PLastIter, PLowerBound, PUpperBound, PStride});
6483 Constant *Zero32 = ConstantInt::get(I32Type, 0);
6484 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
6485 Value *LowerBound =
6486 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
6487 Builder.CreateCondBr(MoreWork, Header, Exit);
6488
6489 // Change PHI-node in loop header to use outer cond rather than preheader,
6490 // and set IV to the LowerBound.
6491 Instruction *Phi = &Header->front();
6492 auto *PI = cast<PHINode>(Phi);
6493 PI->setIncomingBlock(0, OuterCond);
6494 PI->setIncomingValue(0, LowerBound);
6495
6496 // Then set the pre-header to jump to the OuterCond
6497 Instruction *Term = PreHeader->getTerminator();
6498 auto *Br = cast<BranchInst>(Term);
6499 Br->setSuccessor(0, OuterCond);
6500
6501 // Modify the inner condition:
6502 // * Use the UpperBound returned from the DynamicNext call.
6503 // * Jump to the outer loop when done with one of the inner loops.
6504 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
6505 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
6506 Instruction *Comp = &*Builder.GetInsertPoint();
6507 auto *CI = cast<CmpInst>(Comp);
6508 CI->setOperand(1, UpperBound);
6509 // Redirect the inner exit to branch to outer condition.
6510 Instruction *Branch = &Cond->back();
6511 auto *BI = cast<BranchInst>(Branch);
6512 assert(BI->getSuccessor(1) == Exit);
6513 BI->setSuccessor(1, OuterCond);
6514
6515 // Call the "fini" function if "ordered" is present in wsloop directive.
6516 if (Ordered) {
6517 Builder.SetInsertPoint(&Latch->back());
6518 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
6519 createRuntimeFunctionCall(DynamicFini, {SrcLoc, ThreadNum});
6520 }
6521
6522 // Add the barrier if requested.
6523 if (NeedsBarrier) {
6524 Builder.SetInsertPoint(&Exit->back());
6525 InsertPointOrErrorTy BarrierIP =
6526 createBarrier(LocationDescription(Builder.saveIP(), DL),
6527 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
6528 /* CheckCancelFlag */ false);
6529 if (!BarrierIP)
6530 return BarrierIP.takeError();
6531 }
6532
6533 CLI->invalidate();
6534 return AfterIP;
6535}
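// Informal sketch of the control flow emitted above:
//   preheader:  __kmpc_dispatch_init_{4u,8u}(loc, tid, sched, /*lb=*/1,
//                                            /*ub=*/tripcount, /*step=*/1,
//                                            chunk)
//   outer.cond: more = __kmpc_dispatch_next_{4u,8u}(loc, tid, &lastiter,
//                                                   &lb, &ub, &stride)
//               br more ? header : exit
//   inner loop: iterates from lb-1 up to the inclusive ub written by "next"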
6536
6537/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
6538/// after this \p OldTarget will be orphaned.
6539 static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
6540 BasicBlock *NewTarget, DebugLoc DL) {
6541 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
6542 redirectTo(Pred, NewTarget, DL);
6543}
6544
6545/// Determine which blocks in \p BBs are reachable from outside and remove the
6546/// ones that are not reachable from the function.
6547 static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
6548 SmallSetVector<BasicBlock *, 8> BBsToErase(BBs.begin(), BBs.end());
6549 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
6550 for (Use &U : BB->uses()) {
6551 auto *UseInst = dyn_cast<Instruction>(U.getUser());
6552 if (!UseInst)
6553 continue;
6554 if (BBsToErase.count(UseInst->getParent()))
6555 continue;
6556 return true;
6557 }
6558 return false;
6559 };
6560
6561 while (BBsToErase.remove_if(HasRemainingUses)) {
6562 // Try again if anything was removed.
6563 }
6564
6565 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
6566 DeleteDeadBlocks(BBVec);
6567}
6568
6569CanonicalLoopInfo *
6570 OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
6571 InsertPointTy ComputeIP) {
6572 assert(Loops.size() >= 1 && "At least one loop required");
6573 size_t NumLoops = Loops.size();
6574
6575 // Nothing to do if there is already just one loop.
6576 if (NumLoops == 1)
6577 return Loops.front();
6578
6579 CanonicalLoopInfo *Outermost = Loops.front();
6580 CanonicalLoopInfo *Innermost = Loops.back();
6581 BasicBlock *OrigPreheader = Outermost->getPreheader();
6582 BasicBlock *OrigAfter = Outermost->getAfter();
6583 Function *F = OrigPreheader->getParent();
6584
6585 // Loop control blocks that may become orphaned later.
6586 SmallVector<BasicBlock *, 12> OldControlBBs;
6587 OldControlBBs.reserve(6 * Loops.size());
6588 for (CanonicalLoopInfo *Loop : Loops)
6589 Loop->collectControlBlocks(OldControlBBs);
6590
6591 // Setup the IRBuilder for inserting the trip count computation.
6592 Builder.SetCurrentDebugLocation(DL);
6593 if (ComputeIP.isSet())
6594 Builder.restoreIP(ComputeIP);
6595 else
6596 Builder.restoreIP(Outermost->getPreheaderIP());
6597
6598 // Derive the collapsed loop's trip count.
6599 // TODO: Find common/largest indvar type.
6600 Value *CollapsedTripCount = nullptr;
6601 for (CanonicalLoopInfo *L : Loops) {
6602 assert(L->isValid() &&
6603 "All loops to collapse must be valid canonical loops");
6604 Value *OrigTripCount = L->getTripCount();
6605 if (!CollapsedTripCount) {
6606 CollapsedTripCount = OrigTripCount;
6607 continue;
6608 }
6609
6610 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
6611 CollapsedTripCount =
6612 Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
6613 }
6614
6615 // Create the collapsed loop control flow.
6616 CanonicalLoopInfo *Result =
6617 createLoopSkeleton(DL, CollapsedTripCount, F,
6618 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
6619
6620 // Build the collapsed loop body code.
6621 // Start with deriving the input loop induction variables from the collapsed
6622 // one, using a divmod scheme. To preserve the original loops' order, the
6623 // innermost loop uses the least significant bits.
6624 Builder.restoreIP(Result->getBodyIP());
6625
6626 Value *Leftover = Result->getIndVar();
6627 SmallVector<Value *> NewIndVars;
6628 NewIndVars.resize(NumLoops);
6629 for (int i = NumLoops - 1; i >= 1; --i) {
6630 Value *OrigTripCount = Loops[i]->getTripCount();
6631
6632 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
6633 NewIndVars[i] = NewIndVar;
6634
6635 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
6636 }
6637 // Outermost loop gets all the remaining bits.
6638 NewIndVars[0] = Leftover;
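// Worked example (informal): for a two-loop nest with trip counts M and N,
// the collapsed indvar iv ranges over [0, M*N) and decomposes as
//   NewIndVars[1] = iv % N (innermost)
//   NewIndVars[0] = iv / N (outermost)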
6639
6640 // Construct the loop body control flow.
6641 // We progressively construct the branch structure following the direction of
6642 // the control flow, from the leading in-between code, the loop nest body, the
6643 // trailing in-between code, and rejoining the collapsed loop's latch.
6644 // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
6645 // the ContinueBlock is set, continue with that block. If ContinuePred, use
6646 // its predecessors as sources.
6647 BasicBlock *ContinueBlock = Result->getBody();
6648 BasicBlock *ContinuePred = nullptr;
6649 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
6650 BasicBlock *NextSrc) {
6651 if (ContinueBlock)
6652 redirectTo(ContinueBlock, Dest, DL);
6653 else
6654 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
6655
6656 ContinueBlock = nullptr;
6657 ContinuePred = NextSrc;
6658 };
6659
6660 // The code before the nested loop of each level.
6661 // Because we are sinking it into the nest, it will be executed more often
6662 // than the original loop. More sophisticated schemes could keep track of what
6663 // the in-between code is and instantiate it only once per thread.
6664 for (size_t i = 0; i < NumLoops - 1; ++i)
6665 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
6666
6667 // Connect the loop nest body.
6668 ContinueWith(Innermost->getBody(), Innermost->getLatch());
6669
6670 // The code after the nested loop at each level.
6671 for (size_t i = NumLoops - 1; i > 0; --i)
6672 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
6673
6674 // Connect the finished loop to the collapsed loop latch.
6675 ContinueWith(Result->getLatch(), nullptr);
6676
6677 // Replace the input loops with the new collapsed loop.
6678 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
6679 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
6680
6681 // Replace the input loop indvars with the derived ones.
6682 for (size_t i = 0; i < NumLoops; ++i)
6683 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
6684
6685 // Remove unused parts of the input loops.
6686 removeUnusedBlocksFromParent(OldControlBBs);
6687
6688 for (CanonicalLoopInfo *L : Loops)
6689 L->invalidate();
6690
6691#ifndef NDEBUG
6692 Result->assertOK();
6693#endif
6694 return Result;
6695}
6696
6697std::vector<CanonicalLoopInfo *>
6698 OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
6699 ArrayRef<Value *> TileSizes) {
6700 assert(TileSizes.size() == Loops.size() &&
6701 "Must pass as many tile sizes as there are loops");
6702 int NumLoops = Loops.size();
6703 assert(NumLoops >= 1 && "At least one loop to tile required");
6704
6705 CanonicalLoopInfo *OutermostLoop = Loops.front();
6706 CanonicalLoopInfo *InnermostLoop = Loops.back();
6707 Function *F = OutermostLoop->getBody()->getParent();
6708 BasicBlock *InnerEnter = InnermostLoop->getBody();
6709 BasicBlock *InnerLatch = InnermostLoop->getLatch();
6710
6711 // Loop control blocks that may become orphaned later.
6712 SmallVector<BasicBlock *, 12> OldControlBBs;
6713 OldControlBBs.reserve(6 * Loops.size());
6714 for (CanonicalLoopInfo *Loop : Loops)
6715 Loop->collectControlBlocks(OldControlBBs);
6716
6717 // Collect original trip counts and induction variables to be accessible by
6718 // index. Also, the structure of the original loops is not preserved during
6719 // the construction of the tiled loops, so do it before we scavenge the BBs of
6720 // any original CanonicalLoopInfo.
6721 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
6722 for (CanonicalLoopInfo *L : Loops) {
6723 assert(L->isValid() && "All input loops must be valid canonical loops");
6724 OrigTripCounts.push_back(L->getTripCount());
6725 OrigIndVars.push_back(L->getIndVar());
6726 }
6727
6728 // Collect the code between loop headers. These may contain SSA definitions
6729 // that are used in the loop nest body. To be usable within the innermost
6730 // body, these BasicBlocks will be sunk into the loop nest body. That is,
6731 // these instructions may be executed more often than before the tiling.
6732 // TODO: It would be sufficient to only sink them into body of the
6733 // corresponding tile loop.
6734 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
6735 for (int i = 0; i < NumLoops - 1; ++i) {
6736 CanonicalLoopInfo *Surrounding = Loops[i];
6737 CanonicalLoopInfo *Nested = Loops[i + 1];
6738
6739 BasicBlock *EnterBB = Surrounding->getBody();
6740 BasicBlock *ExitBB = Nested->getHeader();
6741 InbetweenCode.emplace_back(EnterBB, ExitBB);
6742 }
6743
6744 // Compute the trip counts of the floor loops.
6745 Builder.SetCurrentDebugLocation(DL);
6746 Builder.restoreIP(OutermostLoop->getPreheaderIP());
6747 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
6748 for (int i = 0; i < NumLoops; ++i) {
6749 Value *TileSize = TileSizes[i];
6750 Value *OrigTripCount = OrigTripCounts[i];
6751 Type *IVType = OrigTripCount->getType();
6752
6753 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
6754 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
6755
6756 // 0 if tripcount divides the tilesize, 1 otherwise.
6757 // 1 means we need an additional iteration for a partial tile.
6758 //
6759 // Unfortunately we cannot just use the roundup-formula
6760 // (tripcount + tilesize - 1)/tilesize
6761 // because the summation might overflow. We do not want to introduce undefined
6762 // behavior when the untiled loop nest did not.
6763 Value *FloorTripOverflow =
6764 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
6765
6766 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
6767 Value *FloorTripCount =
6768 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
6769 "omp_floor" + Twine(i) + ".tripcount", true);
6770
6771 // Remember some values for later use.
6772 FloorCompleteCount.push_back(FloorCompleteTripCount);
6773 FloorCount.push_back(FloorTripCount);
6774 FloorRems.push_back(FloorTripRem);
6775 }
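// Worked example (informal): for tripcount = 10 and tilesize = 4 this yields
// complete = 2, rem = 2, overflow = 1, so the floor loop runs 3 times: two
// full tiles and one partial tile of 2 iterations.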
6776
6777 // Generate the new loop nest, from the outermost to the innermost.
6778 std::vector<CanonicalLoopInfo *> Result;
6779 Result.reserve(NumLoops * 2);
6780
6781 // The basic block of the surrounding loop that enters the generated loop
6782 // nest.
6783 BasicBlock *Enter = OutermostLoop->getPreheader();
6784
6785 // The basic block of the surrounding loop where the inner code should
6786 // continue.
6787 BasicBlock *Continue = OutermostLoop->getAfter();
6788
6789 // Where the next loop basic block should be inserted.
6790 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
6791
6792 auto EmbeddNewLoop =
6793 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
6794 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
6795 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
6796 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
6797 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
6798 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
6799
6800 // Setup the position where the next embedded loop connects to this loop.
6801 Enter = EmbeddedLoop->getBody();
6802 Continue = EmbeddedLoop->getLatch();
6803 OutroInsertBefore = EmbeddedLoop->getLatch();
6804 return EmbeddedLoop;
6805 };
6806
6807 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
6808 const Twine &NameBase) {
6809 for (auto P : enumerate(TripCounts)) {
6810 CanonicalLoopInfo *EmbeddedLoop =
6811 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
6812 Result.push_back(EmbeddedLoop);
6813 }
6814 };
6815
6816 EmbeddNewLoops(FloorCount, "floor");
6817
6818 // Within the innermost floor loop, emit the code that computes the tile
6819 // sizes.
6820 Builder.SetInsertPoint(Enter->getTerminator());
6821 SmallVector<Value *, 4> TileCounts;
6822 for (int i = 0; i < NumLoops; ++i) {
6823 CanonicalLoopInfo *FloorLoop = Result[i];
6824 Value *TileSize = TileSizes[i];
6825
6826 Value *FloorIsEpilogue =
6827 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
6828 Value *TileTripCount =
6829 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
6830
6831 TileCounts.push_back(TileTripCount);
6832 }
6833
6834 // Create the tile loops.
6835 EmbeddNewLoops(TileCounts, "tile");
6836
6837 // Insert the inbetween code into the body.
6838 BasicBlock *BodyEnter = Enter;
6839 BasicBlock *BodyEntered = nullptr;
6840 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
6841 BasicBlock *EnterBB = P.first;
6842 BasicBlock *ExitBB = P.second;
6843
6844 if (BodyEnter)
6845 redirectTo(BodyEnter, EnterBB, DL);
6846 else
6847 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
6848
6849 BodyEnter = nullptr;
6850 BodyEntered = ExitBB;
6851 }
6852
6853 // Append the original loop nest body into the generated loop nest body.
6854 if (BodyEnter)
6855 redirectTo(BodyEnter, InnerEnter, DL);
6856 else
6857 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
6858 redirectAllPredecessorsTo(InnerLatch, Continue, DL);
6859
6860 // Replace the original induction variable with an induction variable computed
6861 // from the tile and floor induction variables.
6862 Builder.restoreIP(Result.back()->getBodyIP());
6863 for (int i = 0; i < NumLoops; ++i) {
6864 CanonicalLoopInfo *FloorLoop = Result[i];
6865 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
6866 Value *OrigIndVar = OrigIndVars[i];
6867 Value *Size = TileSizes[i];
6868
6869 Value *Scale =
6870 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
6871 Value *Shift =
6872 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
6873 OrigIndVar->replaceAllUsesWith(Shift);
6874 }
6875
6876 // Remove unused parts of the original loops.
6877 removeUnusedBlocksFromParent(OldControlBBs);
6878
6879 for (CanonicalLoopInfo *L : Loops)
6880 L->invalidate();
6881
6882#ifndef NDEBUG
6883 for (CanonicalLoopInfo *GenL : Result)
6884 GenL->assertOK();
6885#endif
6886 return Result;
6887}
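// The returned vector contains the floor loops followed by the tile loops,
// i.e. 2*N entries for N input loops, and each original induction variable
// has been rewritten as, informally,
//   OrigIndVar[i] = TileSizes[i] * FloorIV[i] + TileIV[i]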
6888
6889/// Attach metadata \p Properties to the basic block described by \p BB. If the
6890/// basic block already has metadata, the basic block properties are appended.
6891 static void addBasicBlockMetadata(BasicBlock *BB,
6892 ArrayRef<Metadata *> Properties) {
6893 // Nothing to do if no property to attach.
6894 if (Properties.empty())
6895 return;
6896
6897 LLVMContext &Ctx = BB->getContext();
6898 SmallVector<Metadata *> NewProperties;
6899 NewProperties.push_back(nullptr);
6900
6901 // If the basic block already has metadata, prepend it to the new metadata.
6902 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
6903 if (Existing)
6904 append_range(NewProperties, drop_begin(Existing->operands(), 1));
6905
6906 append_range(NewProperties, Properties);
6907 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
6908 BasicBlockID->replaceOperandWith(0, BasicBlockID);
6909
6910 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
6911}
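// The first operand refers to the metadata node itself, as required for loop
// metadata. On the terminator the result looks like (a sketch):
//   br i1 %cond, ..., !llvm.loop !0
//   !0 = distinct !{!0, <existing properties...>, <new properties...>}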
6912
6913/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
6914/// loop already has metadata, the loop properties are appended.
6915 static void addLoopMetadata(CanonicalLoopInfo *Loop,
6916 ArrayRef<Metadata *> Properties) {
6917 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
6918
6919 // Attach metadata to the loop's latch
6920 BasicBlock *Latch = Loop->getLatch();
6921 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
6922 addBasicBlockMetadata(Latch, Properties);
6923}
6924
6925/// Attach llvm.access.group metadata to the memref instructions of \p Block
6926 static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup,
6927 LoopInfo &LI) {
6928 for (Instruction &I : *Block) {
6929 if (I.mayReadOrWriteMemory()) {
6930 // TODO: This instruction may already have an access group from
6931 // other pragmas, e.g. #pragma clang loop vectorize. Append
6932 // so that the existing metadata is not overwritten.
6933 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
6934 }
6935 }
6936}
6937
6938CanonicalLoopInfo *
6939 OpenMPIRBuilder::fuseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops) {
6940 CanonicalLoopInfo *firstLoop = Loops.front();
6941 CanonicalLoopInfo *lastLoop = Loops.back();
6942 Function *F = firstLoop->getPreheader()->getParent();
6943
6944 // Loop control blocks that will become orphaned later
6945 SmallVector<BasicBlock *> oldControlBBs;
6946 for (CanonicalLoopInfo *Loop : Loops)
6947 Loop->collectControlBlocks(oldControlBBs);
6948
6949 // Collect original trip counts
6950 SmallVector<Value *> origTripCounts;
6951 for (CanonicalLoopInfo *L : Loops) {
6952 assert(L->isValid() && "All input loops must be valid canonical loops");
6953 origTripCounts.push_back(L->getTripCount());
6954 }
6955
6956 Builder.SetCurrentDebugLocation(DL);
6957
6958 // Compute max trip count.
6959 // The fused loop will be from 0 to max(origTripCounts)
6960 BasicBlock *TCBlock = BasicBlock::Create(F->getContext(), "omp.fuse.comp.tc",
6961 F, firstLoop->getHeader());
6962 Builder.SetInsertPoint(TCBlock);
6963 Value *fusedTripCount = nullptr;
6964 for (CanonicalLoopInfo *L : Loops) {
6965 assert(L->isValid() && "All loops to fuse must be valid canonical loops");
6966 Value *origTripCount = L->getTripCount();
6967 if (!fusedTripCount) {
6968 fusedTripCount = origTripCount;
6969 continue;
6970 }
6971 Value *condTP = Builder.CreateICmpSGT(fusedTripCount, origTripCount);
6972 fusedTripCount = Builder.CreateSelect(condTP, fusedTripCount, origTripCount,
6973 ".omp.fuse.tc");
6974 }
6975
6976 // Generate new loop
6977 CanonicalLoopInfo *fused =
6978 createLoopSkeleton(DL, fusedTripCount, F, firstLoop->getBody(),
6979 lastLoop->getLatch(), "fused");
6980
6981 // Replace original loops with the fused loop
6982 // Preheader and After are not considered inside the CLI.
6983 // These are used to compute the individual TCs of the loops
6984 // so they have to be put before the resulting fused loop.
6985 // Moving them up for readability.
6986 for (size_t i = 0; i < Loops.size() - 1; ++i) {
6987 Loops[i]->getPreheader()->moveBefore(TCBlock);
6988 Loops[i]->getAfter()->moveBefore(TCBlock);
6989 }
6990 lastLoop->getPreheader()->moveBefore(TCBlock);
6991
6992 for (size_t i = 0; i < Loops.size() - 1; ++i) {
6993 redirectTo(Loops[i]->getPreheader(), Loops[i]->getAfter(), DL);
6994 redirectTo(Loops[i]->getAfter(), Loops[i + 1]->getPreheader(), DL);
6995 }
6996 redirectTo(lastLoop->getPreheader(), TCBlock, DL);
6997 redirectTo(TCBlock, fused->getPreheader(), DL);
6998 redirectTo(fused->getAfter(), lastLoop->getAfter(), DL);
6999
7000 // Build the fused body
7001 // Create new Blocks with conditions that jump to the original loop bodies
7002 SmallVector<BasicBlock *> condBBs;
7003 SmallVector<Value *> condValues;
7004 for (size_t i = 0; i < Loops.size(); ++i) {
7005 BasicBlock *condBlock = BasicBlock::Create(
7006 F->getContext(), "omp.fused.inner.cond", F, Loops[i]->getBody());
7007 Builder.SetInsertPoint(condBlock);
7008 Value *condValue =
7009 Builder.CreateICmpSLT(fused->getIndVar(), origTripCounts[i]);
7010 condBBs.push_back(condBlock);
7011 condValues.push_back(condValue);
7012 }
7013 // Join the condition blocks with the bodies of the original loops
7014 redirectTo(fused->getBody(), condBBs[0], DL);
7015 for (size_t i = 0; i < Loops.size() - 1; ++i) {
7016 Builder.SetInsertPoint(condBBs[i]);
7017 Builder.CreateCondBr(condValues[i], Loops[i]->getBody(), condBBs[i + 1]);
7018 redirectAllPredecessorsTo(Loops[i]->getLatch(), condBBs[i + 1], DL);
7019 // Replace the IV with the fused IV
7020 Loops[i]->getIndVar()->replaceAllUsesWith(fused->getIndVar());
7021 }
7022 // Last body jumps to the created end body block
7023 Builder.SetInsertPoint(condBBs.back());
7024 Builder.CreateCondBr(condValues.back(), lastLoop->getBody(),
7025 fused->getLatch());
7026 redirectAllPredecessorsTo(lastLoop->getLatch(), fused->getLatch(), DL);
7027 // Replace the IV with the fused IV
7028 lastLoop->getIndVar()->replaceAllUsesWith(fused->getIndVar());
7029
7030 // The loop latch must have only one predecessor. Currently it is branched to
7031 // from both the last condition block and the last loop body
7032 fused->getLatch()->splitBasicBlockBefore(fused->getLatch()->begin(),
7033 "omp.fused.pre_latch");
7034
7035 // Remove unused parts
7036 removeUnusedBlocksFromParent(oldControlBBs);
7037
7038 // Invalidate old CLIs
7039 for (CanonicalLoopInfo *L : Loops)
7040 L->invalidate();
7041
7042#ifndef NDEBUG
7043 fused->assertOK();
7044#endif
7045 return fused;
7046}
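// Informal shape of the fused loop for two inputs with trip counts TC0, TC1:
//   for (iv = 0; iv < max(TC0, TC1); ++iv) {
//     if (iv < TC0) body0(iv);
//     if (iv < TC1) body1(iv);
//   }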
7047
7048 void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
7049 LLVMContext &Ctx = Builder.getContext();
7050 addLoopMetadata(
7051 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
7052 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
7053}
7054
7055 void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
7056 LLVMContext &Ctx = Builder.getContext();
7057 addLoopMetadata(
7058 Loop, {
7059 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
7060 });
7061}
7062
7063void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
7064 Value *IfCond, ValueToValueMapTy &VMap,
7065 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
7066 const Twine &NamePrefix) {
7067 Function *F = CanonicalLoop->getFunction();
7068
7069 // We can't do
7070 // if (cond) {
7071 // simd_loop;
7072 // } else {
7073 // non_simd_loop;
7074 // }
7075 // because then the CanonicalLoopInfo would only point to one of the loops,
7076 // leading other constructs operating on the same loop to malfunction.
7077 // Instead generate
7078 // while (...) {
7079 // if (cond) {
7080 // simd_body;
7081 // } else {
7082 // not_simd_body;
7083 // }
7084 // }
7085 // At least for simple loops, LLVM seems able to hoist the if out of the loop
7086 // body at -O3
7087
7088 // Define where if branch should be inserted
7089 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
7090
7091 // Create additional blocks for the if statement
7092 BasicBlock *Cond = SplitBeforeIt->getParent();
7093 llvm::LLVMContext &C = Cond->getContext();
7094 BasicBlock *ThenBlock = BasicBlock::Create(
7095 C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
7096 BasicBlock *ElseBlock = BasicBlock::Create(
7097 C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
7098
7099 // Create if condition branch.
7100 Builder.SetInsertPoint(SplitBeforeIt);
7101 Instruction *BrInstr =
7102 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
7103 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
7104 // Then block contains branch to omp loop body which needs to be vectorized
7105 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
7106 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
7107
7108 Builder.SetInsertPoint(ElseBlock);
7109
7110 // Clone loop for the else branch
7111 SmallVector<BasicBlock *, 8> NewBlocks;
7112
7113 SmallVector<BasicBlock *, 8> ExistingBlocks;
7114 ExistingBlocks.reserve(L->getNumBlocks() + 1);
7115 ExistingBlocks.push_back(ThenBlock);
7116 ExistingBlocks.append(L->block_begin(), L->block_end());
7117 // Cond is the block that has the if clause condition
7118 // LoopCond is omp_loop.cond
7119 // LoopHeader is omp_loop.header
7120 BasicBlock *LoopCond = Cond->getUniquePredecessor();
7121 BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
7122 assert(LoopCond && LoopHeader && "Invalid loop structure");
7123 for (BasicBlock *Block : ExistingBlocks) {
7124 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
7125 Block == LoopHeader || Block == LoopCond || Block == Cond) {
7126 continue;
7127 }
7128 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
7129
7130 // fix name not to be omp.if.then
7131 if (Block == ThenBlock)
7132 NewBB->setName(NamePrefix + ".if.else");
7133
7134 NewBB->moveBefore(CanonicalLoop->getExit());
7135 VMap[Block] = NewBB;
7136 NewBlocks.push_back(NewBB);
7137 }
7138 remapInstructionsInBlocks(NewBlocks, VMap);
7139 Builder.CreateBr(NewBlocks.front());
7140
7141 // The loop latch must have only one predecessor. Currently it is branched to
7142 // from both the 'then' and 'else' branches.
7143 L->getLoopLatch()->splitBasicBlockBefore(L->getLoopLatch()->begin(),
7144 NamePrefix + ".pre_latch");
7145
7146 // Ensure that the then block is added to the loop so we add the attributes in
7147 // the next step
7148 L->addBasicBlockToLoop(ThenBlock, LI);
7149}
7150
7151unsigned
7152 OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
7153 const StringMap<bool> &Features) {
7154 if (TargetTriple.isX86()) {
7155 if (Features.lookup("avx512f"))
7156 return 512;
7157 else if (Features.lookup("avx"))
7158 return 256;
7159 return 128;
7160 }
7161 if (TargetTriple.isPPC())
7162 return 128;
7163 if (TargetTriple.isWasm())
7164 return 128;
7165 return 0;
7166}
7167
7168 void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
7169 MapVector<Value *, Value *> AlignedVars,
7170 Value *IfCond, OrderKind Order,
7171 ConstantInt *Simdlen, ConstantInt *Safelen) {
7172 LLVMContext &Ctx = Builder.getContext();
7173
7174 Function *F = CanonicalLoop->getFunction();
7175
7176 // Blocks must have terminators.
7177 // FIXME: Don't run analyses on incomplete/invalid IR.
7178 SmallVector<UnreachableInst *, 4> UIs;
7179 for (BasicBlock &BB : *F)
7180 if (!BB.hasTerminator())
7181 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
7182
7183 // TODO: We should not rely on pass manager. Currently we use pass manager
7184 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
7185 // object. We should have a method which returns all blocks between
7186 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
7187 FunctionAnalysisManager FAM;
7188 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
7189 FAM.registerPass([]() { return LoopAnalysis(); });
7190 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
7191
7192 LoopAnalysis LIA;
7193 LoopInfo &&LI = LIA.run(*F, FAM);
7194
7195 for (Instruction *I : UIs)
7196 I->eraseFromParent();
7197
7198 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
7199 if (AlignedVars.size()) {
7200 InsertPointTy IP = Builder.saveIP();
7201 for (auto &AlignedItem : AlignedVars) {
7202 Value *AlignedPtr = AlignedItem.first;
7203 Value *Alignment = AlignedItem.second;
7204 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
7205 Builder.SetInsertPoint(loadInst->getNextNode());
7206 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
7207 Alignment);
7208 }
7209 Builder.restoreIP(IP);
7210 }
7211
7212 if (IfCond) {
7213 ValueToValueMapTy VMap;
7214 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
7215 }
7216
7217 SmallSet<BasicBlock *, 8> Reachable;
7218
7219 // Get the basic blocks from the loop in which memref instructions
7220 // can be found.
7221 // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
7222 // preferably without running any passes.
7223 for (BasicBlock *Block : L->getBlocks()) {
7224 if (Block == CanonicalLoop->getCond() ||
7225 Block == CanonicalLoop->getHeader())
7226 continue;
7227 Reachable.insert(Block);
7228 }
7229
7230 SmallVector<Metadata *> LoopMDList;
7231
7232 // In presence of finite 'safelen', it may be unsafe to mark all
7233 // the memory instructions parallel, because loop-carried
7234 // dependences of 'safelen' iterations are possible.
7235 // If clause order(concurrent) is specified then the memory instructions
7236 // are marked parallel even if 'safelen' is finite.
7237 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent))
7238 applyParallelAccessesMetadata(CanonicalLoop, Ctx, L, LI, LoopMDList);
7239
7240 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
7241 // versions so we can't add the loop attributes in that case.
7242 if (IfCond) {
7243 // we can still add llvm.loop.parallel_access
7244 addLoopMetadata(CanonicalLoop, LoopMDList);
7245 return;
7246 }
7247
7248 // Use the above access group metadata to create loop level
7249 // metadata, which should be distinct for each loop.
7250 ConstantAsMetadata *BoolConst =
7251 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
7252 LoopMDList.push_back(MDNode::get(
7253 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
7254
7255 if (Simdlen || Safelen) {
7256 // If both simdlen and safelen clauses are specified, the value of the
7257 // simdlen parameter must be less than or equal to the value of the safelen
7258 // parameter. Therefore, use safelen only in the absence of simdlen.
7259 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
7260 LoopMDList.push_back(
7261 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
7262 ConstantAsMetadata::get(VectorizeWidth)}));
7263 }
7264
7265 addLoopMetadata(CanonicalLoop, LoopMDList);
7266}
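// For example, with simdlen(8) and no if clause, the loop latch ends up with
// metadata along these lines (a sketch):
//   !0 = distinct !{!0, !1, !2, !3}
//   !1 = !{!"llvm.loop.parallel_accesses", !ag}
//   !2 = !{!"llvm.loop.vectorize.enable", i1 true}
//   !3 = !{!"llvm.loop.vectorize.width", i32 8}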
7267
7268/// Create the TargetMachine object to query the backend for optimization
7269/// preferences.
7270///
7271/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
7272/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
7273 /// needed for the LLVM pass pipeline. We use some default options to avoid
7274/// having to pass too many settings from the frontend that probably do not
7275/// matter.
7276///
7277/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
7278/// method. If we are going to use TargetMachine for more purposes, especially
7279/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
7280 /// might be worth requiring front-ends to pass on their TargetMachine,
7281 /// or at least cache it between methods. Note that while frontends such as Clang
7282 /// have just a single main TargetMachine per translation unit, "target-cpu" and
7283 /// "target-features" that determine the TargetMachine are per-function and can
7284 /// be overridden using __attribute__((target("OPTIONS"))).
7285static std::unique_ptr<TargetMachine>
7286 createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
7287 Module *M = F->getParent();
7288
7289 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
7290 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
7291 const llvm::Triple &Triple = M->getTargetTriple();
7292
7293 std::string Error;
7294 const Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
7295 if (!TheTarget)
7296 return {};
7297
7298 TargetOptions Options;
7299 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
7300 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
7301 /*CodeModel=*/std::nullopt, OptLevel));
7302}
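// For example, a function compiled for an x86 target might carry
// "target-cpu"="skylake" and "target-features"="+avx2,+fma,..." string
// attributes; those two strings plus the module's triple are all this helper
// consumes to build the TargetMachine.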
7303
7304 /// Heuristically determine the best-performing unroll factor for \p CLI. This
7305/// depends on the target processor. We are re-using the same heuristics as the
7306/// LoopUnrollPass.
7307 static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
7308 Function *F = CLI->getFunction();
7309
7310 // Assume the user requests the most aggressive unrolling, even if the rest of
7311 // the code is optimized using a lower setting.
7312 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
7313 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
7314
7315 // Blocks must have terminators.
7316 // FIXME: Don't run analyses on incomplete/invalid IR.
7317 SmallVector<UnreachableInst *> UIs;
7318 for (BasicBlock &BB : *F)
7319 if (!BB.hasTerminator())
7320 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
7321
7322 FunctionAnalysisManager FAM;
7323 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
7324 FAM.registerPass([]() { return AssumptionAnalysis(); });
7325 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
7326 FAM.registerPass([]() { return LoopAnalysis(); });
7327 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
7328 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
7329 TargetIRAnalysis TIRA;
7330 if (TM)
7331 TIRA = TargetIRAnalysis(
7332 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
7333 FAM.registerPass([&]() { return TIRA; });
7334
7335 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
7336 ScalarEvolutionAnalysis SEA;
7337 ScalarEvolution &&SE = SEA.run(*F, FAM);
7338 DominatorTreeAnalysis DTA;
7339 DominatorTree &&DT = DTA.run(*F, FAM);
7340 LoopAnalysis LIA;
7341 LoopInfo &&LI = LIA.run(*F, FAM);
7342 AssumptionAnalysis ACT;
7343 AssumptionCache &&AC = ACT.run(*F, FAM);
7344 OptimizationRemarkEmitter ORE{F};
7345
7346 for (Instruction *I : UIs)
7347 I->eraseFromParent();
7348
7349 Loop *L = LI.getLoopFor(CLI->getHeader());
7350 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
7351
7352 TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
7353 L, SE, TTI,
7354 /*BlockFrequencyInfo=*/nullptr,
7355 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
7356 /*UserThreshold=*/std::nullopt,
7357 /*UserCount=*/std::nullopt,
7358 /*UserAllowPartial=*/true,
7359 /*UserAllowRuntime=*/true,
7360 /*UserUpperBound=*/std::nullopt,
7361 /*UserFullUnrollMaxCount=*/std::nullopt);
7362
7363 UP.Force = true;
7364
7365 // Account for additional optimizations taking place before the LoopUnrollPass
7366 // would unroll the loop.
7367 UP.Threshold *= UnrollThresholdFactor;
7368 UP.PartialThreshold *= UnrollThresholdFactor;
7369
7370 // Use normal unroll factors even if the rest of the code is optimized for
7371 // size.
7372 UP.OptSizeThreshold = UP.Threshold;
7373 UP.PartialOptSizeThreshold = UP.PartialThreshold;
7374
7375 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
7376 << " Threshold=" << UP.Threshold << "\n"
7377 << " PartialThreshold=" << UP.PartialThreshold << "\n"
7378 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
7379 << " PartialOptSizeThreshold="
7380 << UP.PartialOptSizeThreshold << "\n");
7381
7382 // Disable peeling.
7383 TargetTransformInfo::PeelingPreferences PP =
7384 gatherPeelingPreferences(L, SE, TTI,
7385 /*UserAllowPeeling=*/false,
7386 /*UserAllowProfileBasedPeeling=*/false,
7387 /*UnrollingSpecficValues=*/false);
7388
7389 SmallPtrSet<const Value *, 32> EphValues;
7390 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
7391
7392 // Assume that reads and writes to stack variables can be eliminated by
7393 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
7394 // size.
7395 for (BasicBlock *BB : L->blocks()) {
7396 for (Instruction &I : *BB) {
7397 Value *Ptr;
7398 if (auto *Load = dyn_cast<LoadInst>(&I)) {
7399 Ptr = Load->getPointerOperand();
7400 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
7401 Ptr = Store->getPointerOperand();
7402 } else
7403 continue;
7404
7405 Ptr = Ptr->stripPointerCasts();
7406
7407 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
7408 if (Alloca->getParent() == &F->getEntryBlock())
7409 EphValues.insert(&I);
7410 }
7411 }
7412 }
7413
7414 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
7415
7416 // Loop is not unrollable if the loop contains certain instructions.
7417 if (!UCE.canUnroll()) {
7418 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
7419 return 1;
7420 }
7421
7422 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
7423 << "\n");
7424
7425 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
7426 // be able to use it.
7427 int TripCount = 0;
7428 int MaxTripCount = 0;
7429 bool MaxOrZero = false;
7430 unsigned TripMultiple = 0;
7431
7432 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
7433 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP);
7434 unsigned Factor = UP.Count;
7435 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
7436
7437 // This function returns 1 to signal that the loop should not be unrolled.
7438 if (Factor == 0)
7439 return 1;
7440 return Factor;
7441}
7442
7443 void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
7444 int32_t Factor,
7445 CanonicalLoopInfo **UnrolledCLI) {
7446 assert(Factor >= 0 && "Unroll factor must not be negative");
7447
7448 Function *F = Loop->getFunction();
7449 LLVMContext &Ctx = F->getContext();
7450
7451 // If the unrolled loop is not used for another loop-associated directive, it
7452 // is sufficient to add metadata for the LoopUnrollPass.
7453 if (!UnrolledCLI) {
7454 SmallVector<Metadata *, 2> LoopMetadata;
7455 LoopMetadata.push_back(
7456 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
7457
7458 if (Factor >= 1) {
7459 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
7460 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
7461 LoopMetadata.push_back(MDNode::get(
7462 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
7463 }
7464
7465 addLoopMetadata(Loop, LoopMetadata);
7466 return;
7467 }
7468
7469 // Heuristically determine the unroll factor.
7470 if (Factor == 0)
7471 Factor = computeHeuristicUnrollFactor(Loop);
7472
7473 // No change required with unroll factor 1.
7474 if (Factor == 1) {
7475 *UnrolledCLI = Loop;
7476 return;
7477 }
7478
7479 assert(Factor >= 2 &&
7480 "unrolling only makes sense with a factor of 2 or larger");
7481
7482 Type *IndVarTy = Loop->getIndVarType();
7483
7484 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
7485 // unroll the inner loop.
7486 Value *FactorVal =
7487 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
7488 /*isSigned=*/false));
7489 std::vector<CanonicalLoopInfo *> LoopNest =
7490 tileLoops(DL, {Loop}, {FactorVal});
7491 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
7492 *UnrolledCLI = LoopNest[0];
7493 CanonicalLoopInfo *InnerLoop = LoopNest[1];
7494
7495 // LoopUnrollPass can only fully unroll loops with constant trip count.
7496 // Unroll by the unroll factor with a fallback epilog for the remainder
7497 // iterations if necessary.
7498 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
7499 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
7500 addLoopMetadata(
7501 InnerLoop,
7502 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
7503 MDNode::get(
7504 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
7505
7506#ifndef NDEBUG
7507 (*UnrolledCLI)->assertOK();
7508#endif
7509}
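// Conceptually (editor's sketch in C-like pseudocode): partially unrolling
//   for (i = 0; i < N; ++i) Body(i);
// with Factor=4 tiles it into
//   for (i = 0; i < N; i += 4)            // returned via *UnrolledCLI
//     for (j = i; j < min(i + 4, N); ++j)
//       Body(j);                          // inner tile
// where the inner tile carries llvm.loop.unroll.enable/count metadata so the
// LoopUnrollPass can fully unroll it, the tile bound acting as the epilog for
// remainder iterations.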
7510
7511 OpenMPIRBuilder::InsertPointOrErrorTy
7512 OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
7513 llvm::Value *BufSize, llvm::Value *CpyBuf,
7514 llvm::Value *CpyFn, llvm::Value *DidIt) {
7515 if (!updateToLocation(Loc))
7516 return Loc.IP;
7517
7518 uint32_t SrcLocStrSize;
7519 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7520 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7521 Value *ThreadId = getOrCreateThreadID(Ident);
7522
7523 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
7524
7525 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
7526
7527 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
7528 createRuntimeFunctionCall(Fn, Args);
7529
7530 return Builder.saveIP();
7531}
7532
7533 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle(
7534 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7535 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
7536 ArrayRef<llvm::Function *> CPFuncs) {
7537
7538 if (!updateToLocation(Loc))
7539 return Loc.IP;
7540
7541 // If needed, allocate and initialize `DidIt` with 0.
7542 // DidIt: flag variable: 1=single thread; 0=not single thread.
7543 llvm::Value *DidIt = nullptr;
7544 if (!CPVars.empty()) {
7545 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
7546 Builder.CreateStore(Builder.getInt32(0), DidIt);
7547 }
7548
7549 Directive OMPD = Directive::OMPD_single;
7550 uint32_t SrcLocStrSize;
7551 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7552 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7553 Value *ThreadId = getOrCreateThreadID(Ident);
7554 Value *Args[] = {Ident, ThreadId};
7555
7556 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
7557 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7558
7559 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
7560 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7561
7562 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
7563 if (Error Err = FiniCB(IP))
7564 return Err;
7565
7566 // The thread that executes the single region must set `DidIt` to 1.
7567 // This is used by __kmpc_copyprivate, to know if the caller is the
7568 // single thread or not.
7569 if (DidIt)
7570 Builder.CreateStore(Builder.getInt32(1), DidIt);
7571
7572 return Error::success();
7573 };
7574
7575 // generates the following:
7576 // if (__kmpc_single()) {
7577 // .... single region ...
7578 // __kmpc_end_single
7579 // }
7580 // __kmpc_copyprivate
7581 // __kmpc_barrier
7582
7583 InsertPointOrErrorTy AfterIP =
7584 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
7585 /*Conditional*/ true,
7586 /*hasFinalize*/ true);
7587 if (!AfterIP)
7588 return AfterIP.takeError();
7589
7590 if (DidIt) {
7591 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
7592 // NOTE BufSize is currently unused, so just pass 0.
7593 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
7594 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
7595 CPFuncs[I], DidIt);
7596 // NOTE __kmpc_copyprivate already inserts a barrier
7597 } else if (!IsNowait) {
7598 InsertPointOrErrorTy AfterIP =
7599 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
7600 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
7601 /* CheckCancelFlag */ false);
7602 if (!AfterIP)
7603 return AfterIP.takeError();
7604 }
7605 return Builder.saveIP();
7606}
7607
7608 OpenMPIRBuilder::InsertPointOrErrorTy
7609 OpenMPIRBuilder::createScope(const LocationDescription &Loc,
7610 BodyGenCallbackTy BodyGenCB,
7611 FinalizeCallbackTy FiniCB, bool IsNowait) {
7612
7613 if (!updateToLocation(Loc))
7614 return Loc.IP;
7615
7616 // All threads execute the scope body — no conditional entry.
7617 InsertPointOrErrorTy AfterIP = EmitOMPInlinedRegion(
7618 Directive::OMPD_scope, /*EntryCall=*/nullptr, /*ExitCall=*/nullptr,
7619 BodyGenCB, FiniCB, /*Conditional=*/false, /*HasFinalize=*/true,
7620 /*IsCancellable=*/false);
7621 if (!AfterIP)
7622 return AfterIP.takeError();
7623
7624 Builder.restoreIP(*AfterIP);
7625 if (!IsNowait) {
7626 AfterIP = createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
7627 omp::Directive::OMPD_unknown,
7628 /*ForceSimpleCall=*/false,
7629 /*CheckCancelFlag=*/false);
7630 if (!AfterIP)
7631 return AfterIP.takeError();
7632 }
7633 return Builder.saveIP();
7634}
7635
7636 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCritical(
7637 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7638 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
7639
7640 if (!updateToLocation(Loc))
7641 return Loc.IP;
7642
7643 Directive OMPD = Directive::OMPD_critical;
7644 uint32_t SrcLocStrSize;
7645 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7646 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7647 Value *ThreadId = getOrCreateThreadID(Ident);
7648 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
7649 Value *Args[] = {Ident, ThreadId, LockVar};
7650
7651 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
7652 Function *RTFn = nullptr;
7653 if (HintInst) {
7654 // Add Hint to entry Args and create call
7655 EnterArgs.push_back(HintInst);
7656 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
7657 } else {
7658 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
7659 }
7660 Instruction *EntryCall = createRuntimeFunctionCall(RTFn, EnterArgs);
7661
7662 Function *ExitRTLFn =
7663 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
7664 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7665
7666 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7667 /*Conditional*/ false, /*hasFinalize*/ true);
7668}
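// For a region named "foo" this brackets the body roughly as follows (sketch):
//   call void @__kmpc_critical(ptr @ident, i32 %tid, ptr @.gomp_critical_user_foo.var)
//   ... region body ...
//   call void @__kmpc_end_critical(ptr @ident, i32 %tid, ptr @.gomp_critical_user_foo.var)
// with __kmpc_critical_with_hint substituted on entry when a hint value is
// present.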
7669
7670 OpenMPIRBuilder::InsertPointTy
7671 OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
7672 InsertPointTy AllocaIP, unsigned NumLoops,
7673 ArrayRef<llvm::Value *> StoreValues,
7674 const Twine &Name, bool IsDependSource) {
7675 assert(
7676 llvm::all_of(StoreValues,
7677 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
7678 "OpenMP runtime requires depend vec with i64 type");
7679
7680 if (!updateToLocation(Loc))
7681 return Loc.IP;
7682
7683 // Allocate space for vector and generate alloc instruction.
7684 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
7685 Builder.restoreIP(AllocaIP);
7686 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
7687 ArgsBase->setAlignment(Align(8));
7688 Builder.restoreIP(Loc.IP);
7689
7690 // Store the index value with offset in depend vector.
7691 for (unsigned I = 0; I < NumLoops; ++I) {
7692 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
7693 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
7694 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
7695 STInst->setAlignment(Align(8));
7696 }
7697
7698 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
7699 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
7700
7701 uint32_t SrcLocStrSize;
7702 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7703 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7704 Value *ThreadId = getOrCreateThreadID(Ident);
7705 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
7706
7707 Function *RTLFn = nullptr;
7708 if (IsDependSource)
7709 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
7710 else
7711 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
7712 createRuntimeFunctionCall(RTLFn, Args);
7713
7714 return Builder.saveIP();
7715}
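// Usage sketch (assumed caller behaviour): for '#pragma omp ordered
// depend(source)' in a loop with ordered(1), the frontend passes the current
// iteration as a single i64 store value with IsDependSource=true, yielding a
// __kmpc_doacross_post call; 'depend(sink: i-1)' passes {i-1} with
// IsDependSource=false and yields __kmpc_doacross_wait instead.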
7716
7717 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createOrderedThreadsSimd(
7718 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7719 FinalizeCallbackTy FiniCB, bool IsThreads) {
7720 if (!updateToLocation(Loc))
7721 return Loc.IP;
7722
7723 Directive OMPD = Directive::OMPD_ordered;
7724 Instruction *EntryCall = nullptr;
7725 Instruction *ExitCall = nullptr;
7726
7727 if (IsThreads) {
7728 uint32_t SrcLocStrSize;
7729 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7730 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7731 Value *ThreadId = getOrCreateThreadID(Ident);
7732 Value *Args[] = {Ident, ThreadId};
7733
7734 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
7735 EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7736
7737 Function *ExitRTLFn =
7738 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
7739 ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7740 }
7741
7742 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7743 /*Conditional*/ false, /*hasFinalize*/ true);
7744}
7745
7746OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
7747 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
7748 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
7749 bool HasFinalize, bool IsCancellable) {
7750
7751 if (HasFinalize)
7752 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
7753
7754 // Create inlined region's entry and body blocks, in preparation
7755 // for conditional creation.
7756 BasicBlock *EntryBB = Builder.GetInsertBlock();
7757 Instruction *SplitPos = EntryBB->getTerminatorOrNull();
7758 if (!isa_and_nonnull<BranchInst>(SplitPos))
7759 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
7760 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
7761 BasicBlock *FiniBB =
7762 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
7763
7764 Builder.SetInsertPoint(EntryBB->getTerminator());
7765 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
7766
7767 // generate body
7768 if (Error Err =
7769 BodyGenCB(/* AllocaIP */ InsertPointTy(),
7770 /* CodeGenIP */ Builder.saveIP(), /* DeallocBlocks */ {}))
7771 return Err;
7772
7773 // emit exit call and do any needed finalization.
7774 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
7775 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
7776 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
7777 "Unexpected control flow graph state!!");
7778 InsertPointOrErrorTy AfterIP =
7779 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
7780 if (!AfterIP)
7781 return AfterIP.takeError();
7782
7783 // If we are skipping the region of a non-conditional, remove the exit
7784 // block, and clear the builder's insertion point.
7785 assert(SplitPos->getParent() == ExitBB &&
7786 "Unexpected Insertion point location!");
7787 auto merged = MergeBlockIntoPredecessor(ExitBB);
7788 BasicBlock *ExitPredBB = SplitPos->getParent();
7789 auto InsertBB = merged ? ExitPredBB : ExitBB;
7790 if (!isa_and_nonnull<BranchInst>(SplitPos))
7791 SplitPos->eraseFromParent();
7792 Builder.SetInsertPoint(InsertBB);
7793
7794 return Builder.saveIP();
7795}
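// For a conditional directive such as 'single', the resulting CFG is roughly
// (sketch):
//   entry:               %r = call i32 @__kmpc_single(...)
//                        br i1 (%r != 0), label %omp_region.body, label %omp_region.end
//   omp_region.body:     ... BodyGenCB output ...
//   omp_region.finalize: FiniCB output; call void @__kmpc_end_single(...)
//   omp_region.end:      ...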
7796
7797OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
7798 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
7799 // If there is nothing to do, return the current insertion point.
7800 if (!Conditional || !EntryCall)
7801 return Builder.saveIP();
7802
7803 BasicBlock *EntryBB = Builder.GetInsertBlock();
7804 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
7805 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
7806 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
7807
7808 // Emit thenBB and set the Builder's insertion point there for
7809 // body generation next. Place the block after the current block.
7810 Function *CurFn = EntryBB->getParent();
7811 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
7812
7813 // Move Entry branch to end of ThenBB, and replace with conditional
7814 // branch (If-stmt)
7815 Instruction *EntryBBTI = EntryBB->getTerminator();
7816 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
7817 EntryBBTI->removeFromParent();
7818 Builder.SetInsertPoint(UI);
7819 Builder.Insert(EntryBBTI);
7820 UI->eraseFromParent();
7821 Builder.SetInsertPoint(ThenBB->getTerminator());
7822
7823 // return an insertion point to ExitBB.
7824 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
7825}
7826
7827OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
7828 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
7829 bool HasFinalize) {
7830
7831 Builder.restoreIP(FinIP);
7832
7833 // If there is finalization to do, emit it before the exit call
7834 if (HasFinalize) {
7835 assert(!FinalizationStack.empty() &&
7836 "Unexpected finalization stack state!");
7837
7838 FinalizationInfo Fi = FinalizationStack.pop_back_val();
7839 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
7840
7841 if (Error Err = Fi.mergeFiniBB(Builder, FinIP.getBlock()))
7842 return std::move(Err);
7843
7844 // Exit condition: insertion point is before the terminator of the new Fini
7845 // block
7846 Builder.SetInsertPoint(FinIP.getBlock()->getTerminator());
7847 }
7848
7849 if (!ExitCall)
7850 return Builder.saveIP();
7851
7852 // Place the exit call as the last instruction before the finalization block terminator.
7853 ExitCall->removeFromParent();
7854 Builder.Insert(ExitCall);
7855
7856 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
7857 ExitCall->getIterator());
7858}
7859
7860 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
7861 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
7862 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
7863 if (!IP.isSet())
7864 return IP;
7865
7867
7868 // creates the following CFG structure
7869 // OMP_Entry : (MasterAddr != PrivateAddr)?
7870 // F T
7871 // | \
7872 // | copyin.not.master
7873 // | /
7874 // v /
7875 // copyin.not.master.end
7876 // |
7877 // v
7878 // OMP.Entry.Next
7879
7880 BasicBlock *OMP_Entry = IP.getBlock();
7881 Function *CurFn = OMP_Entry->getParent();
7882 BasicBlock *CopyBegin =
7883 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
7884 BasicBlock *CopyEnd = nullptr;
7885
7886 // If entry block is terminated, split to preserve the branch to following
7887 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
7889 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
7890 "copyin.not.master.end");
7891 OMP_Entry->getTerminator()->eraseFromParent();
7892 } else {
7893 CopyEnd =
7894 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
7895 }
7896
7897 Builder.SetInsertPoint(OMP_Entry);
7898 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
7899 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
7900 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
7901 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
7902
7903 Builder.SetInsertPoint(CopyBegin);
7904 if (BranchtoEnd)
7905 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
7906
7907 return Builder.saveIP();
7908}
7909
7910 CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
7911 Value *Size, Value *Allocator,
7912 std::string Name) {
7913 IRBuilder<>::InsertPointGuard IPG(Builder);
7914 if (!updateToLocation(Loc))
7915 return nullptr;
7916
7917 uint32_t SrcLocStrSize;
7918 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7919 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7920 Value *ThreadId = getOrCreateThreadID(Ident);
7921 Value *Args[] = {ThreadId, Size, Allocator};
7922
7923 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
7924
7925 return createRuntimeFunctionCall(Fn, Args, Name);
7926}
7927
7929 Value *Align, Value *Size,
7930 Value *Allocator,
7931 std::string Name) {
7932 IRBuilder<>::InsertPointGuard IPG(Builder);
7933 if (!updateToLocation(Loc))
7934 return nullptr;
7935
7936 uint32_t SrcLocStrSize;
7937 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7938 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7939 Value *ThreadId = getOrCreateThreadID(Ident);
7940 Value *Args[] = {ThreadId, Align, Size, Allocator};
7941
7942 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_aligned_alloc);
7943
7944 return Builder.CreateCall(Fn, Args, Name);
7945}
7946
7947 CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
7948 Value *Addr, Value *Allocator,
7949 std::string Name) {
7950 IRBuilder<>::InsertPointGuard IPG(Builder);
7951 if (!updateToLocation(Loc))
7952 return nullptr;
7953
7954 uint32_t SrcLocStrSize;
7955 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7956 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7957 Value *ThreadId = getOrCreateThreadID(Ident);
7958 Value *Args[] = {ThreadId, Addr, Allocator};
7959 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
7960 return createRuntimeFunctionCall(Fn, Args, Name);
7961}
7962
7963 CallInst *OpenMPIRBuilder::createOMPAllocShared(const LocationDescription &Loc,
7964 Value *Size,
7965 const Twine &Name) {
7966 IRBuilder<>::InsertPointGuard IPG(Builder);
7967 updateToLocation(Loc);
7968
7969 Value *Args[] = {Size};
7970 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc_shared);
7971 CallInst *Call = Builder.CreateCall(Fn, Args, Name);
7972 Call->addRetAttr(Attribute::getWithAlignment(
7973 M.getContext(), M.getDataLayout().getPrefTypeAlign(Int64)));
7974 return Call;
7975}
7976
7978 Type *VarType,
7979 const Twine &Name) {
7980 return createOMPAllocShared(
7981 Loc, Builder.getInt64(M.getDataLayout().getTypeAllocSize(VarType)), Name);
7982}
7983
7984 CallInst *OpenMPIRBuilder::createOMPFreeShared(const LocationDescription &Loc,
7985 Value *Addr, Value *Size,
7986 const Twine &Name) {
7987 IRBuilder<>::InsertPointGuard IPG(Builder);
7988 updateToLocation(Loc);
7989
7990 Value *Args[] = {Addr, Size};
7991 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free_shared);
7992 return Builder.CreateCall(Fn, Args, Name);
7993}
7994
7996 Value *Addr, Type *VarType,
7997 const Twine &Name) {
7998 return createOMPFreeShared(
7999 Loc, Addr, Builder.getInt64(M.getDataLayout().getTypeAllocSize(VarType)),
8000 Name);
8001}
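// Usage sketch (hypothetical caller code; names "x" and Ty are illustrative):
// managing a device-shared variable through this pair of helpers:
//   Value *X = OMPBuilder.createOMPAllocShared(Loc, Ty, "x");
//   ... code that uses X ...
//   OMPBuilder.createOMPFreeShared(Loc, X, Ty, "x_free");
// Both type-based overloads derive the byte size from the module's DataLayout
// as above.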
8002
8003 CallInst *OpenMPIRBuilder::createOMPInteropInit(
8004 const LocationDescription &Loc, Value *InteropVar,
8005 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
8006 Value *DependenceAddress, bool HaveNowaitClause) {
8007 IRBuilder<>::InsertPointGuard IPG(Builder);
8008 updateToLocation(Loc);
8009
8010 uint32_t SrcLocStrSize;
8011 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8012 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8013 Value *ThreadId = getOrCreateThreadID(Ident);
8014 if (Device == nullptr)
8015 Device = Constant::getAllOnesValue(Int32);
8016 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
8017 if (NumDependences == nullptr) {
8018 NumDependences = ConstantInt::get(Int32, 0);
8019 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
8020 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
8021 }
8022 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
8023 Value *Args[] = {
8024 Ident, ThreadId, InteropVar, InteropTypeVal,
8025 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
8026
8027 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
8028
8029 return createRuntimeFunctionCall(Fn, Args);
8030}
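// For example, when neither a device nor a depend clause is present, the
// defaulting above collapses the call to (sketch):
//   __tgt_interop_init(ident, tid, %interop_var, interop_type, /*Device=*/-1,
//                      /*NumDependences=*/0, /*DependenceAddress=*/null,
//                      /*HaveNowaitClause=*/0)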
8031
8032 CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
8033 const LocationDescription &Loc, Value *InteropVar, Value *Device,
8034 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
8035 IRBuilder<>::InsertPointGuard IPG(Builder);
8036 updateToLocation(Loc);
8037
8038 uint32_t SrcLocStrSize;
8039 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8040 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8041 Value *ThreadId = getOrCreateThreadID(Ident);
8042 if (Device == nullptr)
8043 Device = Constant::getAllOnesValue(Int32);
8044 if (NumDependences == nullptr) {
8045 NumDependences = ConstantInt::get(Int32, 0);
8046 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
8047 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
8048 }
8049 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
8050 Value *Args[] = {
8051 Ident, ThreadId, InteropVar, Device,
8052 NumDependences, DependenceAddress, HaveNowaitClauseVal};
8053
8054 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
8055
8056 return createRuntimeFunctionCall(Fn, Args);
8057}
8058
8059 CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
8060 Value *InteropVar, Value *Device,
8061 Value *NumDependences,
8062 Value *DependenceAddress,
8063 bool HaveNowaitClause) {
8064 IRBuilder<>::InsertPointGuard IPG(Builder);
8065 updateToLocation(Loc);
8066 uint32_t SrcLocStrSize;
8067 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8068 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8069 Value *ThreadId = getOrCreateThreadID(Ident);
8070 if (Device == nullptr)
8071 Device = Constant::getAllOnesValue(Int32);
8072 if (NumDependences == nullptr) {
8073 NumDependences = ConstantInt::get(Int32, 0);
8074 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
8075 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
8076 }
8077 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
8078 Value *Args[] = {
8079 Ident, ThreadId, InteropVar, Device,
8080 NumDependences, DependenceAddress, HaveNowaitClauseVal};
8081
8082 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
8083
8084 return createRuntimeFunctionCall(Fn, Args);
8085}
8086
8087 CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
8088 const LocationDescription &Loc, llvm::Value *Pointer,
8089 llvm::ConstantInt *Size, const llvm::Twine &Name) {
8090 IRBuilder<>::InsertPointGuard IPG(Builder);
8091 updateToLocation(Loc);
8092
8093 uint32_t SrcLocStrSize;
8094 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8095 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8096 Value *ThreadId = getOrCreateThreadID(Ident);
8097 Constant *ThreadPrivateCache =
8098 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
8099 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
8100
8101 Function *Fn =
8102 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
8103
8104 return createRuntimeFunctionCall(Fn, Args);
8105}
8106
8107 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
8108 const LocationDescription &Loc,
8109 const TargetKernelDefaultAttrs &Attrs) {
8110 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
8111 "expected num_threads and num_teams to be specified");
8112
8113 if (!updateToLocation(Loc))
8114 return Loc.IP;
8115
8116 uint32_t SrcLocStrSize;
8117 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8118 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8119 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
8120 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
8121 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD &&
8122 Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD_NO_LOOP);
8123 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
8124 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
8125
8126 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
8127 Function *Kernel = DebugKernelWrapper;
8128
8129 // We need to strip the debug prefix to get the correct kernel name.
8130 StringRef KernelName = Kernel->getName();
8131 const std::string DebugPrefix = "_debug__";
8132 if (KernelName.ends_with(DebugPrefix)) {
8133 KernelName = KernelName.drop_back(DebugPrefix.length());
8134 Kernel = M.getFunction(KernelName);
8135 assert(Kernel && "Expected the real kernel to exist");
8136 }
8137
8138 // Manifest the launch configuration in the metadata matching the kernel
8139 // environment.
8140 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
8141 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
8142
8143 // If MaxThreads not set, select the maximum between the default workgroup
8144 // size and the MinThreads value.
8145 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
8146 if (MaxThreadsVal < 0) {
8147 if (hasGridValue(T)) {
8148 MaxThreadsVal =
8149 std::max(int32_t(getGridValue(T, Kernel).GV_Default_WG_Size),
8150 Attrs.MinThreads);
8151 } else {
8152 MaxThreadsVal = Attrs.MinThreads;
8153 }
8154 }
8155
8156 if (MaxThreadsVal > 0)
8157 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
8158
8159 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
8160 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
8161 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
8162 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
8163 Constant *ReductionDataSize =
8164 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
8165 Constant *ReductionBufferLength =
8166 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
8167
8168 Function *Fn = getOrCreateRuntimeFunctionPtr(
8169 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
8170 const DataLayout &DL = Fn->getDataLayout();
8171
8172 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
8173 Constant *DynamicEnvironmentInitializer =
8174 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
8175 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
8176 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
8177 DynamicEnvironmentInitializer, DynamicEnvironmentName,
8178 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
8179 DL.getDefaultGlobalsAddressSpace());
8180 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
8181
8182 Constant *DynamicEnvironment =
8183 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
8184 ? DynamicEnvironmentGV
8185 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
8186 DynamicEnvironmentPtr);
8187
8188 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
8189 ConfigurationEnvironment, {
8190 UseGenericStateMachineVal,
8191 MayUseNestedParallelismVal,
8192 IsSPMDVal,
8193 MinThreads,
8194 MaxThreads,
8195 MinTeams,
8196 MaxTeams,
8197 ReductionDataSize,
8198 ReductionBufferLength,
8199 });
8200 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
8201 KernelEnvironment, {
8202 ConfigurationEnvironmentInitializer,
8203 Ident,
8204 DynamicEnvironment,
8205 });
8206 std::string KernelEnvironmentName =
8207 (KernelName + "_kernel_environment").str();
8208 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
8209 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
8210 KernelEnvironmentInitializer, KernelEnvironmentName,
8211 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
8212 DL.getDefaultGlobalsAddressSpace());
8213 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
8214
8215 Constant *KernelEnvironment =
8216 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
8217 ? KernelEnvironmentGV
8218 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
8219 KernelEnvironmentPtr);
8220 Value *KernelLaunchEnvironment =
8221 DebugKernelWrapper->getArg(DebugKernelWrapper->arg_size() - 1);
8222 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
8223 KernelLaunchEnvironment =
8224 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
8225 ? KernelLaunchEnvironment
8226 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
8227 KernelLaunchEnvParamTy);
8228 CallInst *ThreadKind = createRuntimeFunctionCall(
8229 Fn, {KernelEnvironment, KernelLaunchEnvironment});
8230
8231 Value *ExecUserCode = Builder.CreateICmpEQ(
8232 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
8233 "exec_user_code");
8234
8235 // ThreadKind = __kmpc_target_init(...)
8236 // if (ThreadKind == -1)
8237 // user_code
8238 // else
8239 // return;
8240
8241 auto *UI = Builder.CreateUnreachable();
8242 BasicBlock *CheckBB = UI->getParent();
8243 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
8244
8245 BasicBlock *WorkerExitBB = BasicBlock::Create(
8246 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
8247 Builder.SetInsertPoint(WorkerExitBB);
8248 Builder.CreateRetVoid();
8249
8250 auto *CheckBBTI = CheckBB->getTerminator();
8251 Builder.SetInsertPoint(CheckBBTI);
8252 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
8253
8254 CheckBBTI->eraseFromParent();
8255 UI->eraseFromParent();
8256
8257 // Continue in the "user_code" block, see diagram above and in
8258 // openmp/libomptarget/deviceRTLs/common/include/target.h .
8259 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
8260}
8261
8262 void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
8263 int32_t TeamsReductionDataSize,
8264 int32_t TeamsReductionBufferLength) {
8265 if (!updateToLocation(Loc))
8266 return;
8267
8268 Function *Fn = getOrCreateRuntimeFunctionPtr(
8269 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
8270
8271 createRuntimeFunctionCall(Fn, {});
8272
8273 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
8274 return;
8275
8276 Function *Kernel = Builder.GetInsertBlock()->getParent();
8277 // We need to strip the debug prefix to get the correct kernel name.
8278 StringRef KernelName = Kernel->getName();
8279 const std::string DebugPrefix = "_debug__";
8280 if (KernelName.ends_with(DebugPrefix))
8281 KernelName = KernelName.drop_back(DebugPrefix.length());
8282 auto *KernelEnvironmentGV =
8283 M.getNamedGlobal((KernelName + "_kernel_environment").str());
8284 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
8285 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
8286 auto *NewInitializer = ConstantFoldInsertValueInstruction(
8287 KernelEnvironmentInitializer,
8288 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
8289 NewInitializer = ConstantFoldInsertValueInstruction(
8290 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
8291 {0, 8});
8292 KernelEnvironmentGV->setInitializer(NewInitializer);
8293}
8294
8295static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
8296 bool Min) {
8297 if (Kernel.hasFnAttribute(Name)) {
8298 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
8299 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
8300 }
8301 Kernel.addFnAttr(Name, llvm::utostr(Value));
8302}
8303
8304std::pair<int32_t, int32_t>
8305 OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
8306 int32_t ThreadLimit =
8307 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
8308
8309 if (T.isAMDGPU()) {
8310 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
8311 if (!Attr.isValid() || !Attr.isStringAttribute())
8312 return {0, ThreadLimit};
8313 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
8314 int32_t LB, UB;
8315 if (!llvm::to_integer(UBStr, UB, 10))
8316 return {0, ThreadLimit};
8317 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
8318 if (!llvm::to_integer(LBStr, LB, 10))
8319 return {0, UB};
8320 return {LB, UB};
8321 }
8322
8323 if (Kernel.hasFnAttribute(NVVMAttr::MaxNTID)) {
8324 int32_t UB = Kernel.getFnAttributeAsParsedInteger(NVVMAttr::MaxNTID);
8325 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
8326 }
8327 return {0, ThreadLimit};
8328}
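// For example, on AMDGPU an "amdgpu-flat-work-group-size"="1,256" attribute
// combined with omp_target_thread_limit=128 yields {1, 128}: the attribute's
// upper bound is clamped by the thread limit, while a missing or malformed
// attribute falls back to {0, ThreadLimit}.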
8329
8330 void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
8331 Function &Kernel, int32_t LB,
8332 int32_t UB) {
8333 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
8334
8335 if (T.isAMDGPU()) {
8336 Kernel.addFnAttr("amdgpu-flat-work-group-size",
8337 llvm::utostr(LB) + "," + llvm::utostr(UB));
8338 return;
8339 }
8340
8341 updateNVPTXAttr(Kernel, NVVMAttr::MaxNTID, UB, true);
8342}
8343
8344std::pair<int32_t, int32_t>
8345 OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
8346 // TODO: Read from backend annotations if available.
8347 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
8348}
8349
8350 void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
8351 int32_t LB, int32_t UB) {
8352 if (T.isNVPTX())
8353 if (UB > 0)
8354 updateNVPTXAttr(Kernel, NVVMAttr::MaxClusterRank, UB, true);
8355 if (T.isAMDGPU())
8356 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
8357
8358 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
8359}
8360
8361void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
8362 Function *OutlinedFn) {
8363 if (Config.isTargetDevice()) {
8364 OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
8365 // TODO: Determine if DSO local can be set to true.
8366 OutlinedFn->setDSOLocal(false);
8367 OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
8368 if (T.isAMDGCN())
8369 OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
8370 else if (T.isNVPTX())
8371 OutlinedFn->setCallingConv(CallingConv::PTX_Kernel);
8372 else if (T.isSPIRV())
8373 OutlinedFn->setCallingConv(CallingConv::SPIR_KERNEL);
8374 }
8375}
8376
8377Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
8378 StringRef EntryFnIDName) {
8379 if (Config.isTargetDevice()) {
8380 assert(OutlinedFn && "The outlined function must exist if embedded");
8381 return OutlinedFn;
8382 }
8383
8384 return new GlobalVariable(
8385 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
8386 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
8387}
8388
8389Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
8390 StringRef EntryFnName) {
8391 if (OutlinedFn)
8392 return OutlinedFn;
8393
8394 assert(!M.getGlobalVariable(EntryFnName, true) &&
8395 "Named kernel already exists?");
8396 return new GlobalVariable(
8397 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
8398 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
8399}
8400
8401 Error OpenMPIRBuilder::emitTargetRegionFunction(
8402 TargetRegionEntryInfo &EntryInfo,
8403 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
8404 Function *&OutlinedFn, Constant *&OutlinedFnID) {
8405
8406 SmallString<64> EntryFnName;
8407 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
8408
8409 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
8410 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
8411 if (!CBResult)
8412 return CBResult.takeError();
8413 OutlinedFn = *CBResult;
8414 } else {
8415 OutlinedFn = nullptr;
8416 }
8417
8418 // If this target outline function is not an offload entry, we don't need to
8419 // register it. This may be in the case of a false if clause, or if there are
8420 // no OpenMP targets.
8421 if (!IsOffloadEntry)
8422 return Error::success();
8423
8424 std::string EntryFnIDName =
8425 Config.isTargetDevice()
8426 ? std::string(EntryFnName)
8427 : createPlatformSpecificName({EntryFnName, "region_id"});
8428
8429 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
8430 EntryFnName, EntryFnIDName);
8431 return Error::success();
8432}
8433
8434 Constant *OpenMPIRBuilder::registerTargetRegionFunction(
8435 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
8436 StringRef EntryFnName, StringRef EntryFnIDName) {
8437 if (OutlinedFn)
8438 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
8439 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
8440 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
8441 OffloadInfoManager.registerTargetRegionEntryInfo(
8442 EntryInfo, EntryAddr, OutlinedFnID,
8443 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
8444 return OutlinedFnID;
8445}
8446
8447 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
8448 const LocationDescription &Loc, InsertPointTy AllocaIP,
8449 InsertPointTy CodeGenIP, ArrayRef<BasicBlock *> DeallocBlocks,
8450 Value *DeviceID, Value *IfCond, TargetDataInfo &Info,
8451 GenMapInfoCallbackTy GenMapInfoCB, CustomMapperCallbackTy CustomMapperCB,
8452 omp::RuntimeFunction *MapperFunc,
8453 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
8454 BodyGenTy BodyGenType)>
8455 BodyGenCB,
8456 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
8457 if (!updateToLocation(Loc))
8458 return InsertPointTy();
8459
8460 Builder.restoreIP(CodeGenIP);
8461
8462 bool IsStandAlone = !BodyGenCB;
8463 MapInfosTy *MapInfo;
8464 // Generate the code for the opening of the data environment. Capture all the
8465 // arguments of the runtime call by reference because they are used in the
8466 // closing of the region.
8467 auto BeginThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
8468 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
8469 MapInfo = &GenMapInfoCB(Builder.saveIP());
8470 if (Error Err = emitOffloadingArrays(
8471 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
8472 /*IsNonContiguous=*/true, DeviceAddrCB))
8473 return Err;
8474
8475 TargetDataRTArgs RTArgs;
8476 emitOffloadingArraysArgument(Builder, RTArgs, Info);
8477
8478 // Emit the number of elements in the offloading arrays.
8479 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
8480
8481 // Source location for the ident struct
8482 if (!SrcLocInfo) {
8483 uint32_t SrcLocStrSize;
8484 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8485 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8486 }
8487
8488 SmallVector<llvm::Value *, 13> OffloadingArgs = {
8489 SrcLocInfo, DeviceID,
8490 PointerNum, RTArgs.BasePointersArray,
8491 RTArgs.PointersArray, RTArgs.SizesArray,
8492 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
8493 RTArgs.MappersArray};
8494
8495 if (IsStandAlone) {
8496 assert(MapperFunc && "MapperFunc missing for standalone target data");
8497
8498 auto TaskBodyCB = [&](Value *, Value *,
8499 IRBuilderBase::InsertPoint) -> Error {
8500 if (Info.HasNoWait) {
8501 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
8502 llvm::Constant::getNullValue(VoidPtr),
8503 llvm::Constant::getNullValue(Int32),
8504 llvm::Constant::getNullValue(VoidPtr)});
8505 }
8506
8507 createRuntimeFunctionCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
8508 OffloadingArgs);
8509
8510 if (Info.HasNoWait) {
8511 BasicBlock *OffloadContBlock =
8512 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
8513 Function *CurFn = Builder.GetInsertBlock()->getParent();
8514 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
8515 Builder.restoreIP(Builder.saveIP());
8516 }
8517 return Error::success();
8518 };
8519
8520 bool RequiresOuterTargetTask = Info.HasNoWait;
8521 if (!RequiresOuterTargetTask)
8522 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
8523 /*TargetTaskAllocaIP=*/{}));
8524 else
8525 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
8526 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
8527 } else {
8528 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
8529 omp::OMPRTL___tgt_target_data_begin_mapper);
8530
8531 createRuntimeFunctionCall(BeginMapperFunc, OffloadingArgs);
8532
8533 for (auto DeviceMap : Info.DevicePtrInfoMap) {
8534 if (isa<AllocaInst>(DeviceMap.second.second)) {
8535 auto *LI =
8536 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
8537 Builder.CreateStore(LI, DeviceMap.second.second);
8538 }
8539 }
8540
8541 // If device pointer privatization is required, emit the body of the
8542 // region here. It will have to be duplicated: with and without
8543 // privatization.
8544 InsertPointOrErrorTy AfterIP =
8545 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
8546 if (!AfterIP)
8547 return AfterIP.takeError();
8548 Builder.restoreIP(*AfterIP);
8549 }
8550 return Error::success();
8551 };
8552
8553 // If we need device pointer privatization, we need to emit the body of the
8554 // region with no privatization in the 'else' branch of the conditional.
8555 // Otherwise, we don't have to do anything.
8556 auto BeginElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
8557 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
8558 InsertPointOrErrorTy AfterIP =
8559 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
8560 if (!AfterIP)
8561 return AfterIP.takeError();
8562 Builder.restoreIP(*AfterIP);
8563 return Error::success();
8564 };
8565
8566 // Generate code for the closing of the data region.
8567 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
8568 ArrayRef<BasicBlock *> DeallocBlocks) {
8569 TargetDataRTArgs RTArgs;
8570 Info.EmitDebug = !MapInfo->Names.empty();
8571 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
8572
8573 // Emit the number of elements in the offloading arrays.
8574 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
8575
8576 // Source location for the ident struct
8577 if (!SrcLocInfo) {
8578 uint32_t SrcLocStrSize;
8579 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8580 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8581 }
8582
8583 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
8584 PointerNum, RTArgs.BasePointersArray,
8585 RTArgs.PointersArray, RTArgs.SizesArray,
8586 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
8587 RTArgs.MappersArray};
8588 Function *EndMapperFunc =
8589 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
8590
8591 createRuntimeFunctionCall(EndMapperFunc, OffloadingArgs);
8592 return Error::success();
8593 };
8594
8595 // We don't have to do anything to close the region if the if clause evaluates
8596 // to false.
8597 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
8598 ArrayRef<BasicBlock *> DeallocBlocks) {
8599 return Error::success();
8600 };
8601
8602 Error Err = [&]() -> Error {
8603 if (BodyGenCB) {
8604 Error Err = [&]() {
8605 if (IfCond)
8606 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
8607 return BeginThenGen(AllocaIP, Builder.saveIP(), DeallocBlocks);
8608 }();
8609
8610 if (Err)
8611 return Err;
8612
8613 // If we don't require privatization of device pointers, we emit the body
8614 // in between the runtime calls. This avoids duplicating the body code.
8615 InsertPointOrErrorTy AfterIP =
8616 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
8617 if (!AfterIP)
8618 return AfterIP.takeError();
8619 restoreIPandDebugLoc(Builder, *AfterIP);
8620
8621 if (IfCond)
8622 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
8623 return EndThenGen(AllocaIP, Builder.saveIP(), DeallocBlocks);
8624 }
8625 if (IfCond)
8626 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
8627 return BeginThenGen(AllocaIP, Builder.saveIP(), DeallocBlocks);
8628 }();
8629
8630 if (Err)
8631 return Err;
8632
8633 return Builder.saveIP();
8634}
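// In outline (sketch), a 'target data' region with a body, no 'if' clause,
// and no device-pointer privatization becomes:
//   call void @__tgt_target_data_begin_mapper(ptr @ident, i64 %device, i32 %n, ...)
//   ... BodyGenCB output (BodyGenTy::NoPriv) ...
//   call void @__tgt_target_data_end_mapper(ptr @ident, i64 %device, i32 %n, ...)
// where the trailing arguments are the base-pointer/pointer/size/map-type
// arrays built by emitOffloadingArrays above.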
8635
8636 FunctionCallee OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize,
8637 bool IVSigned,
8638 bool IsGPUDistribute) {
8639 assert((IVSize == 32 || IVSize == 64) &&
8640 "IV size is not compatible with the omp runtime");
8641 RuntimeFunction Name;
8642 if (IsGPUDistribute)
8643 Name = IVSize == 32
8644 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
8645 : omp::OMPRTL___kmpc_distribute_static_init_4u)
8646 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
8647 : omp::OMPRTL___kmpc_distribute_static_init_8u);
8648 else
8649 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
8650 : omp::OMPRTL___kmpc_for_static_init_4u)
8651 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
8652 : omp::OMPRTL___kmpc_for_static_init_8u);
8653
8654 return getOrCreateRuntimeFunction(M, Name);
8655}
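// For example, IVSize=64 with IVSigned=false selects
// __kmpc_for_static_init_8u, and the same induction variable under a GPU
// distribute construct selects __kmpc_distribute_static_init_8u.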
8656
8657 FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
8658 bool IVSigned) {
8659 assert((IVSize == 32 || IVSize == 64) &&
8660 "IV size is not compatible with the omp runtime");
8661 RuntimeFunction Name = IVSize == 32
8662 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
8663 : omp::OMPRTL___kmpc_dispatch_init_4u)
8664 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
8665 : omp::OMPRTL___kmpc_dispatch_init_8u);
8666
8667 return getOrCreateRuntimeFunction(M, Name);
8668}
8669
8670 FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
8671 bool IVSigned) {
8672 assert((IVSize == 32 || IVSize == 64) &&
8673 "IV size is not compatible with the omp runtime");
8674 RuntimeFunction Name = IVSize == 32
8675 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
8676 : omp::OMPRTL___kmpc_dispatch_next_4u)
8677 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
8678 : omp::OMPRTL___kmpc_dispatch_next_8u);
8679
8680 return getOrCreateRuntimeFunction(M, Name);
8681}
8682
8683 FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
8684 bool IVSigned) {
8685 assert((IVSize == 32 || IVSize == 64) &&
8686 "IV size is not compatible with the omp runtime");
8687 RuntimeFunction Name = IVSize == 32
8688 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
8689 : omp::OMPRTL___kmpc_dispatch_fini_4u)
8690 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
8691 : omp::OMPRTL___kmpc_dispatch_fini_8u);
8692
8693 return getOrCreateRuntimeFunction(M, Name);
8694}
8695
8696 FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() {
8697 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
8698}
8699
8700 static void FixupDebugInfoForOutlinedFunction(
8701 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
8702 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
8703
8704 DISubprogram *NewSP = Func->getSubprogram();
8705 if (!NewSP)
8706 return;
8707
8708 DenseMap<DILocalVariable *, DILocalVariable *> RemappedVariables;
8709
8710 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
8711 DILocalVariable *&NewVar = RemappedVariables[OldVar];
8712 // Only use cached variable if the arg number matches. This is important
8713 // so that DIVariable created for privatized variables are not discarded.
8714 if (NewVar && (arg == NewVar->getArg()))
8715 return NewVar;
8716
8717 NewVar = llvm::DILocalVariable::get(
8718 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
8719 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
8720 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
8721 return NewVar;
8722 };
8723
8724 auto UpdateDebugRecord = [&](auto *DR) {
8725 DILocalVariable *OldVar = DR->getVariable();
8726 unsigned ArgNo = 0;
8727 for (auto Loc : DR->location_ops()) {
8728 auto Iter = ValueReplacementMap.find(Loc);
8729 if (Iter != ValueReplacementMap.end()) {
8730 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
8731 ArgNo = std::get<1>(Iter->second) + 1;
8732 }
8733 }
8734 if (ArgNo != 0)
8735 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
8736 };
8737 SmallVector<DbgVariableRecord *> DVRsToDelete;
8737
8739 auto MoveDebugRecordToCorrectBlock = [&](DbgVariableRecord *DVR) {
8740 if (DVR->getNumVariableLocationOps() != 1u) {
8741 DVR->setKillLocation();
8742 return;
8743 }
8744 Value *Loc = DVR->getVariableLocationOp(0u);
8745 BasicBlock *CurBB = DVR->getParent();
8746 BasicBlock *RequiredBB = nullptr;
8747
8748 if (Instruction *LocInst = dyn_cast<Instruction>(Loc))
8749 RequiredBB = LocInst->getParent();
8750 else if (isa<llvm::Argument>(Loc))
8751 RequiredBB = &DVR->getFunction()->getEntryBlock();
8752
8753 if (RequiredBB && RequiredBB != CurBB) {
8754 assert(!RequiredBB->empty());
8755 RequiredBB->insertDbgRecordBefore(DVR->clone(),
8756 RequiredBB->back().getIterator());
8757 DVRsToDelete.push_back(DVR);
8758 }
8759 };
8760
8761 // The location and scope of variable intrinsics and records still point to
8762 // the parent function of the target region. Update them.
8763 for (Instruction &I : instructions(Func)) {
8764 assert(!isa<DbgInfoIntrinsic>(I) &&
8765 "Unexpected debug intrinsic");
8766 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
8767 UpdateDebugRecord(&DVR);
8768 MoveDebugRecordToCorrectBlock(&DVR);
8769 }
8770 }
8771 for (auto *DVR : DVRsToDelete)
8772 DVR->getMarker()->MarkedInstr->dropOneDbgRecord(DVR);
8773 // An extra argument is passed to the device. Create the debug data for it.
8774 if (OMPBuilder.Config.isTargetDevice()) {
8775 DICompileUnit *CU = NewSP->getUnit();
8776 Module *M = Func->getParent();
8777 DIBuilder DB(*M, true, CU);
8778 DIType *VoidPtrTy =
8779 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
8780 unsigned ArgNo = Func->arg_size();
8781 DILocalVariable *Var = DB.createParameterVariable(
8782 NewSP, "dyn_ptr", ArgNo, NewSP->getFile(), /*LineNo=*/0, VoidPtrTy,
8783 /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
8784 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
8785 Argument *LastArg = Func->getArg(Func->arg_size() - 1);
8786 DB.insertDeclare(LastArg, Var, DB.createExpression(), Loc,
8787 &(*Func->begin()));
8788 }
8789}
8790
8791 static Value *removeASCastIfPresent(Value *V) {
8792 if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
8793 return cast<Operator>(V)->getOperand(0);
8794 return V;
8795}
8796
8797 static Expected<Function *> createOutlinedFunction(
8798 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8799 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
8800 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
8801 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
8802 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
8803 SmallVector<Type *> ParameterTypes;
8804 if (OMPBuilder.Config.isTargetDevice()) {
8805 // All parameters to target devices are passed as pointers
8806 // or i64. This assumes 64-bit address spaces/pointers.
8807 for (auto &Arg : Inputs)
8808 ParameterTypes.push_back(Arg->getType()->isPointerTy()
8809 ? Arg->getType()
8810 : Type::getInt64Ty(Builder.getContext()));
8811 } else {
8812 for (auto &Arg : Inputs)
8813 ParameterTypes.push_back(Arg->getType());
8814 }
8815
8816 // The implicit dyn_ptr argument is always the last parameter on both host
8817 // and device so the argument counts match without runtime manipulation.
8818 auto *PtrTy = PointerType::getUnqual(Builder.getContext());
8819 ParameterTypes.push_back(PtrTy);
8820
8821 auto BB = Builder.GetInsertBlock();
8822 auto M = BB->getModule();
8823 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
8824 /*isVarArg*/ false);
8825 auto Func =
8826 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
8827
8828 // Forward target-cpu and target-features function attributes from the
8829 // original function to the new outlined function.
8830 Function *ParentFn = Builder.GetInsertBlock()->getParent();
8831
8832 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
8833 if (TargetCpuAttr.isStringAttribute())
8834 Func->addFnAttr(TargetCpuAttr);
8835
8836 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
8837 if (TargetFeaturesAttr.isStringAttribute())
8838 Func->addFnAttr(TargetFeaturesAttr);
8839
8840 if (OMPBuilder.Config.isTargetDevice()) {
8841 Value *ExecMode =
8842 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
8843 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
8844 }
8845
8846 // Save insert point.
8847 IRBuilder<>::InsertPointGuard IPG(Builder);
8848 // We will generate the entries in the outlined function but the debug
8849 // location may still be pointing to the parent function. Reset it now.
8850 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
8851
8852 // Generate the region into the function.
8853 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
8854 Builder.SetInsertPoint(EntryBB);
8855
8856 // Insert target init call in the device compilation pass.
8857 if (OMPBuilder.Config.isTargetDevice())
8858 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
8859
8860 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
8861
8862 // As we embed the user code in the middle of our target region after we
8863 // generate entry code, we must move what allocas we can into the entry
8864 // block to avoid possibly breaking optimisations for the device.
8865 if (OMPBuilder.Config.isTargetDevice())
8866 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
8867
8868 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "target.exit");
8869 BasicBlock *OutlinedBodyBB =
8870 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
8871 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc(
8872 Builder.saveIP(),
8873 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()),
8874 ExitBB);
8875 if (!AfterIP)
8876 return AfterIP.takeError();
8877 Builder.SetInsertPoint(ExitBB);
8878
8879 // Insert target deinit call in the device compilation pass.
8880 if (OMPBuilder.Config.isTargetDevice())
8881 OMPBuilder.createTargetDeinit(Builder);
8882
8883 // Insert return instruction.
8884 Builder.CreateRetVoid();
8885
8886 // New Alloca IP at entry point of created device function.
8887 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
8888 auto AllocaIP = Builder.saveIP();
8889
8890 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
8891
8892 // Do not include the artificial dyn_ptr argument.
8893 const auto &ArgRange = make_range(Func->arg_begin(), Func->arg_end() - 1);
8894
8895 DenseMap<Value *, std::tuple<Value *, unsigned>> ValueReplacementMap;
8896
8897 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
8898 // Things like GEP's can come in the form of Constants. Constants and
8899 // ConstantExpr's do not have access to the knowledge of what they're
8900 // contained in, so we must dig a little to find an instruction so we
8901 // can tell if they're used inside of the function we're outlining. We
8902 // also replace the original constant expression with an equivalent new
8903 // instruction, as an instruction allows easy modification in the
8904 // following loop: we now know the constant (instruction) is
8905 // owned by our target function and replaceUsesOfWith can now be invoked
8906 // on it (this cannot be done with constants). A brand new one also
8907 // allows us to be cautious as it is perhaps possible the old expression
8908 // was used inside of the function but exists and is used externally
8909 // (unlikely by the nature of a Constant, but still).
8910 // NOTE: We cannot remove dead constants that have been rewritten to
8911 // instructions at this stage, we run the risk of breaking later lowering
8912 // by doing so as we could still be in the process of lowering the module
8913 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
8914 // constants we have created rewritten versions of.
8915 if (auto *Const = dyn_cast<Constant>(Input))
8916 convertUsersOfConstantsToInstructions(Const, Func, false);
8917
8918 // Collect users before iterating over them to avoid invalidating the
8919 // iteration in case a user uses Input more than once (e.g. a call
8920 // instruction).
8921 SetVector<User *> Users(Input->users().begin(), Input->users().end());
8922 // Collect all the instructions
8923 for (User *User : make_early_inc_range(Users))
8924 if (auto *Instr = dyn_cast<Instruction>(User))
8925 if (Instr->getFunction() == Func)
8926 Instr->replaceUsesOfWith(Input, InputCopy);
8927 };
8928
8929 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
8930
8931 // Rewrite uses of input values to parameters.
8932 for (auto InArg : zip(Inputs, ArgRange)) {
8933 Value *Input = std::get<0>(InArg);
8934 Argument &Arg = std::get<1>(InArg);
8935 Value *InputCopy = nullptr;
8936
8937 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = ArgAccessorFuncCB(
8938 Arg, Input, InputCopy, AllocaIP, Builder.saveIP(),
8939 OpenMPIRBuilder::InsertPointTy(ExitBB, ExitBB->begin()));
8940 if (!AfterIP)
8941 return AfterIP.takeError();
8942 Builder.restoreIP(*AfterIP);
8943 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
8944
8945 // In certain cases a Global may be set up for replacement; however, this
8946 // Global may be used in multiple arguments to the kernel, just segmented
8947 // apart. For example, if we have a global array that is sectioned into
8948 // multiple mappings (technically not legal in OpenMP, but there is a case
8949 // in Fortran for Common Blocks where this is necessary), we will end up
8950 // with GEPs into this array inside the kernel that refer to the Global
8951 // but are, for all intents and purposes, separate arguments to the
8952 // kernel. If we have mapped a segment that requires a GEP into the 0-th
8953 // index, it will fold into a reference to the Global; if we then encounter
8954 // this folded GEP during replacement, all of the references to the
8955 // Global in the kernel will be replaced with the argument we have generated
8956 // that corresponds to it, including any other GEPs that refer to the
8957 // Global and that may be other arguments. This will invalidate all of the
8958 // preceding mapped arguments that refer to the same Global but are
8959 // separate segments. To prevent this, we defer global processing until all
8960 // other processing has been performed.
8961 if (llvm::isa<llvm::GlobalValue>(std::get<0>(InArg)) ||
8962 llvm::isa<llvm::GlobalObject>(std::get<0>(InArg))) {
8963 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
8964 continue;
8965 }
8966
8967 if (!InputCopy)
8968 continue;
8969
8970 ReplaceValue(Input, InputCopy, Func);
8971 }
8972
8973 // Replace all of our deferred Input values, currently just Globals.
8974 for (auto Deferred : DeferredReplacement)
8975 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
8976
8977 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
8978 ValueReplacementMap);
8979 return Func;
8980}
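// Illustrative shape of the function built above for a device compile; the
// entry name, attributes, and user arguments depend on the configuration and
// the callbacks, and the artificial %dyn_ptr is the trailing argument that
// ArgRange above excludes:
//
//   define void @__omp_offloading_<id>(ptr %a, ptr %b, ptr %dyn_ptr) {
//   entry:
//     %init = call i32 @__kmpc_target_init(ptr @<kernel_env>, ptr %dyn_ptr)
//     ; ... user code generated by CBFunc ...
//     call void @__kmpc_target_deinit()
//     ret void
//   }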
8981/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
8982/// of pointers containing shared data between the parent task and the created
8983/// task.
8984static Value *loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder,
8985 IRBuilderBase &Builder,
8986 Value *TaskWithPrivates,
8987 Type *TaskWithPrivatesTy) {
8988
8989 Type *TaskTy = OMPIRBuilder.Task;
8990 LLVMContext &Ctx = Builder.getContext();
8991 Value *TaskT =
8992 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
8993 Value *Shareds = TaskT;
8994 // TaskWithPrivatesTy can be one of the following
8995 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
8996 // %struct.privates }
8997 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
8998 //
8999 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
9000 // its first member has to be the task descriptor. TaskTy is the type of the
9001 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
9002 // first member of TaskT gives us the pointer to shared data.
9003 if (TaskWithPrivatesTy != TaskTy)
9004 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
9005 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
9006}
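// For the wrapped layout (case 1 above), the helper therefore emits roughly
// the following sequence (illustrative sketch):
//
//   %task = getelementptr %struct.task_with_privates, ptr %twp, i32 0, i32 0
//   %shareds.gep = getelementptr %struct.kmp_task_ompbuilder_t, ptr %task,
//                                i32 0, i32 0
//   %shareds = load ptr, ptr %shareds.gep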
9007/// Create an entry point for a target task. It'll have the following
9008/// signature:
9009/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
9010/// This function is called from emitTargetTask once the
9011/// code to launch the target kernel has been outlined already.
9012/// NumOffloadingArrays is the number of offloading arrays that we need to copy
9013/// into the task structure so that the deferred target task can access this
9014/// data even after the stack frame of the generating task has been rolled
9015/// back. Offloading arrays contain base pointers, pointers, sizes etc
9016/// of the data that the target kernel will access. These in effect are the
9017/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
9018static Function *emitTargetTaskProxyFunction(
9019 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
9020 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
9021 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
9022
9023 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
9024 // This is because PrivatesTy is the type of the structure in which
9025 // we pass the offloading arrays to the deferred target task.
9026 assert((!NumOffloadingArrays || PrivatesTy) &&
9027 "PrivatesTy cannot be nullptr when there are offloadingArrays"
9028 "to privatize");
9029
9030 Module &M = OMPBuilder.M;
9031 // KernelLaunchFunction is the target launch function, i.e.
9032 // the function that sets up kernel arguments and calls
9033 // __tgt_target_kernel to launch the kernel on the device.
9034 //
9035 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
9036
9037 // StaleCI is the CallInst which is the call to the outlined
9038 // target kernel launch function. If there are local live-in values
9039 // that the outlined function uses then these are aggregated into a structure
9040 // which is passed as the second argument. If there are no local live-in
9041 // values or if all values used by the outlined kernel are global variables,
9042 // then there's only one argument, the threadID. So, StaleCI can be
9043 //
9044 // %structArg = alloca { ptr, ptr }, align 8
9045 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
9046 // store ptr %20, ptr %gep_, align 8
9047 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
9048 // store ptr %21, ptr %gep_8, align 8
9049 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
9050 //
9051 // OR
9052 //
9053 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
9054 Builder.SetInsertPoint(StaleCI->getParent(),
9055 StaleCI->getIterator());
9056
9057 LLVMContext &Ctx = StaleCI->getParent()->getContext();
9058
9059 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
9060 Type *TaskPtrTy = OMPBuilder.TaskPtr;
9061 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
9062
9063 auto ProxyFnTy =
9064 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
9065 /* isVarArg */ false);
9066 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
9067 ".omp_target_task_proxy_func",
9068 Builder.GetInsertBlock()->getModule());
9069 Value *ThreadId = ProxyFn->getArg(0);
9070 Value *TaskWithPrivates = ProxyFn->getArg(1);
9071 ThreadId->setName("thread.id");
9072 TaskWithPrivates->setName("task");
9073
9074 bool HasShareds = SharedArgsOperandNo > 0;
9075 bool HasOffloadingArrays = NumOffloadingArrays > 0;
9076 BasicBlock *EntryBB =
9077 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
9078 Builder.SetInsertPoint(EntryBB);
9079
9080 SmallVector<Value *> KernelLaunchArgs;
9081 KernelLaunchArgs.reserve(StaleCI->arg_size());
9082 KernelLaunchArgs.push_back(ThreadId);
9083
9084 if (HasOffloadingArrays) {
9085 assert(TaskTy != TaskWithPrivatesTy &&
9086 "If there are offloading arrays to pass to the target"
9087 "TaskTy cannot be the same as TaskWithPrivatesTy");
9088 (void)TaskTy;
9089 Value *Privates =
9090 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
9091 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
9092 KernelLaunchArgs.push_back(
9093 Builder.CreateStructGEP(PrivatesTy, Privates, i));
9094 }
9095
9096 if (HasShareds) {
9097 auto *ArgStructAlloca =
9098 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
9099 assert(ArgStructAlloca &&
9100 "Unable to find the alloca instruction corresponding to arguments "
9101 "for extracted function");
9102 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
9103 std::optional<TypeSize> ArgAllocSize =
9104 ArgStructAlloca->getAllocationSize(M.getDataLayout());
9105 assert(ArgStructType && ArgAllocSize &&
9106 "Unable to determine size of arguments for extracted function");
9107 uint64_t StructSize = ArgAllocSize->getFixedValue();
9108
9109 AllocaInst *NewArgStructAlloca =
9110 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
9111
9112 Value *SharedsSize = Builder.getInt64(StructSize);
9113
9114 Value *LoadShared = loadSharedDataFromTaskDescriptor(
9115 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
9116
9117 Builder.CreateMemCpy(
9118 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
9119 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
9120 KernelLaunchArgs.push_back(NewArgStructAlloca);
9121 }
9122 OMPBuilder.createRuntimeFunctionCall(KernelLaunchFunction, KernelLaunchArgs);
9123 Builder.CreateRetVoid();
9124 return ProxyFn;
9125}
9126static Type *getOffloadingArrayType(Value *V) {
9127
9128 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
9129 return GEP->getSourceElementType();
9130 if (auto *Alloca = dyn_cast<AllocaInst>(V))
9131 return Alloca->getAllocatedType();
9132
9133 llvm_unreachable("Unhandled Instruction type");
9134 return nullptr;
9135}
9136// This function returns a struct that has at most two members.
9137// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
9138// descriptor. The second member, if needed, is a struct containing arrays
9139// that need to be passed to the offloaded target kernel. For example,
9140// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
9141// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
9142// respectively, then the types created by this function are
9143//
9144// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
9145// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
9146// %struct.privates }
9147// %struct.task_with_privates is returned by this function.
9148// If there aren't any offloading arrays to pass to the target kernel,
9149// %struct.kmp_task_ompbuilder_t is returned.
9150static StructType *
9151createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder,
9152 ArrayRef<Value *> OffloadingArraysToPrivatize) {
9153
9154 if (OffloadingArraysToPrivatize.empty())
9155 return OMPIRBuilder.Task;
9156
9157 SmallVector<Type *, 4> StructFieldTypes;
9158 for (Value *V : OffloadingArraysToPrivatize) {
9159 assert(V->getType()->isPointerTy() &&
9160 "Expected pointer to array to privatize. Got a non-pointer value "
9161 "instead");
9162 Type *ArrayTy = getOffloadingArrayType(V);
9163 assert(ArrayTy && "ArrayType cannot be nullptr");
9164 StructFieldTypes.push_back(ArrayTy);
9165 }
9166 StructType *PrivatesStructTy =
9167 StructType::create(StructFieldTypes, "struct.privates");
9168 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
9169 "struct.task_with_privates");
9170}
9171static Error emitTargetOutlinedFunction(
9172 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
9173 TargetRegionEntryInfo &EntryInfo,
9174 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
9175 Function *&OutlinedFn, Constant *&OutlinedFnID,
9176 SmallVectorImpl<Value *> &Inputs,
9177 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
9178 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
9179
9180 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
9181 [&](StringRef EntryFnName) {
9182 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
9183 EntryFnName, Inputs, CBFunc,
9184 ArgAccessorFuncCB);
9185 };
9186
9187 return OMPBuilder.emitTargetRegionFunction(
9188 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
9189 OutlinedFnID);
9190}
9191
9192OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
9193 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
9194 OpenMPIRBuilder::InsertPointTy AllocaIP,
9195 const DependenciesInfo &Dependencies, const TargetDataRTArgs &RTArgs,
9196 bool HasNoWait) {
9197
9198 // The following explains the code-gen scenario for the `target` directive. A
9199 // similar scenario is followed for other device-related directives (e.g.
9200 // `target enter data`), but in a simpler fashion, since we only need to emit
9201 // a task that encapsulates the proper runtime call.
9202 //
9203 // When we arrive at this function, the target region itself has been
9204 // outlined into the function OutlinedFn.
9205 // So at this point, for
9206 // --------------------------------------------------------------
9207 // void user_code_that_offloads(...) {
9208 // omp target depend(..) map(from:a) map(to:b) private(i)
9209 // do i = 1, 10
9210 // a(i) = b(i) + n
9211 // }
9212 //
9213 // --------------------------------------------------------------
9214 //
9215 // we have
9216 //
9217 // --------------------------------------------------------------
9218 //
9219 // void user_code_that_offloads(...) {
9220 // %.offload_baseptrs = alloca [2 x ptr], align 8
9221 // %.offload_ptrs = alloca [2 x ptr], align 8
9222 // %.offload_mappers = alloca [2 x ptr], align 8
9223 // ;; target region has been outlined and now we need to
9224 // ;; offload to it via a target task.
9225 // }
9226 // void outlined_device_function(ptr a, ptr b, ptr n) {
9227 // n = *n_ptr;
9228 // do i = 1, 10
9229 // a(i) = b(i) + n
9230 // }
9231 //
9232 // We have to now do the following
9233 // (i) Make an offloading call to outlined_device_function using the OpenMP
9234 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
9235 // emitted by emitKernelLaunch
9236 // (ii) Create a task entry point function that calls kernel_launch_function
9237 // and is the entry point for the target task. See
9238 // '@.omp_target_task_proxy_func in the pseudocode below.
9239 // (iii) Create a task with the task entry point created in (ii)
9240 //
9241 // That is we create the following
9242 // struct task_with_privates {
9243 // struct kmp_task_ompbuilder_t task_struct;
9244 // struct privates {
9245 // [2 x ptr] ; baseptrs
9246 // [2 x ptr] ; ptrs
9247 // [2 x i64] ; sizes
9248 // }
9249 // }
9250 // void user_code_that_offloads(...) {
9251 // %.offload_baseptrs = alloca [2 x ptr], align 8
9252 // %.offload_ptrs = alloca [2 x ptr], align 8
9253 // %.offload_sizes = alloca [2 x i64], align 8
9254 //
9255 // %structArg = alloca { ptr, ptr, ptr }, align 8
9256 // %strucArg[0] = a
9257 // %strucArg[1] = b
9258 // %strucArg[2] = &n
9259 //
9260 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
9261 // sizeof(kmp_task_ompbuilder_t),
9262 // sizeof(structArg),
9263 // @.omp_target_task_proxy_func,
9264 // ...)
9265 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
9266 // sizeof(structArg))
9267 // memcpy(target_task_with_privates->privates->baseptrs,
9268 // offload_baseptrs, sizeof(offload_baseptrs))
9269 // memcpy(target_task_with_privates->privates->ptrs,
9270 // offload_ptrs, sizeof(offload_ptrs))
9271 // memcpy(target_task_with_privates->privates->sizes,
9272 // offload_sizes, sizeof(offload_sizes))
9273 // dependencies_array = ...
9274 // ;; if nowait not present
9275 // call @__kmpc_omp_wait_deps(..., dependencies_array)
9276 // call @__kmpc_omp_task_begin_if0(...)
9277 // call @.omp_target_task_proxy_func(i32 thread_id, ptr
9278 // %target_task_with_privates)
9279 // call @__kmpc_omp_task_complete_if0(...)
9280 // }
9281 //
9282 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
9283 // ptr %task) {
9284 // %structArg = alloca {ptr, ptr, ptr}
9285 // %task_ptr = getelementptr(%task, 0, 0)
9286 // %shared_data = load (getelementptr %task_ptr, 0, 0)
9287 // memcpy(%structArg, %shared_data, sizeof(%structArg))
9288 //
9289 // %offloading_arrays = getelementptr(%task, 0, 1)
9290 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
9291 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
9292 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
9293 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
9294 // %offload_sizes, %structArg)
9295 // }
9296 //
9297 // We need the proxy function because the signature of the task entry point
9298 // expected by kmpc_omp_task is always the same and will be different from
9299 // that of the kernel_launch function.
9300 //
9301 // kernel_launch_function is generated by emitKernelLaunch and has the
9302 // always_inline attribute. For this example, it'll look like so:
9303 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
9304 // %offload_sizes, %structArg) alwaysinline {
9305 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
9306 // ; load aggregated data from %structArg
9307 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
9308 // ; offload_sizes
9309 // call i32 @__tgt_target_kernel(...,
9310 // outlined_device_function,
9311 // ptr %kernel_args)
9312 // }
9313 // void outlined_device_function(ptr a, ptr b, ptr n) {
9314 // n = *n_ptr;
9315 // do i = 1, 10
9316 // a(i) = b(i) + n
9317 // }
9318 //
9319 BasicBlock *TargetTaskBodyBB =
9320 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
9321 BasicBlock *TargetTaskAllocaBB =
9322 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
9323
9324 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
9325 TargetTaskAllocaBB->begin());
9326 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
9327
9328 auto OI = std::make_unique<OutlineInfo>();
9329 OI->EntryBB = TargetTaskAllocaBB;
9330 OI->OuterAllocBB = AllocaIP.getBlock();
9331
9332 // Add the thread ID argument.
9333 SmallVector<Instruction *, 4> ToBeDeleted;
9334 OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9335 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
9336
9337 // Generate the task body which will subsequently be outlined.
9338 Builder.restoreIP(TargetTaskBodyIP);
9339 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
9340 return Err;
9341
9342 // The outliner (CodeExtractor) extracts a sequence or vector of blocks that
9343 // it is given. These blocks are enumerated by
9344 // OpenMPIRBuilder::OutlineInfo::collectBlocks, which expects the OI.ExitBlock
9345 // to be outside the region. In other words, OI.ExitBlock is expected to be
9346 // the start of the region after the outlining. We used to set OI.ExitBlock
9347 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases,
9348 // except when the task body is a single basic block. In that case,
9349 // OI.ExitBlock is set to the single task body block and will get left out of
9350 // the outlining process. So, simply create a new empty block to which we
9351 // unconditionally branch from where TaskBodyCB left off.
9352 OI->ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
9353 emitBlock(OI->ExitBB, Builder.GetInsertBlock()->getParent(),
9354 /*IsFinished=*/true);
9355
9356 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
9357 bool NeedsTargetTask = HasNoWait && DeviceID;
9358 if (NeedsTargetTask) {
9359 for (auto *V :
9360 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
9361 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
9362 RTArgs.SizesArray}) {
9363 if (V) {
9364 OffloadingArraysToPrivatize.push_back(V);
9365 OI->ExcludeArgsFromAggregate.push_back(V);
9366 }
9367 }
9368 }
9369 OI->PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
9370 DeviceID, OffloadingArraysToPrivatize](
9371 Function &OutlinedFn) mutable {
9372 assert(OutlinedFn.hasOneUse() &&
9373 "there must be a single user for the outlined function");
9374
9375 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
9376
9377 // The first argument of StaleCI is always the thread id.
9378 // The next few arguments are the pointers to offloading arrays
9379 // if any. (see OffloadingArraysToPrivatize)
9380 // Finally, all other local values that are live-in into the outlined region
9381 // end up in a structure whose pointer is passed as the last argument. This
9382 // piece of data is passed in the "shared" field of the task structure. So,
9383 // we know we have to pass shareds to the task if the number of arguments is
9384 // greater than OffloadingArraysToPrivatize.size() + 1; the 1 is for the
9385 // thread id. Further, for safety, we assert that the number of arguments of
9386 // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2.
9387 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
9388 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
9389 assert((!HasShareds ||
9390 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
9391 "Wrong number of arguments for StaleCI when shareds are present");
9392 int SharedArgOperandNo =
9393 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
9394
9395 StructType *TaskWithPrivatesTy =
9396 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
9397 StructType *PrivatesTy = nullptr;
9398
9399 if (!OffloadingArraysToPrivatize.empty())
9400 PrivatesTy =
9401 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
9402
9403 Function *ProxyFn = emitTargetTaskProxyFunction(
9404 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
9405 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
9406
9407 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
9408 << "\n");
9409
9410 Builder.SetInsertPoint(StaleCI);
9411
9412 // Gather the arguments for emitting the runtime call.
9413 uint32_t SrcLocStrSize;
9414 Constant *SrcLocStr =
9415 getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
9416 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
9417
9418 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
9419 //
9420 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc, both to
9421 // provide the DeviceID to the deferred task and because
9422 // @__kmpc_omp_target_task_alloc creates an untied/async task.
9423 Function *TaskAllocFn =
9424 !NeedsTargetTask
9425 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
9426 : getOrCreateRuntimeFunctionPtr(
9427 OMPRTL___kmpc_omp_target_task_alloc);
9428
9429 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the task
9430 // allocation call.
9431 Value *ThreadID = getOrCreateThreadID(Ident);
9432
9433 // Argument - `sizeof_kmp_task_t` (TaskSize)
9434 // TaskSize refers to the size in bytes of the kmp_task_t data structure
9435 // plus any other data to be passed to the target task, if any, which
9436 // is packed into a struct. kmp_task_t and the struct so created are
9437 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
9438 Value *TaskSize = Builder.getInt64(
9439 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
9440
9441 // Argument - `sizeof_shareds` (SharedsSize)
9442 // SharedsSize refers to the shareds array size in the kmp_task_t data
9443 // structure.
9444 Value *SharedsSize = Builder.getInt64(0);
9445 if (HasShareds) {
9446 auto *ArgStructAlloca =
9447 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
9448 assert(ArgStructAlloca &&
9449 "Unable to find the alloca instruction corresponding to arguments "
9450 "for extracted function");
9451 std::optional<TypeSize> ArgAllocSize =
9452 ArgStructAlloca->getAllocationSize(M.getDataLayout());
9453 assert(ArgAllocSize &&
9454 "Unable to determine size of arguments for extracted function");
9455 SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
9456 }
9457
9458 // Argument - `flags`
9459 // Task is tied iff (Flags & 1) == 1.
9460 // Task is untied iff (Flags & 1) == 0.
9461 // Task is final iff (Flags & 2) == 2.
9462 // Task is not final iff (Flags & 2) == 0.
9463 // A target task is not final and is untied.
9464 Value *Flags = Builder.getInt32(0);
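// (Illustration: per the bit layout documented above, a tied, non-final
// task would instead use Builder.getInt32(1).)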
9465
9466 // Emit the @__kmpc_omp_task_alloc runtime call
9467 // The runtime call returns a pointer to an area where the task captured
9468 // variables must be copied before the task is run (TaskData)
9469 CallInst *TaskData = nullptr;
9470
9471 SmallVector<llvm::Value *> TaskAllocArgs = {
9472 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
9473 /*flags=*/Flags,
9474 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
9475 /*task_func=*/ProxyFn};
9476
9477 if (NeedsTargetTask) {
9478 assert(DeviceID && "Expected non-empty device ID.");
9479 TaskAllocArgs.push_back(DeviceID);
9480 }
9481
9482 TaskData = createRuntimeFunctionCall(TaskAllocFn, TaskAllocArgs);
9483
9484 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
9485 if (HasShareds) {
9486 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
9487 Value *TaskShareds = loadSharedDataFromTaskDescriptor(
9488 *this, Builder, TaskData, TaskWithPrivatesTy);
9489 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
9490 SharedsSize);
9491 }
9492 if (!OffloadingArraysToPrivatize.empty()) {
9493 Value *Privates =
9494 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
9495 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
9496 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
9497 [[maybe_unused]] Type *ArrayType =
9498 getOffloadingArrayType(PtrToPrivatize);
9499 assert(ArrayType && "ArrayType cannot be nullptr");
9500
9501 Type *ElementType = PrivatesTy->getElementType(i);
9502 assert(ElementType == ArrayType &&
9503 "ElementType should match ArrayType");
9504 (void)ArrayType;
9505
9506 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
9507 Builder.CreateMemCpy(
9508 Dst, Alignment, PtrToPrivatize, Alignment,
9509 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
9510 }
9511 }
9512
9513 Value *DepArray = nullptr;
9514 Value *NumDeps = nullptr;
9515 if (Dependencies.DepArray) {
9516 DepArray = Dependencies.DepArray;
9517 NumDeps = Dependencies.NumDeps;
9518 } else if (!Dependencies.Deps.empty()) {
9519 DepArray = emitTaskDependencies(*this, Dependencies.Deps);
9520 NumDeps = Builder.getInt32(Dependencies.Deps.size());
9521 }
9522
9523 // ---------------------------------------------------------------
9524 // V5.2 13.8 target construct
9525 // If the nowait clause is present, execution of the target task
9526 // may be deferred. If the nowait clause is not present, the target task is
9527 // an included task.
9528 // ---------------------------------------------------------------
9529 // The above means that the lack of a nowait on the target construct
9530 // translates to '#pragma omp task if(0)'
9531 if (!NeedsTargetTask) {
9532 if (DepArray) {
9533 Function *TaskWaitFn =
9534 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
9535 createRuntimeFunctionCall(
9536 TaskWaitFn,
9537 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
9538 /*ndeps=*/NumDeps,
9539 /*dep_list=*/DepArray,
9540 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
9541 /*noalias_dep_list=*/
9542 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
9543 }
9544 // Included task.
9545 Function *TaskBeginFn =
9546 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
9547 Function *TaskCompleteFn =
9548 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
9549 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
9550 CallInst *CI = createRuntimeFunctionCall(ProxyFn, {ThreadID, TaskData});
9551 CI->setDebugLoc(StaleCI->getDebugLoc());
9552 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
9553 } else if (DepArray) {
9554 // HasNoWait - meaning the task may be deferred. Call
9555 // __kmpc_omp_task_with_deps if there are dependencies,
9556 // else call __kmpc_omp_task
9557 Function *TaskFn =
9558 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
9559 createRuntimeFunctionCall(
9560 TaskFn,
9561 {Ident, ThreadID, TaskData, NumDeps, DepArray,
9562 ConstantInt::get(Builder.getInt32Ty(), 0),
9563 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
9564 } else {
9565 // Emit the @__kmpc_omp_task runtime call to spawn the task
9566 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
9567 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
9568 }
9569
9570 StaleCI->eraseFromParent();
9571 for (Instruction *I : llvm::reverse(ToBeDeleted))
9572 I->eraseFromParent();
9573 };
9574 addOutlineInfo(std::move(OI));
9575
9576 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
9577 << *(Builder.GetInsertBlock()) << "\n");
9578 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
9579 << *(Builder.GetInsertBlock()->getParent()->getParent())
9580 << "\n");
9581 return Builder.saveIP();
9582}
9583
9584Error OpenMPIRBuilder::emitOffloadingArraysAndArgs(
9585 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
9586 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
9587 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
9588 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
9589 if (Error Err =
9590 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
9591 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
9592 return Err;
9593 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
9594 return Error::success();
9595}
9596
9597static void emitTargetCall(
9598 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
9599 OpenMPIRBuilder::InsertPointTy AllocaIP, ArrayRef<BasicBlock *> DeallocBlocks,
9600 OpenMPIRBuilder::TargetDataInfo &Info,
9601 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
9602 const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs,
9603 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
9604 SmallVectorImpl<Value *> &Args,
9605 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
9606 OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB,
9607 const OpenMPIRBuilder::DependenciesInfo &Dependencies, bool HasNoWait,
9608 Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
9609 // Generate a function call to the host fallback implementation of the target
9610 // region. This is called by the host when no offload entry was generated for
9611 // the target region and when the offloading call fails at runtime.
9612 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
9613 -> OpenMPIRBuilder::InsertPointOrErrorTy {
9614 Builder.restoreIP(IP);
9615 // Ensure the host fallback has the same dyn_ptr ABI as the device.
9616 SmallVector<Value *> FallbackArgs(Args.begin(), Args.end());
9617 FallbackArgs.push_back(
9618 Constant::getNullValue(PointerType::getUnqual(Builder.getContext())));
9619 OMPBuilder.createRuntimeFunctionCall(OutlinedFn, FallbackArgs);
9620 return Builder.saveIP();
9621 };
9622
9623 bool HasDependencies = !Dependencies.empty();
9624 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
9625
9626 TargetKernelArgs KArgs;
9627
9628 auto TaskBodyCB =
9629 [&](Value *DeviceID, Value *RTLoc,
9630 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
9631 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
9632 // produce any.
9633 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
9634 // emitKernelLaunch makes the necessary runtime call to offload the
9635 // kernel. We then outline all that code into a separate function
9636 // ('kernel_launch_function' in the pseudo code above). This function is
9637 // then called by the target task proxy function (see
9638 // '@.omp_target_task_proxy_func' in the pseudo code above)
9639 // "@.omp_target_task_proxy_func' is generated by
9640 // emitTargetTaskProxyFunction.
9641 if (OutlinedFnID && DeviceID)
9642 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
9643 EmitTargetCallFallbackCB, KArgs,
9644 DeviceID, RTLoc, TargetTaskAllocaIP);
9645
9646 // We only need to do the outlining if `DeviceID` is set to avoid calling
9647 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
9648 // generating the `else` branch of an `if` clause.
9649 //
9650 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
9651 // In this case, we execute the host implementation directly.
9652 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
9653 }());
9654
9655 OMPBuilder.Builder.restoreIP(AfterIP);
9656 return Error::success();
9657 };
9658
9659 auto &&EmitTargetCallElse =
9660 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9661 OpenMPIRBuilder::InsertPointTy CodeGenIP,
9662 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
9663 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
9664 // produce any.
9665 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
9666 if (RequiresOuterTargetTask) {
9667 // Arguments that are intended to be directly forwarded to an
9668 // emitKernelLaunch call are passed as nullptr, since
9669 // OutlinedFnID=nullptr results in that call not being done.
9670 OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs;
9671 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
9672 /*RTLoc=*/nullptr, AllocaIP,
9673 Dependencies, EmptyRTArgs, HasNoWait);
9674 }
9675 return EmitTargetCallFallbackCB(Builder.saveIP());
9676 }());
9677
9678 Builder.restoreIP(AfterIP);
9679 return Error::success();
9680 };
9681
9682 auto &&EmitTargetCallThen =
9683 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9684 OpenMPIRBuilder::InsertPointTy CodeGenIP,
9685 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
9686 Info.HasNoWait = HasNoWait;
9687 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
9688 OpenMPIRBuilder::TargetDataRTArgs RTArgs;
9688
9690 if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
9691 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
9692 /*IsNonContiguous=*/true,
9693 /*ForEndCall=*/false))
9694 return Err;
9695
9696 SmallVector<Value *, 3> NumTeamsC;
9697 for (auto [DefaultVal, RuntimeVal] :
9698 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
9699 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
9700 : Builder.getInt32(DefaultVal));
9701
9702 // Calculate number of threads: 0 if no clauses specified, otherwise it is
9703 // the minimum of the optional THREAD_LIMIT and NUM_THREADS clauses.
9704 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
9705 if (Clause)
9706 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
9707 /*isSigned=*/false);
9708 return Clause;
9709 };
9710 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
9711 if (Clause)
9712 Result =
9713 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
9714 Result, Clause)
9715 : Clause;
9716 };
9717
9718 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
9719 // the NUM_THREADS clause is overridden by THREAD_LIMIT.
9720 SmallVector<Value *, 3> NumThreadsC;
9721 Value *MaxThreadsClause =
9722 RuntimeAttrs.TeamsThreadLimit.size() == 1
9723 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
9724 : nullptr;
9725
9726 for (auto [TeamsVal, TargetVal] : zip_equal(
9727 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
9728 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
9729 Value *NumThreads = InitMaxThreadsClause(TargetVal);
9730
9731 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
9732 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
9733
9734 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
9735 }
9736
9737 unsigned NumTargetItems = Info.NumberOfPtrs;
9738 uint32_t SrcLocStrSize;
9739 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
9740 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
9741 llvm::omp::IdentFlag(0), 0);
9742
9743 Value *TripCount = RuntimeAttrs.LoopTripCount
9744 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
9745 Builder.getInt64Ty(),
9746 /*isSigned=*/false)
9747 : Builder.getInt64(0);
9748
9749 // Request zero groupprivate bytes by default.
9750 if (!DynCGroupMem)
9751 DynCGroupMem = Builder.getInt32(0);
9752
9753 KArgs = TargetKernelArgs(
9754 NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem,
9755 HasNoWait, DynCGroupMemFallback);
9756
9757 // Assume no error was returned because TaskBodyCB and
9758 // EmitTargetCallFallbackCB don't produce any.
9759 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
9760 // The presence of certain clauses on the target directive require the
9761 // explicit generation of the target task.
9762 if (RequiresOuterTargetTask)
9763 return OMPBuilder.emitTargetTask(TaskBodyCB, RuntimeAttrs.DeviceID,
9764 RTLoc, AllocaIP, Dependencies,
9765 KArgs.RTArgs, Info.HasNoWait);
9766
9767 return OMPBuilder.emitKernelLaunch(
9768 Builder, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
9769 RuntimeAttrs.DeviceID, RTLoc, AllocaIP);
9770 }());
9771
9772 Builder.restoreIP(AfterIP);
9773 return Error::success();
9774 };
9775
9776 // If we don't have an ID for the target region, it means an offload entry
9777 // wasn't created. In this case we just run the host fallback directly and
9778 // ignore any potential 'if' clauses.
9779 if (!OutlinedFnID) {
9780 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP(), DeallocBlocks));
9781 return;
9782 }
9783
9784 // If there's no 'if' clause, only generate the kernel launch code path.
9785 if (!IfCond) {
9786 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP(), DeallocBlocks));
9787 return;
9788 }
9789
9790 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
9791 EmitTargetCallElse, AllocaIP));
9792}
9793
9794OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
9795 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
9796 InsertPointTy CodeGenIP, ArrayRef<BasicBlock *> DeallocBlocks,
9797 TargetDataInfo &Info, TargetRegionEntryInfo &EntryInfo,
9798 const TargetKernelDefaultAttrs &DefaultAttrs,
9799 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
9800 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
9801 TargetBodyGenCallbackTy CBFunc,
9802 TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
9803 CustomMapperCallbackTy CustomMapperCB, const DependenciesInfo &Dependencies,
9804 bool HasNowait, Value *DynCGroupMem,
9805 OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
9806
9807 if (!updateToLocation(Loc))
9808 return InsertPointTy();
9809
9810 Builder.restoreIP(CodeGenIP);
9811
9812 Function *OutlinedFn;
9813 Constant *OutlinedFnID = nullptr;
9814 // The target region is outlined into its own function. The LLVM IR for
9815 // the target region itself is generated using the callbacks CBFunc
9816 // and ArgAccessorFuncCB.
9817 if (Error Err = emitTargetOutlinedFunction(
9818 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
9819 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
9820 return Err;
9821
9822 // If we are not on the target device, then we need to generate code
9823 // to make a remote call (offload) to the previously outlined function
9824 // that represents the target region. Do that now.
9825 if (!Config.isTargetDevice())
9826 emitTargetCall(*this, Builder, AllocaIP, DeallocBlocks, Info, DefaultAttrs,
9827 RuntimeAttrs, IfCond, OutlinedFn, OutlinedFnID, Inputs,
9828 GenMapInfoCB, CustomMapperCB, Dependencies, HasNowait,
9829 DynCGroupMem, DynCGroupMemFallback);
9830 return Builder.saveIP();
9831}
9832
9833std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
9834 StringRef FirstSeparator,
9835 StringRef Separator) {
9836 SmallString<128> Buffer;
9837 llvm::raw_svector_ostream OS(Buffer);
9838 StringRef Sep = FirstSeparator;
9839 for (StringRef Part : Parts) {
9840 OS << Sep << Part;
9841 Sep = Separator;
9842 }
9843 return OS.str().str();
9844}
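// For example, getNameWithSeparators({"gomp_critical_user_x", "var"}, ".", ".")
// yields ".gomp_critical_user_x.var", the form used by
// getOMPCriticalRegionLock below.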
9845
9846std::string
9847OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
9848 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
9849 Config.separator());
9850}
9851
9852GlobalVariable *OpenMPIRBuilder::getOrCreateInternalVariable(
9853 Type *Ty, const StringRef &Name, std::optional<unsigned> AddressSpace) {
9854 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
9855 if (Elem.second) {
9856 assert(Elem.second->getValueType() == Ty &&
9857 "OMP internal variable has different type than requested");
9858 } else {
9859 // TODO: investigate the appropriate linkage type used for the global
9860 // variable for possibly changing that to internal or private, or maybe
9861 // create different versions of the function for different OMP internal
9862 // variables.
9863 const DataLayout &DL = M.getDataLayout();
9864 // TODO: Investigate why AMDGPU expects AS 0 for globals even though the
9865 // default global AS is 1.
9866 // See double-target-call-with-declare-target.f90 and
9867 // declare-target-vars-in-target-region.f90 libomptarget
9868 // tests.
9869 unsigned AddressSpaceVal = AddressSpace ? *AddressSpace
9870 : M.getTargetTriple().isAMDGPU()
9871 ? 0
9872 : DL.getDefaultGlobalsAddressSpace();
9873 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
9874 ? GlobalValue::InternalLinkage
9875 : GlobalValue::CommonLinkage;
9876 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
9877 Constant::getNullValue(Ty), Elem.first(),
9878 /*InsertBefore=*/nullptr,
9879 GlobalValue::NotThreadLocal, AddressSpaceVal);
9880 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
9881 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpaceVal);
9882 GV->setAlignment(std::max(TypeAlign, PtrAlign));
9883 Elem.second = GV;
9884 }
9885
9886 return Elem.second;
9887}
9888
9889Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
9890 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
9891 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
9892 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
9893}
9894
9895Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
9896 LLVMContext &Ctx = Builder.getContext();
9897 Value *Null =
9898 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
9899 Value *SizeGep =
9900 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
9901 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
9902 return SizePtrToInt;
9903}
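// The above is the classic "sizeof via GEP" idiom: indexing one element past
// a null pointer and converting the result to an integer yields the size of
// the indexed type, here the (pointer) type of BasePtr. Illustrative sketch
// of the emitted IR:
//
//   %size.gep = getelementptr ptr, ptr null, i32 1
//   %size = ptrtoint ptr %size.gep to i64   ; pointer width in bytes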
9904
9905GlobalVariable *
9906OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
9907 std::string VarName) {
9908 llvm::Constant *MaptypesArrayInit =
9909 llvm::ConstantDataArray::get(M.getContext(), Mappings);
9910 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
9911 M, MaptypesArrayInit->getType(),
9912 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
9913 VarName);
9914 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
9915 return MaptypesArrayGlobal;
9916}
9917
9918void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
9919 InsertPointTy AllocaIP,
9920 unsigned NumOperands,
9921 struct MapperAllocas &MapperAllocas) {
9922 if (!updateToLocation(Loc))
9923 return;
9924
9925 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9926 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
9927 Builder.restoreIP(AllocaIP);
9928 AllocaInst *ArgsBase = Builder.CreateAlloca(
9929 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
9930 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
9931 ".offload_ptrs");
9932 AllocaInst *ArgSizes = Builder.CreateAlloca(
9933 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
9934 Builder.restoreIP(Loc.IP);
9935 MapperAllocas.ArgsBase = ArgsBase;
9936 MapperAllocas.Args = Args;
9937 MapperAllocas.ArgSizes = ArgSizes;
9938}
9939
9940void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
9941 Function *MapperFunc, Value *SrcLocInfo,
9942 Value *MaptypesArg, Value *MapnamesArg,
9943 struct MapperAllocas &MapperAllocas,
9944 int64_t DeviceID, unsigned NumOperands) {
9945 if (!updateToLocation(Loc))
9946 return;
9947
9948 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9949 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
9950 Value *ArgsBaseGEP =
9951 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
9952 {Builder.getInt32(0), Builder.getInt32(0)});
9953 Value *ArgsGEP =
9954 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
9955 {Builder.getInt32(0), Builder.getInt32(0)});
9956 Value *ArgSizesGEP =
9957 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
9958 {Builder.getInt32(0), Builder.getInt32(0)});
9959 Value *NullPtr =
9960 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
9961 createRuntimeFunctionCall(MapperFunc, {SrcLocInfo, Builder.getInt64(DeviceID),
9962 Builder.getInt32(NumOperands),
9963 ArgsBaseGEP, ArgsGEP, ArgSizesGEP,
9964 MaptypesArg, MapnamesArg, NullPtr});
9965}
9966
9967void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
9968 TargetDataRTArgs &RTArgs,
9969 TargetDataInfo &Info,
9970 bool ForEndCall) {
9971 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
9972 "expected region end call to runtime only when end call is separate");
9973 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
9974 auto VoidPtrTy = UnqualPtrTy;
9975 auto VoidPtrPtrTy = UnqualPtrTy;
9976 auto Int64Ty = Type::getInt64Ty(M.getContext());
9977 auto Int64PtrTy = UnqualPtrTy;
9978
9979 if (!Info.NumberOfPtrs) {
9980 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9981 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9982 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
9983 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
9984 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
9985 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9986 return;
9987 }
9988
9989 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
9990 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
9991 Info.RTArgs.BasePointersArray,
9992 /*Idx0=*/0, /*Idx1=*/0);
9993 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
9994 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
9995 /*Idx0=*/0,
9996 /*Idx1=*/0);
9997 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
9998 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9999 /*Idx0=*/0, /*Idx1=*/0);
10000 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
10001 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
10002 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
10003 : Info.RTArgs.MapTypesArray,
10004 /*Idx0=*/0,
10005 /*Idx1=*/0);
10006
10007 // Only emit the mapper information arrays if debug information is
10008 // requested.
10009 if (!Info.EmitDebug)
10010 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
10011 else
10012 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
10013 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
10014 /*Idx0=*/0,
10015 /*Idx1=*/0);
10016 // If there is no user-defined mapper, set the mapper array to nullptr to
10017 // avoid an unnecessary data privatization
10018 if (!Info.HasMapper)
10019 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
10020 else
10021 RTArgs.MappersArray =
10022 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
10023}
10024
10025void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
10026 InsertPointTy CodeGenIP,
10027 MapInfosTy &CombinedInfo,
10028 TargetDataInfo &Info) {
10029 MapInfosTy::StructNonContiguousInfo &NonContigInfo =
10030 CombinedInfo.NonContigInfo;
10031
10032 // Build an array of struct descriptor_dim and then assign it to
10033 // offload_args.
10034 //
10035 // struct descriptor_dim {
10036 // uint64_t offset;
10037 // uint64_t count;
10038 // uint64_t stride
10039 // };
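// As an illustration, a transfer that is non-contiguous in two dimensions
// gets two descriptor_dim entries, one {offset, count, stride} triple per
// dimension; the loop below fills them in reverse dimension order and then
// stores a pointer to the dims array into the matching offload_ptrs slot.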
10040 Type *Int64Ty = Builder.getInt64Ty();
10041 StructType *DimTy = StructType::create(
10042 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
10043 "struct.descriptor_dim");
10044
10045 enum { OffsetFD = 0, CountFD, StrideFD };
10046 // We need two index variables here since the size of "Dims" is the same as
10047 // the size of Components; however, the size of offset, count, and stride is
10048 // equal to the size of the base declaration that is non-contiguous.
10049 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
10050 // Skip emitting IR if the dimension size is 1 since it cannot be
10051 // non-contiguous.
10052 if (NonContigInfo.Dims[I] == 1)
10053 continue;
10054 Builder.restoreIP(AllocaIP);
10055 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
10056 AllocaInst *DimsAddr =
10057 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
10058 Builder.restoreIP(CodeGenIP);
10059 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
10060 unsigned RevIdx = EE - II - 1;
10061 Value *DimsLVal = Builder.CreateInBoundsGEP(
10062 ArrayTy, DimsAddr, {Builder.getInt64(0), Builder.getInt64(II)});
10063 // Offset
10064 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
10065 Builder.CreateAlignedStore(
10066 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
10067 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
10068 // Count
10069 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
10070 Builder.CreateAlignedStore(
10071 NonContigInfo.Counts[L][RevIdx], CountLVal,
10072 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
10073 // Stride
10074 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
10075 Builder.CreateAlignedStore(
10076 NonContigInfo.Strides[L][RevIdx], StrideLVal,
10077 M.getDataLayout().getPrefTypeAlign(StrideLVal->getType()));
10078 }
10079 // args[I] = &dims
10080 Builder.restoreIP(CodeGenIP);
10081 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
10082 DimsAddr, Builder.getPtrTy());
10083 Value *P = Builder.CreateConstInBoundsGEP2_32(
10084 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
10085 Info.RTArgs.PointersArray, 0, I);
10086 Builder.CreateAlignedStore(
10087 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
10088 ++L;
10089 }
10090}
10091
10092void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
10093 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
10094 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
10095 BasicBlock *ExitBB, bool IsInit) {
10096 StringRef Prefix = IsInit ? ".init" : ".del";
10097
10098 // Evaluate if this is an array section.
10099 BasicBlock *BodyBB = BasicBlock::Create(
10100 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
10101 Value *IsArray =
10102 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
10103 Value *DeleteBit = Builder.CreateAnd(
10104 MapType,
10105 Builder.getInt64(
10106 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10107 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
10108 Value *DeleteCond;
10109 Value *Cond;
10110 if (IsInit) {
10111 // base != begin?
10112 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
10113 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
10114 DeleteCond = Builder.CreateIsNull(
10115 DeleteBit,
10116 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
10117 } else {
10118 Cond = IsArray;
10119 DeleteCond = Builder.CreateIsNotNull(
10120 DeleteBit,
10121 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
10122 }
10123 Cond = Builder.CreateAnd(Cond, DeleteCond);
10124 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
10125
10126 emitBlock(BodyBB, MapperFn);
10127 // Get the array size by multiplying element size and element number (i.e., \p
10128 // Size).
10129 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
10130 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
10131 // memory allocation/deletion purpose only.
10132 Value *MapTypeArg = Builder.CreateAnd(
10133 MapType,
10134 Builder.getInt64(
10135 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10136 OpenMPOffloadMappingFlags::OMP_MAP_TO |
10137 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10138 MapTypeArg = Builder.CreateOr(
10139 MapTypeArg,
10140 Builder.getInt64(
10141 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10142 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
10143
10144 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
10145 // data structure.
10146 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
10147 ArraySize, MapTypeArg, MapName};
10148 createRuntimeFunctionCall(
10149 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
10150 OffloadingArgs);
10151}
10152
10153Expected<Function *> OpenMPIRBuilder::emitUserDefinedMapper(
10154 function_ref<MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
10155 llvm::Value *BeginArg)>
10156 GenMapInfoCB,
10157 Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
10158 SmallVector<Type *> Params;
10159 Params.emplace_back(Builder.getPtrTy());
10160 Params.emplace_back(Builder.getPtrTy());
10161 Params.emplace_back(Builder.getPtrTy());
10162 Params.emplace_back(Builder.getInt64Ty());
10163 Params.emplace_back(Builder.getInt64Ty());
10164 Params.emplace_back(Builder.getPtrTy());
10165
10166 auto *FnTy =
10167 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
10168
10169 SmallString<64> TyStr;
10170 raw_svector_ostream Out(TyStr);
10171 Function *MapperFn =
10172 Function::Create(FnTy, GlobalValue::InternalLinkage, FuncName, &M);
10173 MapperFn->addFnAttr(Attribute::NoInline);
10174 MapperFn->addFnAttr(Attribute::NoUnwind);
10175 MapperFn->addParamAttr(0, Attribute::NoUndef);
10176 MapperFn->addParamAttr(1, Attribute::NoUndef);
10177 MapperFn->addParamAttr(2, Attribute::NoUndef);
10178 MapperFn->addParamAttr(3, Attribute::NoUndef);
10179 MapperFn->addParamAttr(4, Attribute::NoUndef);
10180 MapperFn->addParamAttr(5, Attribute::NoUndef);
10181
10182 // Start the mapper function code generation.
10183 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
10184 auto SavedIP = Builder.saveIP();
10185 Builder.SetInsertPoint(EntryBB);
10186
10187 Value *MapperHandle = MapperFn->getArg(0);
10188 Value *BaseIn = MapperFn->getArg(1);
10189 Value *BeginIn = MapperFn->getArg(2);
10190 Value *Size = MapperFn->getArg(3);
10191 Value *MapType = MapperFn->getArg(4);
10192 Value *MapName = MapperFn->getArg(5);
10193
10194 // Compute the starting and end addresses of array elements.
10195 // Prepare common arguments for array initialization and deletion.
10196 // Convert the size in bytes into the number of array elements.
10197 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
10198 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
10199 Value *PtrBegin = BeginIn;
10200 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
10201
10202 // Emit array initialization if this is an array section and \p MapType
10203 // indicates that memory allocation is required.
10204 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
10205 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
10206 MapType, MapName, ElementSize, HeadBB,
10207 /*IsInit=*/true);
10208
10209 // Emit a for loop to iterate through SizeArg elements and map all of them.
10210
10211 // Emit the loop header block.
10212 emitBlock(HeadBB, MapperFn);
10213 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
10214 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
10215 // Evaluate whether the initial condition is satisfied.
10216 Value *IsEmpty =
10217 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
10218 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
10219
10220 // Emit the loop body block.
10221 emitBlock(BodyBB, MapperFn);
10222 BasicBlock *LastBB = BodyBB;
10223 PHINode *PtrPHI =
10224 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
10225 PtrPHI->addIncoming(PtrBegin, HeadBB);
10226
10227 // Get map clause information. Fill up the arrays with all mapped variables.
10228 MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
10229 if (!Info)
10230 return Info.takeError();
10231
10232 // Call the runtime API __tgt_mapper_num_components to get the number of
10233 // pre-existing components.
10234 Value *OffloadingArgs[] = {MapperHandle};
10235 Value *PreviousSize = createRuntimeFunctionCall(
10236 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
10237 OffloadingArgs);
10238 Value *ShiftedPreviousSize =
10239 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
10240
10241 // Fill up the runtime mapper handle for all components.
10242 for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
10243 Value *CurBaseArg = Info->BasePointers[I];
10244 Value *CurBeginArg = Info->Pointers[I];
10245 Value *CurSizeArg = Info->Sizes[I];
10246 Value *CurNameArg = Info->Names.size()
10247 ? Info->Names[I]
10248 : Constant::getNullValue(Builder.getPtrTy());
10249
10250 // Extract the MEMBER_OF field from the map type.
10251 Value *OriMapType = Builder.getInt64(
10252 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10253 Info->Types[I]));
10254 Value *MemberMapType =
10255 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
10256
10257 // Combine the map type inherited from user-defined mapper with that
10258 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
10259 // bits of the \a MapType, which is the input argument of the mapper
10260 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
10261 // bits of MemberMapType.
10262 // [OpenMP 5.0], 1.2.6. map-type decay.
10263 // | alloc | to | from | tofrom | release | delete
10264 // ----------------------------------------------------------
10265 // alloc | alloc | alloc | alloc | alloc | release | delete
10266 // to | alloc | to | alloc | to | release | delete
10267 // from | alloc | alloc | from | from | release | delete
10268 // tofrom | alloc | to | from | tofrom | release | delete
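// For instance, a member declared `to` in the mapper that is reached from a
// construct mapping `from` decays to `alloc` (row `to`, column `from`).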
10269 Value *LeftToFrom = Builder.CreateAnd(
10270 MapType,
10271 Builder.getInt64(
10272 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10273 OpenMPOffloadMappingFlags::OMP_MAP_TO |
10274 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10275 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
10276 BasicBlock *AllocElseBB =
10277 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
10278 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
10279 BasicBlock *ToElseBB =
10280 BasicBlock::Create(M.getContext(), "omp.type.to.else");
10281 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
10282 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
10283 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
10284 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
10285 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
10286 emitBlock(AllocBB, MapperFn);
10287 Value *AllocMapType = Builder.CreateAnd(
10288 MemberMapType,
10289 Builder.getInt64(
10290 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10291 OpenMPOffloadMappingFlags::OMP_MAP_TO |
10292 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10293 Builder.CreateBr(EndBB);
10294 emitBlock(AllocElseBB, MapperFn);
10295 Value *IsTo = Builder.CreateICmpEQ(
10296 LeftToFrom,
10297 Builder.getInt64(
10298 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10299 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
10300 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
10301 // In case of to, clear OMP_MAP_FROM.
10302 emitBlock(ToBB, MapperFn);
10303 Value *ToMapType = Builder.CreateAnd(
10304 MemberMapType,
10305 Builder.getInt64(
10306 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10307 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10308 Builder.CreateBr(EndBB);
10309 emitBlock(ToElseBB, MapperFn);
10310 Value *IsFrom = Builder.CreateICmpEQ(
10311 LeftToFrom,
10312 Builder.getInt64(
10313 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10314 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10315 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
10316 // In case of from, clear OMP_MAP_TO.
10317 emitBlock(FromBB, MapperFn);
10318 Value *FromMapType = Builder.CreateAnd(
10319 MemberMapType,
10320 Builder.getInt64(
10321 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10322 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
10323 // In case of tofrom, do nothing.
10324 emitBlock(EndBB, MapperFn);
10325 LastBB = EndBB;
10326 PHINode *CurMapType =
10327 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
10328 CurMapType->addIncoming(AllocMapType, AllocBB);
10329 CurMapType->addIncoming(ToMapType, ToBB);
10330 CurMapType->addIncoming(FromMapType, FromBB);
10331 CurMapType->addIncoming(MemberMapType, ToElseBB);
10332
10333 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
10334 CurSizeArg, CurMapType, CurNameArg};
10335
10336 auto ChildMapperFn = CustomMapperCB(I);
10337 if (!ChildMapperFn)
10338 return ChildMapperFn.takeError();
10339 if (*ChildMapperFn) {
10340 // Call the corresponding mapper function.
10341 createRuntimeFunctionCall(*ChildMapperFn, OffloadingArgs)
10342 ->setDoesNotThrow();
10343 } else {
10344 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
10345 // data structure.
10346 createRuntimeFunctionCall(
10347 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
10348 OffloadingArgs);
10349 }
10350 }
10351
10352 // Update the pointer to point to the next element that needs to be mapped,
10353 // and check whether we have mapped all elements.
10354 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
10355 "omp.arraymap.next");
10356 PtrPHI->addIncoming(PtrNext, LastBB);
10357 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
10358 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
10359 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
10360
10361 emitBlock(ExitBB, MapperFn);
10362 // Emit array deletion if this is an array section and \p MapType indicates
10363 // that deletion is required.
10364 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
10365 MapType, MapName, ElementSize, DoneBB,
10366 /*IsInit=*/false);
10367
10368 // Emit the function exit block.
10369 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
10370
10371 Builder.CreateRetVoid();
10372 Builder.restoreIP(SavedIP);
10373 return MapperFn;
10374}
10375
10376 Error OpenMPIRBuilder::emitOffloadingArrays(
10377 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
10378 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
10379 bool IsNonContiguous,
10380 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
10381
10382 // Reset the array information.
10383 Info.clearArrayInfo();
10384 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
10385
10386 if (Info.NumberOfPtrs == 0)
10387 return Error::success();
10388
10389 Builder.restoreIP(AllocaIP);
10390 // Detect whether any mapped size requires runtime evaluation; if none
10391 // does, a constant array can eventually be used for the map sizes.
10392 ArrayType *PointerArrayType =
10393 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
10394
10395 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
10396 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
10397
10398 Info.RTArgs.PointersArray = Builder.CreateAlloca(
10399 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
10400 AllocaInst *MappersArray = Builder.CreateAlloca(
10401 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
10402 Info.RTArgs.MappersArray = MappersArray;
10403
10404 // If we don't have any VLA types or other types that require runtime
10405 // evaluation, we can use a constant array for the map sizes, otherwise we
10406 // need to fill up the arrays as we do for the pointers.
10407 Type *Int64Ty = Builder.getInt64Ty();
10408 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
10409 ConstantInt::get(Int64Ty, 0));
10410 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
10411 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
10412 bool IsNonContigEntry =
10413 IsNonContiguous &&
10414 (static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10415 CombinedInfo.Types[I] &
10416 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG) != 0);
10417 // For NON_CONTIG entries, ArgSizes stores the dimension count (number of
10418 // descriptor_dim records), not the byte size.
10419 if (IsNonContigEntry) {
10420 assert(I < CombinedInfo.NonContigInfo.Dims.size() &&
10421 "Index must be in-bounds for NON_CONTIG Dims array");
10422 const uint64_t DimCount = CombinedInfo.NonContigInfo.Dims[I];
10423 assert(DimCount > 0 && "NON_CONTIG DimCount must be > 0");
10424 ConstSizes[I] = ConstantInt::get(Int64Ty, DimCount);
10425 continue;
10426 }
10427 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
10428 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
10429 ConstSizes[I] = CI;
10430 continue;
10431 }
10432 }
10433 RuntimeSizes.set(I);
10434 }
10435
10436 if (RuntimeSizes.all()) {
10437 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
10438 Info.RTArgs.SizesArray = Builder.CreateAlloca(
10439 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
10440 restoreIPandDebugLoc(Builder, CodeGenIP);
10441 } else {
10442 auto *SizesArrayInit = ConstantArray::get(
10443 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
10444 std::string Name = createPlatformSpecificName({"offload_sizes"});
10445 auto *SizesArrayGbl =
10446 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
10447 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
10448 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
10449
10450 if (!RuntimeSizes.any()) {
10451 Info.RTArgs.SizesArray = SizesArrayGbl;
10452 } else {
10453 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
10454 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
10455 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
10456 AllocaInst *Buffer = Builder.CreateAlloca(
10457 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
10458 Buffer->setAlignment(OffloadSizeAlign);
10459 restoreIPandDebugLoc(Builder, CodeGenIP);
10460 Builder.CreateMemCpy(
10461 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
10462 SizesArrayGbl, OffloadSizeAlign,
10463 Builder.getIntN(
10464 IndexSize,
10465 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
10466
10467 Info.RTArgs.SizesArray = Buffer;
10468 }
10469 restoreIPandDebugLoc(Builder, CodeGenIP);
10470 }
10471
10472 // The map types are always constant so we don't need to generate code to
10473 // fill arrays. Instead, we create an array constant.
10474 SmallVector<uint64_t, 4> Mapping;
10475 for (auto mapFlag : CombinedInfo.Types)
10476 Mapping.push_back(
10477 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10478 mapFlag));
10479 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
10480 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
10481 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
10482
10483 // The information types are only built if provided.
10484 if (!CombinedInfo.Names.empty()) {
10485 auto *MapNamesArrayGbl = createOffloadMapnames(
10486 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
10487 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
10488 Info.EmitDebug = true;
10489 } else {
10490 Info.RTArgs.MapNamesArray =
10491 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
10492 Info.EmitDebug = false;
10493 }
10494
10495 // If there's a present map type modifier, it must not be applied to the end
10496 // of a region, so generate a separate map type array in that case.
10497 if (Info.separateBeginEndCalls()) {
10498 bool EndMapTypesDiffer = false;
10499 for (uint64_t &Type : Mapping) {
10500 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10501 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
10502 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10503 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
10504 EndMapTypesDiffer = true;
10505 }
10506 }
10507 if (EndMapTypesDiffer) {
10508 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
10509 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
10510 }
10511 }
10512
10513 PointerType *PtrTy = Builder.getPtrTy();
10514 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
10515 Value *BPVal = CombinedInfo.BasePointers[I];
10516 Value *BP = Builder.CreateConstInBoundsGEP2_32(
10517 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
10518 0, I);
10519 Builder.CreateAlignedStore(BPVal, BP,
10520 M.getDataLayout().getPrefTypeAlign(PtrTy));
10521
10522 if (Info.requiresDevicePointerInfo()) {
10523 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
10524 CodeGenIP = Builder.saveIP();
10525 Builder.restoreIP(AllocaIP);
10526 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
10527 Builder.restoreIP(CodeGenIP);
10528 if (DeviceAddrCB)
10529 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
10530 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
10531 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
10532 if (DeviceAddrCB)
10533 DeviceAddrCB(I, BP);
10534 }
10535 }
10536
10537 Value *PVal = CombinedInfo.Pointers[I];
10538 Value *P = Builder.CreateConstInBoundsGEP2_32(
10539 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
10540 I);
10541 // TODO: Check that the alignment is correct.
10542 Builder.CreateAlignedStore(PVal, P,
10543 M.getDataLayout().getPrefTypeAlign(PtrTy));
10544
10545 if (RuntimeSizes.test(I)) {
10546 Value *S = Builder.CreateConstInBoundsGEP2_32(
10547 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
10548 /*Idx0=*/0,
10549 /*Idx1=*/I);
10550 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
10551 Int64Ty,
10552 /*isSigned=*/true),
10553 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
10554 }
10555 // Fill up the mapper array.
10556 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
10557 Value *MFunc = ConstantPointerNull::get(PtrTy);
10558
10559 auto CustomMFunc = CustomMapperCB(I);
10560 if (!CustomMFunc)
10561 return CustomMFunc.takeError();
10562 if (*CustomMFunc)
10563 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
10564
10565 Value *MAddr = Builder.CreateInBoundsGEP(
10566 PointerArrayType, MappersArray,
10567 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
10568 Builder.CreateAlignedStore(
10569 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
10570 }
10571
10572 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
10573 Info.NumberOfPtrs == 0)
10574 return Error::success();
10575 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
10576 return Error::success();
10577}
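// Illustrative example (hypothetical user code): a clause list such as
// ```
// #pragma omp target data map(to : a[0 : n]) map(from : b[0 : n])
// ```
// yields two entries in CombinedInfo, so .offload_baseptrs, .offload_ptrs and
// .offload_mappers are each [2 x ptr] allocas; when both sizes are
// compile-time constants they fold into the @.offload_sizes global instead of
// a runtime-filled buffer.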
10578
10579 void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
10580 BasicBlock *CurBB = Builder.GetInsertBlock();
10581
10582 if (!CurBB || CurBB->hasTerminator()) {
10583 // If there is no insert point or the previous block is already
10584 // terminated, don't touch it.
10585 } else {
10586 // Otherwise, create a fall-through branch.
10587 Builder.CreateBr(Target);
10588 }
10589
10590 Builder.ClearInsertionPoint();
10591}
10592
10593 void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
10594 bool IsFinished) {
10595 BasicBlock *CurBB = Builder.GetInsertBlock();
10596
10597 // Fall out of the current block (if necessary).
10598 emitBranch(BB);
10599
10600 if (IsFinished && BB->use_empty()) {
10601 BB->eraseFromParent();
10602 return;
10603 }
10604
10605 // Place the block after the current block, if possible, or else at
10606 // the end of the function.
10607 if (CurBB && CurBB->getParent())
10608 CurFn->insert(std::next(CurBB->getIterator()), BB);
10609 else
10610 CurFn->insert(CurFn->end(), BB);
10611 Builder.SetInsertPoint(BB);
10612}
10613
10614 Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
10615 BodyGenCallbackTy ElseGen,
10616 InsertPointTy AllocaIP,
10617 ArrayRef<BasicBlock *> DeallocBlocks) {
10618 // If the condition constant folds and can be elided, try to avoid emitting
10619 // the condition and the dead arm of the if/else.
10620 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
10621 auto CondConstant = CI->getSExtValue();
10622 if (CondConstant)
10623 return ThenGen(AllocaIP, Builder.saveIP(), DeallocBlocks);
10624
10625 return ElseGen(AllocaIP, Builder.saveIP(), DeallocBlocks);
10626 }
10627
10628 Function *CurFn = Builder.GetInsertBlock()->getParent();
10629
10630 // Otherwise, the condition did not fold, or we couldn't elide it. Just
10631 // emit the conditional branch.
10632 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
10633 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
10634 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
10635 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
10636 // Emit the 'then' code.
10637 emitBlock(ThenBlock, CurFn);
10638 if (Error Err = ThenGen(AllocaIP, Builder.saveIP(), DeallocBlocks))
10639 return Err;
10640 emitBranch(ContBlock);
10641 // Emit the 'else' code if present.
10642 // There is no need to emit line number for unconditional branch.
10643 emitBlock(ElseBlock, CurFn);
10644 if (Error Err = ElseGen(AllocaIP, Builder.saveIP(), DeallocBlocks))
10645 return Err;
10646 // There is no need to emit line number for unconditional branch.
10647 emitBranch(ContBlock);
10648 // Emit the continuation block for code after the if.
10649 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
10650 return Error::success();
10651}
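// A minimal sketch of the shape this helper produces (hypothetical source;
// here the if clause of a data-mapping construct):
// ```
// #pragma omp target data map(tofrom : a) if (c)
// { /* ThenGen arm when 'c' holds, ElseGen arm otherwise */ }
// ```
// When 'c' folds to a constant only the live arm is emitted; otherwise the
// omp_if.then/omp_if.else/omp_if.end diamond above is generated.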
10652
10653bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
10654 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
10657 "Unexpected Atomic Ordering.");
10658
10659 bool Flush = false;
10660 AtomicOrdering FlushAO = AtomicOrdering::Monotonic;
10661
10662 switch (AK) {
10663 case Read:
10664 if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
10665 AO == AtomicOrdering::SequentiallyConsistent) {
10666 FlushAO = AtomicOrdering::Acquire;
10667 Flush = true;
10668 }
10669 break;
10670 case Write:
10671 case Compare:
10672 case Update:
10673 if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
10674 AO == AtomicOrdering::SequentiallyConsistent) {
10675 FlushAO = AtomicOrdering::Release;
10676 Flush = true;
10677 }
10678 break;
10679 case Capture:
10680 switch (AO) {
10681 case AtomicOrdering::Acquire:
10682 FlushAO = AtomicOrdering::Acquire;
10683 Flush = true;
10684 break;
10685 case AtomicOrdering::Release:
10686 FlushAO = AtomicOrdering::Release;
10687 Flush = true;
10688 break;
10689 case AtomicOrdering::AcquireRelease:
10690 case AtomicOrdering::SequentiallyConsistent:
10691 FlushAO = AtomicOrdering::AcquireRelease;
10692 Flush = true;
10693 break;
10694 default:
10695 // do nothing - leave silently.
10696 break;
10697 }
10698 }
10699
10700 if (Flush) {
10701 // The flush runtime call does not yet take a memory ordering. We still
10702 // resolve which ordering the flush would use (FlushAO) so it can be
10703 // passed along once the runtime supports it; for now we only issue the flush.
10704 // TODO: pass `FlushAO` after memory ordering support is added
10705 (void)FlushAO;
10706 emitFlush(Loc);
10707 }
10708
10709 // For AO == AtomicOrdering::Monotonic and all other combinations, no
10710 // flush is needed.
10711 return Flush;
10712}
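// As an illustrative example (hypothetical source construct):
// ```
// #pragma omp atomic read seq_cst
// v = x;
// ```
// reaches this helper with AK == Read and AO == SequentiallyConsistent, so
// FlushAO resolves to Acquire and a flush is emitted; a relaxed (Monotonic)
// atomic emits none.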
10713
10714 OpenMPIRBuilder::InsertPointTy
10715 OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
10716 AtomicOpValue &X, AtomicOpValue &V,
10717 AtomicOrdering AO, InsertPointTy AllocaIP) {
10718 if (!updateToLocation(Loc))
10719 return Loc.IP;
10720
10721 assert(X.Var->getType()->isPointerTy() &&
10722 "OMP Atomic expects a pointer to target memory");
10723 Type *XElemTy = X.ElemTy;
10724 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10725 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10726 "OMP atomic read expected a scalar type");
10727
10728 Value *XRead = nullptr;
10729
10730 if (XElemTy->isIntegerTy()) {
10731 LoadInst *XLD =
10732 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
10733 XLD->setAtomic(AO);
10734 XRead = cast<Value>(XLD);
10735 } else if (XElemTy->isStructTy()) {
10736 // FIXME: Add checks to ensure __atomic_load is emitted iff the
10737 // target does not support `atomicrmw` of the size of the struct
10738 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10739 OldVal->setAtomic(AO);
10740 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10741 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10742 OpenMPIRBuilder::AtomicInfo atomicInfo(
10743 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10744 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10745 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
10746 XRead = AtomicLoadRes.first;
10747 OldVal->eraseFromParent();
10748 } else {
10749 // We need to perform the atomic op as an integer.
10750 IntegerType *IntCastTy =
10751 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10752 LoadInst *XLoad =
10753 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
10754 XLoad->setAtomic(AO);
10755 if (XElemTy->isFloatingPointTy()) {
10756 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
10757 } else {
10758 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
10759 }
10760 }
10761 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
10762 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
10763 return Builder.saveIP();
10764}
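// A minimal sketch of the source form this lowers (hypothetical C code):
// ```
// #pragma omp atomic read
// v = x; // X is &x, V is &v; the load of 'x' becomes an atomic load.
// ```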
10765
10766 OpenMPIRBuilder::InsertPointTy
10767 OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
10768 AtomicOpValue &X, Value *Expr,
10769 AtomicOrdering AO, InsertPointTy AllocaIP) {
10770 if (!updateToLocation(Loc))
10771 return Loc.IP;
10772
10773 assert(X.Var->getType()->isPointerTy() &&
10774 "OMP Atomic expects a pointer to target memory");
10775 Type *XElemTy = X.ElemTy;
10776 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10777 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10778 "OMP atomic write expected a scalar type");
10779
10780 if (XElemTy->isIntegerTy()) {
10781 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
10782 XSt->setAtomic(AO);
10783 } else if (XElemTy->isStructTy()) {
10784 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10785 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10786 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10787 OpenMPIRBuilder::AtomicInfo atomicInfo(
10788 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10789 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10790 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
10791 OldVal->eraseFromParent();
10792 } else {
10793 // We need to bitcast and perform the atomic op as an integer.
10794 IntegerType *IntCastTy =
10795 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10796 Value *ExprCast =
10797 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
10798 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
10799 XSt->setAtomic(AO);
10800 }
10801
10802 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
10803 return Builder.saveIP();
10804}
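// A minimal sketch of the source form this lowers (hypothetical C code):
// ```
// #pragma omp atomic write
// x = expr; // The store to 'x' becomes an atomic store.
// ```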
10805
10806 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicUpdate(
10807 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
10808 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
10809 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
10810 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10811 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
10812 if (!updateToLocation(Loc))
10813 return Loc.IP;
10814
10815 LLVM_DEBUG({
10816 Type *XTy = X.Var->getType();
10817 assert(XTy->isPointerTy() &&
10818 "OMP Atomic expects a pointer to target memory");
10819 Type *XElemTy = X.ElemTy;
10820 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10821 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10822 "OMP atomic update expected a scalar or struct type");
10823 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10824 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
10825 "OpenMP atomic does not support LT or GT operations");
10826 });
10827
10828 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10829 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
10830 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10831 if (!AtomicResult)
10832 return AtomicResult.takeError();
10833 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
10834 return Builder.saveIP();
10835}
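// A minimal sketch of the source form this lowers (hypothetical C code):
// ```
// #pragma omp atomic update
// x += expr; // RMWOp is Add; integer types lower to a single atomicrmw.
// ```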
10836
10837// FIXME: Duplicating AtomicExpand
10838Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
10839 AtomicRMWInst::BinOp RMWOp) {
10840 switch (RMWOp) {
10841 case AtomicRMWInst::Add:
10842 return Builder.CreateAdd(Src1, Src2);
10843 case AtomicRMWInst::Sub:
10844 return Builder.CreateSub(Src1, Src2);
10845 case AtomicRMWInst::And:
10846 return Builder.CreateAnd(Src1, Src2);
10847 case AtomicRMWInst::Nand:
10848 return Builder.CreateNot(Builder.CreateAnd(Src1, Src2));
10849 case AtomicRMWInst::Or:
10850 return Builder.CreateOr(Src1, Src2);
10851 case AtomicRMWInst::Xor:
10852 return Builder.CreateXor(Src1, Src2);
10853 case AtomicRMWInst::Xchg:
10854 case AtomicRMWInst::FAdd:
10855 case AtomicRMWInst::FSub:
10856 case AtomicRMWInst::BAD_BINOP:
10857 case AtomicRMWInst::Max:
10858 case AtomicRMWInst::Min:
10859 case AtomicRMWInst::UMax:
10860 case AtomicRMWInst::UMin:
10861 case AtomicRMWInst::FMax:
10862 case AtomicRMWInst::FMin:
10863 case AtomicRMWInst::FMaximum:
10864 case AtomicRMWInst::FMinimum:
10865 case AtomicRMWInst::UIncWrap:
10866 case AtomicRMWInst::UDecWrap:
10867 case AtomicRMWInst::USubCond:
10868 case AtomicRMWInst::USubSat:
10871 llvm_unreachable("Unsupported atomic update operation");
10872 }
10873 llvm_unreachable("Unsupported atomic update operation");
10874}
10875
10876Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
10877 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
10879 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
10880 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10881 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2.
10882 bool emitRMWOp = false;
10883 switch (RMWOp) {
10884 case AtomicRMWInst::Add:
10885 case AtomicRMWInst::And:
10887 case AtomicRMWInst::Or:
10888 case AtomicRMWInst::Xor:
10890 emitRMWOp = XElemTy;
10891 break;
10892 case AtomicRMWInst::Sub:
10893 emitRMWOp = (IsXBinopExpr && XElemTy);
10894 break;
10895 default:
10896 emitRMWOp = false;
10897 }
10898 emitRMWOp &= XElemTy->isIntegerTy();
10899
10900 std::pair<Value *, Value *> Res;
10901 if (emitRMWOp) {
10902 AtomicRMWInst *RMWInst =
10903 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
10904 if (T.isAMDGPU()) {
10905 if (IsIgnoreDenormalMode)
10906 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
10907 llvm::MDNode::get(Builder.getContext(), {}));
10908 if (!IsFineGrainedMemory)
10909 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
10910 llvm::MDNode::get(Builder.getContext(), {}));
10911 if (!IsRemoteMemory)
10912 RMWInst->setMetadata("amdgpu.no.remote.memory",
10913 llvm::MDNode::get(Builder.getContext(), {}));
10914 }
10915 Res.first = RMWInst;
10916 // Not needed except for postfix captures, but generate it anyway for
10917 // consistency with the else branch; any DCE pass will remove it.
10918 // AtomicRMWInst::Xchg does not have a corresponding instruction.
10919 if (RMWOp == AtomicRMWInst::Xchg)
10920 Res.second = Res.first;
10921 else
10922 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
10923 } else if (XElemTy->isStructTy()) {
10924 LoadInst *OldVal =
10925 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
10926 OldVal->setAtomic(AO);
10927 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
10928 unsigned LoadSize = LoadDL.getTypeStoreSize(XElemTy);
10929
10930 OpenMPIRBuilder::AtomicInfo atomicInfo(
10931 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10932 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
10933 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
10934 BasicBlock *CurBB = Builder.GetInsertBlock();
10935 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
10936 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10937 BasicBlock *ExitBB =
10938 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
10939 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
10940 X->getName() + ".atomic.cont");
10941 ContBB->getTerminator()->eraseFromParent();
10942 Builder.restoreIP(AllocaIP);
10943 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
10944 NewAtomicAddr->setName(X->getName() + "x.new.val");
10945 Builder.SetInsertPoint(ContBB);
10946 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
10947 PHI->addIncoming(AtomicLoadRes.first, CurBB);
10948 Value *OldExprVal = PHI;
10949 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
10950 if (!CBResult)
10951 return CBResult.takeError();
10952 Value *Upd = *CBResult;
10953 Builder.CreateStore(Upd, NewAtomicAddr);
10954 AtomicOrdering Failure =
10955 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
10956 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
10957 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
10958 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
10959 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
10960 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
10961 OldVal->eraseFromParent();
10962 Res.first = OldExprVal;
10963 Res.second = Upd;
10964
10965 if (UnreachableInst *ExitTI =
10967 CurBBTI->eraseFromParent();
10968 Builder.SetInsertPoint(ExitBB);
10969 } else {
10970 Builder.SetInsertPoint(ExitTI);
10971 }
10972 } else {
10973 IntegerType *IntCastTy =
10974 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10975 LoadInst *OldVal =
10976 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
10977 OldVal->setAtomic(AO);
10978 // CurBB
10979 // | /---\
10980 // ContBB |
10981 // | \---/
10982 // ExitBB
10983 BasicBlock *CurBB = Builder.GetInsertBlock();
10984 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
10985 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10986 BasicBlock *ExitBB =
10987 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
10988 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
10989 X->getName() + ".atomic.cont");
10990 ContBB->getTerminator()->eraseFromParent();
10991 Builder.restoreIP(AllocaIP);
10992 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
10993 NewAtomicAddr->setName(X->getName() + "x.new.val");
10994 Builder.SetInsertPoint(ContBB);
10995 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
10996 PHI->addIncoming(OldVal, CurBB);
10997 bool IsIntTy = XElemTy->isIntegerTy();
10998 Value *OldExprVal = PHI;
10999 if (!IsIntTy) {
11000 if (XElemTy->isFloatingPointTy()) {
11001 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
11002 X->getName() + ".atomic.fltCast");
11003 } else {
11004 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
11005 X->getName() + ".atomic.ptrCast");
11006 }
11007 }
11008
11009 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
11010 if (!CBResult)
11011 return CBResult.takeError();
11012 Value *Upd = *CBResult;
11013 Builder.CreateStore(Upd, NewAtomicAddr);
11014 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
11015 AtomicOrdering Failure =
11016 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
11017 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
11018 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
11019 Result->setVolatile(VolatileX);
11020 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
11021 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
11022 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
11023 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
11024
11025 Res.first = OldExprVal;
11026 Res.second = Upd;
11027
11028 // Set the insertion point in the exit block.
11029 if (UnreachableInst *ExitTI =
11031 CurBBTI->eraseFromParent();
11032 Builder.SetInsertPoint(ExitBB);
11033 } else {
11034 Builder.SetInsertPoint(ExitTI);
11035 }
11036 }
11037
11038 return Res;
11039}
11040
11041 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCapture(
11042 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
11043 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
11044 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
11045 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
11046 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
11047 if (!updateToLocation(Loc))
11048 return Loc.IP;
11049
11050 LLVM_DEBUG({
11051 Type *XTy = X.Var->getType();
11052 assert(XTy->isPointerTy() &&
11053 "OMP Atomic expects a pointer to target memory");
11054 Type *XElemTy = X.ElemTy;
11055 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
11056 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
11057 "OMP atomic capture expected a scalar or struct type");
11058 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
11059 "OpenMP atomic does not support LT or GT operations");
11060 });
11061
11062 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
11063 // 'x' is simply atomically rewritten with 'expr'.
11064 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
11065 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
11066 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
11067 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
11068 if (!AtomicResult)
11069 return AtomicResult.takeError();
11070 Value *CapturedVal =
11071 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
11072 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
11073
11074 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
11075 return Builder.saveIP();
11076}
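// A minimal sketch of the two capture forms (hypothetical C code):
// ```
// #pragma omp atomic capture
// v = x++; // IsPostfixUpdate: 'v' receives the old value of 'x'.
// #pragma omp atomic capture
// v = ++x; // Otherwise 'v' receives the updated value.
// ```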
11077
11078 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
11079 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
11080 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
11081 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
11082 bool IsFailOnly) {
11083
11084 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
11085 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
11086 IsPostfixUpdate, IsFailOnly, Failure);
11087}
11088
11089 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
11090 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
11091 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
11092 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
11093 bool IsFailOnly, AtomicOrdering Failure) {
11094
11095 if (!updateToLocation(Loc))
11096 return Loc.IP;
11097
11098 assert(X.Var->getType()->isPointerTy() &&
11099 "OMP atomic expects a pointer to target memory");
11100 // compare capture
11101 if (V.Var) {
11102 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
11103 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
11104 }
11105
11106 bool IsInteger = E->getType()->isIntegerTy();
11107
11108 if (Op == OMPAtomicCompareOp::EQ) {
11109 AtomicCmpXchgInst *Result = nullptr;
11110 if (!IsInteger) {
11111 IntegerType *IntCastTy =
11112 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
11113 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
11114 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
11115 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
11116 AO, Failure);
11117 } else {
11118 Result =
11119 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
11120 }
11121
11122 if (V.Var) {
11123 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
11124 if (!IsInteger)
11125 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
11126 assert(OldValue->getType() == V.ElemTy &&
11127 "OldValue and V must be of same type");
11128 if (IsPostfixUpdate) {
11129 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
11130 } else {
11131 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
11132 if (IsFailOnly) {
11133 // CurBB----
11134 // | |
11135 // v |
11136 // ContBB |
11137 // | |
11138 // v |
11139 // ExitBB <-
11140 //
11141 // where ContBB only contains the store of old value to 'v'.
11142 BasicBlock *CurBB = Builder.GetInsertBlock();
11143 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
11144 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
11145 BasicBlock *ExitBB = CurBB->splitBasicBlock(
11146 CurBBTI, X.Var->getName() + ".atomic.exit");
11147 BasicBlock *ContBB = CurBB->splitBasicBlock(
11148 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
11149 ContBB->getTerminator()->eraseFromParent();
11150 CurBB->getTerminator()->eraseFromParent();
11151
11152 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
11153
11154 Builder.SetInsertPoint(ContBB);
11155 Builder.CreateStore(OldValue, V.Var);
11156 Builder.CreateBr(ExitBB);
11157
11158 if (UnreachableInst *ExitTI =
11160 CurBBTI->eraseFromParent();
11161 Builder.SetInsertPoint(ExitBB);
11162 } else {
11163 Builder.SetInsertPoint(ExitTI);
11164 }
11165 } else {
11166 Value *CapturedValue =
11167 Builder.CreateSelect(SuccessOrFail, E, OldValue);
11168 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
11169 }
11170 }
11171 }
11172 // The comparison result has to be stored.
11173 if (R.Var) {
11174 assert(R.Var->getType()->isPointerTy() &&
11175 "r.var must be of pointer type");
11176 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
11177
11178 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
11179 Value *ResultCast = R.IsSigned
11180 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
11181 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
11182 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
11183 }
11184 } else {
11185 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
11186 "Op should be either max or min at this point");
11187 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
11188
11189 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
11190 // Let's take max as example.
11191 // OpenMP form:
11192 // x = x > expr ? expr : x;
11193 // LLVM form:
11194 // *ptr = *ptr > val ? *ptr : val;
11195 // We need to transform to LLVM form.
11196 // x = x <= expr ? x : expr;
11197 AtomicRMWInst::BinOp NewOp;
11198 if (IsXBinopExpr) {
11199 if (IsInteger) {
11200 if (X.IsSigned)
11201 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
11202 : AtomicRMWInst::Max;
11203 else
11204 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
11205 : AtomicRMWInst::UMax;
11206 } else {
11207 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
11208 : AtomicRMWInst::FMax;
11209 }
11210 } else {
11211 if (IsInteger) {
11212 if (X.IsSigned)
11213 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
11214 : AtomicRMWInst::Min;
11215 else
11216 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
11217 : AtomicRMWInst::UMin;
11218 } else {
11219 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
11220 : AtomicRMWInst::FMin;
11221 }
11222 }
11223
11224 AtomicRMWInst *OldValue =
11225 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
11226 if (V.Var) {
11227 Value *CapturedValue = nullptr;
11228 if (IsPostfixUpdate) {
11229 CapturedValue = OldValue;
11230 } else {
11231 CmpInst::Predicate Pred;
11232 switch (NewOp) {
11233 case AtomicRMWInst::Max:
11234 Pred = CmpInst::ICMP_SGT;
11235 break;
11236 case AtomicRMWInst::UMax:
11237 Pred = CmpInst::ICMP_UGT;
11238 break;
11239 case AtomicRMWInst::FMax:
11240 Pred = CmpInst::FCMP_OGT;
11241 break;
11242 case AtomicRMWInst::Min:
11243 Pred = CmpInst::ICMP_SLT;
11244 break;
11245 case AtomicRMWInst::UMin:
11246 Pred = CmpInst::ICMP_ULT;
11247 break;
11248 case AtomicRMWInst::FMin:
11249 Pred = CmpInst::FCMP_OLT;
11250 break;
11251 default:
11252 llvm_unreachable("unexpected comparison op");
11253 }
11254 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
11255 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
11256 }
11257 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
11258 }
11259 }
11260
11261 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
11262
11263 return Builder.saveIP();
11264}
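// A minimal sketch of a conditional-update form handled here (hypothetical
// C code):
// ```
// #pragma omp atomic compare
// x = x < e ? e : x; // max: atomically raise 'x' to at least 'e'.
// ```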
11265
11268 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
11269 Value *NumTeamsUpper, Value *ThreadLimit,
11270 Value *IfExpr) {
11271 if (!updateToLocation(Loc))
11272 return InsertPointTy();
11273
11274 uint32_t SrcLocStrSize;
11275 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
11276 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
11277 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
11278
11279 // The outer allocation basic block is the entry block of the current function.
11280 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
11281 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
11282 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
11283 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
11284 }
11285
11286 // The current basic block is split into four basic blocks. After outlining,
11287 // they will be mapped as follows:
11288 // ```
11289 // def current_fn() {
11290 // current_basic_block:
11291 // br label %teams.exit
11292 // teams.exit:
11293 // ; instructions after teams
11294 // }
11295 //
11296 // def outlined_fn() {
11297 // teams.alloca:
11298 // br label %teams.body
11299 // teams.body:
11300 // ; instructions within teams body
11301 // }
11302 // ```
11303 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
11304 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
11305 BasicBlock *AllocaBB =
11306 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
11307
11308 bool SubClausesPresent =
11309 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
11310 // Push num_teams
11311 if (!Config.isTargetDevice() && SubClausesPresent) {
11312 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
11313 "if lowerbound is non-null, then upperbound must also be non-null "
11314 "for bounds on num_teams");
11315
11316 if (NumTeamsUpper == nullptr)
11317 NumTeamsUpper = Builder.getInt32(0);
11318
11319 if (NumTeamsLower == nullptr)
11320 NumTeamsLower = NumTeamsUpper;
11321
11322 if (IfExpr) {
11323 assert(IfExpr->getType()->isIntegerTy() &&
11324 "argument to if clause must be an integer value");
11325
11326 // upper = ifexpr ? upper : 1
11327 if (IfExpr->getType() != Int1)
11328 IfExpr = Builder.CreateICmpNE(IfExpr,
11329 ConstantInt::get(IfExpr->getType(), 0));
11330 NumTeamsUpper = Builder.CreateSelect(
11331 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
11332
11333 // lower = ifexpr ? lower : 1
11334 NumTeamsLower = Builder.CreateSelect(
11335 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
11336 }
11337
11338 if (ThreadLimit == nullptr)
11339 ThreadLimit = Builder.getInt32(0);
11340
11341 // The __kmpc_push_num_teams_51 runtime function expects i32 arguments, so
11342 // truncate or sign-extend the passed values to match its parameters.
11343 Value *NumTeamsLowerInt32 =
11344 Builder.CreateSExtOrTrunc(NumTeamsLower, Builder.getInt32Ty());
11345 Value *NumTeamsUpperInt32 =
11346 Builder.CreateSExtOrTrunc(NumTeamsUpper, Builder.getInt32Ty());
11347 Value *ThreadLimitInt32 =
11348 Builder.CreateSExtOrTrunc(ThreadLimit, Builder.getInt32Ty());
11349
11350 Value *ThreadNum = getOrCreateThreadID(Ident);
11351
11353 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
11354 {Ident, ThreadNum, NumTeamsLowerInt32, NumTeamsUpperInt32,
11355 ThreadLimitInt32});
11356 }
11357 // Generate the body of teams.
11358 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
11359 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
11360 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP, ExitBB))
11361 return Err;
11362
11363 auto OI = std::make_unique<OutlineInfo>();
11364 OI->EntryBB = AllocaBB;
11365 OI->ExitBB = ExitBB;
11366 OI->OuterAllocBB = &OuterAllocaBB;
11367
11368 // Insert fake values for global tid and bound tid.
11369 SmallVector<Instruction *, 3> ToBeDeleted;
11370 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
11371 OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal(
11372 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
11373 OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal(
11374 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
11375
11376 auto HostPostOutlineCB = [this, Ident,
11377 ToBeDeleted](Function &OutlinedFn) mutable {
11378 // The stale call instruction will be replaced with a new call to the
11379 // runtime function, which takes the outlined function as an argument.
11380
11381 assert(OutlinedFn.hasOneUse() &&
11382 "there must be a single user for the outlined function");
11383 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
11384 ToBeDeleted.push_back(StaleCI);
11385
11386 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
11387 "Outlined function must have two or three arguments only");
11388
11389 bool HasShared = OutlinedFn.arg_size() == 3;
11390
11391 OutlinedFn.getArg(0)->setName("global.tid.ptr");
11392 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
11393 if (HasShared)
11394 OutlinedFn.getArg(2)->setName("data");
11395
11396 // Call to the runtime function for teams in the current function.
11397 assert(StaleCI && "Error while outlining - no CallInst user found for the "
11398 "outlined function.");
11399 Builder.SetInsertPoint(StaleCI);
11400 SmallVector<Value *> Args = {
11401 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
11402 if (HasShared)
11403 Args.push_back(StaleCI->getArgOperand(2));
11404
11405 createRuntimeFunctionCall(getOrCreateRuntimeFunctionPtr(
11406 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
11407 Args);
11408
11409 for (Instruction *I : llvm::reverse(ToBeDeleted))
11410 I->eraseFromParent();
11411 };
11412
11413 if (!Config.isTargetDevice())
11414 OI->PostOutlineCB = HostPostOutlineCB;
11415
11416 addOutlineInfo(std::move(OI));
11417
11418 Builder.SetInsertPoint(ExitBB);
11419
11420 return Builder.saveIP();
11421}
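// Illustrative usage (hypothetical source construct):
// ```
// #pragma omp teams num_teams(2 : 8) thread_limit(64) if (cond)
// { /* teams body */ }
// ```
// On the host this pushes the (possibly if-clamped) bounds via
// __kmpc_push_num_teams_51 and forks the outlined body via __kmpc_fork_teams.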
11422
11424 const LocationDescription &Loc, InsertPointTy OuterAllocIP,
11425 ArrayRef<BasicBlock *> OuterDeallocBlocks, BodyGenCallbackTy BodyGenCB) {
11426 if (!updateToLocation(Loc))
11427 return InsertPointTy();
11428
11429 BasicBlock *OuterAllocaBB = OuterAllocIP.getBlock();
11430
11431 if (OuterAllocaBB == Builder.GetInsertBlock()) {
11432 BasicBlock *BodyBB =
11433 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
11434 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
11435 }
11436 BasicBlock *ExitBB =
11437 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
11438 BasicBlock *BodyBB =
11439 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
11440 BasicBlock *AllocaBB =
11441 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
11442
11443 // Generate the body of the distribute construct.
11444 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
11445 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
11446 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP, ExitBB))
11447 return Err;
11448
11449 // When using target we use different runtime functions which require a
11450 // callback.
11451 if (Config.isTargetDevice()) {
11452 auto OI = std::make_unique<OutlineInfo>();
11453 OI->OuterAllocBB = OuterAllocIP.getBlock();
11454 OI->EntryBB = AllocaBB;
11455 OI->ExitBB = ExitBB;
11456 OI->OuterDeallocBBs.reserve(OuterDeallocBlocks.size());
11457 copy(OuterDeallocBlocks, std::back_inserter(OI->OuterDeallocBBs));
11458
11459 addOutlineInfo(std::move(OI));
11460 }
11461 Builder.SetInsertPoint(ExitBB);
11462
11463 return Builder.saveIP();
11464}
11465
11466 GlobalVariable *
11467 OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
11468 std::string VarName) {
11469 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
11470 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
11471 Names.size()),
11472 Names);
11473 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
11474 M, MapNamesArrayInit->getType(),
11475 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
11476 VarName);
11477 return MapNamesArrayGlobal;
11478}
11479
11480// Create all simple and struct types exposed by the runtime and remember
11481// the llvm::PointerTypes of them for easy access later.
11482void OpenMPIRBuilder::initializeTypes(Module &M) {
11483 LLVMContext &Ctx = M.getContext();
11484 StructType *T;
11485 unsigned DefaultTargetAS = Config.getDefaultTargetAS();
11486 unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace();
11487#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
11488#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
11489 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
11490 VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
11491#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
11492 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
11493 VarName##Ptr = PointerType::get(Ctx, ProgramAS);
11494#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
11495 T = StructType::getTypeByName(Ctx, StructName); \
11496 if (!T) \
11497 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
11498 VarName = T; \
11499 VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
11500#include "llvm/Frontend/OpenMP/OMPKinds.def"
11501}
11502
11505 SmallVectorImpl<BasicBlock *> &BlockVector) {
11507 BlockSet.insert(EntryBB);
11508 BlockSet.insert(ExitBB);
11509
11510 Worklist.push_back(EntryBB);
11511 while (!Worklist.empty()) {
11512 BasicBlock *BB = Worklist.pop_back_val();
11513 BlockVector.push_back(BB);
11514 for (BasicBlock *SuccBB : successors(BB))
11515 if (BlockSet.insert(SuccBB).second)
11516 Worklist.push_back(SuccBB);
11517 }
11518}
11519
11520std::unique_ptr<CodeExtractor>
11521 OutlineInfo::createCodeExtractor(ArrayRef<BasicBlock *> Blocks,
11522 bool ArgsInZeroAddressSpace,
11523 Twine Suffix) {
11524 return std::make_unique<CodeExtractor>(
11525 Blocks, /* DominatorTree */ nullptr,
11526 /* AggregateArgs */ true,
11527 /* BlockFrequencyInfo */ nullptr,
11528 /* BranchProbabilityInfo */ nullptr,
11529 /* AssumptionCache */ nullptr,
11530 /* AllowVarArgs */ true,
11531 /* AllowAlloca */ true,
11532 /* AllocationBlock*/ OuterAllocBB,
11533 /* DeallocationBlocks */ ArrayRef<BasicBlock *>(),
11534 /* Suffix */ Suffix.str(), ArgsInZeroAddressSpace);
11535}
11536
11537std::unique_ptr<CodeExtractor> DeviceSharedMemOutlineInfo::createCodeExtractor(
11538 ArrayRef<BasicBlock *> Blocks, bool ArgsInZeroAddressSpace, Twine Suffix) {
11539 return std::make_unique<DeviceSharedMemCodeExtractor>(
11540 OMPBuilder, Blocks, /* DominatorTree */ nullptr,
11541 /* AggregateArgs */ true,
11542 /* BlockFrequencyInfo */ nullptr,
11543 /* BranchProbabilityInfo */ nullptr,
11544 /* AssumptionCache */ nullptr,
11545 /* AllowVarArgs */ true,
11546 /* AllowAlloca */ true,
11547 /* AllocationBlock*/ OuterAllocBB,
11548 /* DeallocationBlocks */ OuterDeallocBBs.empty()
11549 ? ArrayRef<BasicBlock *>()
11550 : OuterDeallocBBs,
11551 /* Suffix */ Suffix.str(), ArgsInZeroAddressSpace);
11552}
11553
11554 void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
11555 uint64_t Size, int32_t Flags,
11556 GlobalValue::LinkageTypes,
11557 StringRef Name) {
11558 if (!Config.isGPU()) {
11559 llvm::offloading::emitOffloadingEntry(
11560 M, object::OffloadKind::OFK_OpenMP, ID,
11561 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
11562 return;
11563 }
11564 // TODO: Add support for global variables on the device after declare target
11565 // support.
11566 Function *Fn = dyn_cast<Function>(Addr);
11567 if (!Fn)
11568 return;
11569
11570 // Add a function attribute for the kernel.
11571 Fn->addFnAttr("kernel");
11572 if (T.isAMDGCN())
11573 Fn->addFnAttr("uniform-work-group-size");
11574 Fn->addFnAttr(Attribute::MustProgress);
11575}
11576
11577 // We only generate metadata for functions that contain target regions.
11578 void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
11579 EmitMetadataErrorReportFunctionTy &ErrorFn) {
11580
11581 // If there are no entries, we don't need to do anything.
11582 if (OffloadInfoManager.empty())
11583 return;
11584
11585 LLVMContext &C = M.getContext();
11586 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
11587 TargetRegionEntryInfo>,
11588 16>
11589 OrderedEntries(OffloadInfoManager.size());
11590
11591 // Auxiliary methods to create metadata values and strings.
11592 auto &&GetMDInt = [this](unsigned V) {
11593 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
11594 };
11595
11596 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
11597
11598 // Create the offloading info metadata node.
11599 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
11600 auto &&TargetRegionMetadataEmitter =
11601 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
11602 const TargetRegionEntryInfo &EntryInfo,
11603 const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
11604 // Generate metadata for target regions. Each entry of this metadata
11605 // contains:
11606 // - Entry 0 -> Kind of this type of metadata (0).
11607 // - Entry 1 -> Device ID of the file where the entry was identified.
11608 // - Entry 2 -> File ID of the file where the entry was identified.
11609 // - Entry 3 -> Mangled name of the function where the entry was
11610 // identified.
11611 // - Entry 4 -> Line in the file where the entry was identified.
11612 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
11613 // - Entry 6 -> Order the entry was created.
11614 // The first element of the metadata node is the kind.
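// An illustrative entry (all values hypothetical):
// ```
// !{i32 0, i32 42, i32 4660, !"foo", i32 7, i32 0, i32 3}
// ```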
11615 Metadata *Ops[] = {
11616 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
11617 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
11618 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
11619 GetMDInt(E.getOrder())};
11620
11621 // Save this entry in the right position of the ordered entries array.
11622 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
11623
11624 // Add metadata to the named metadata node.
11625 MD->addOperand(MDNode::get(C, Ops));
11626 };
11627
11628 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
11629
11630 // Create a function that emits metadata for each device global variable entry.
11631 auto &&DeviceGlobalVarMetadataEmitter =
11632 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
11633 StringRef MangledName,
11634 const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
11635 // Generate metadata for global variables. Each entry of this metadata
11636 // contains:
11637 // - Entry 0 -> Kind of this type of metadata (1).
11638 // - Entry 1 -> Mangled name of the variable.
11639 // - Entry 2 -> Declare target kind.
11640 // - Entry 3 -> Order the entry was created.
11641 // The first element of the metadata node is the kind.
11642 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
11643 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
11644
11645 // Save this entry in the right position of the ordered entries array.
11646 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
11647 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
11648
11649 // Add metadata to the named metadata node.
11650 MD->addOperand(MDNode::get(C, Ops));
11651 };
11652
11653 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
11654 DeviceGlobalVarMetadataEmitter);
11655
11656 for (const auto &E : OrderedEntries) {
11657 assert(E.first && "All ordered entries must exist!");
11658 if (const auto *CE =
11659 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
11660 E.first)) {
11661 if (!CE->getID() || !CE->getAddress()) {
11662 // Do not blame the entry if the parent function is not emitted.
11663 TargetRegionEntryInfo EntryInfo = E.second;
11664 StringRef FnName = EntryInfo.ParentName;
11665 if (!M.getNamedValue(FnName))
11666 continue;
11667 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
11668 continue;
11669 }
11670 createOffloadEntry(CE->getID(), CE->getAddress(),
11671 /*Size=*/0, CE->getFlags(),
11673 } else if (const auto *CE = dyn_cast<
11674 OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
11675 E.first)) {
11676 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
11677 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
11678 CE->getFlags());
11679 switch (Flags) {
11680 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
11681 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
11682 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
11683 continue;
11684 if (!CE->getAddress()) {
11685 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
11686 continue;
11687 }
11688 // The variable has no definition - no need to add the entry.
11689 if (CE->getVarSize() == 0)
11690 continue;
11691 break;
11692 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
11693 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
11694 (!Config.isTargetDevice() && CE->getAddress())) &&
11695 "Declaret target link address is set.");
11696 if (Config.isTargetDevice())
11697 continue;
11698 if (!CE->getAddress()) {
11699 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
11700 continue;
11701 }
11702 break;
11705 if (!CE->getAddress()) {
11706 ErrorFn(EMIT_MD_GLOBAL_VAR_INDIRECT_ERROR, E.second);
11707 continue;
11708 }
11709 break;
11710 default:
11711 break;
11712 }
11713
11714 // Hidden or internal symbols on the device are not externally visible.
11715 // We should not attempt to register them by creating an offloading
11716 // entry. Indirect variables are handled separately on the device.
11717 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
11718 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
11719 (Flags !=
11720 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect &&
11721 Flags != OffloadEntriesInfoManager::
11722 OMPTargetGlobalVarEntryIndirectVTable))
11723 continue;
11724
11725 // Indirect globals need to use a special name that doesn't match the name
11726 // of the associated host global.
11727 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect ||
11728 Flags ==
11729 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirectVTable)
11730 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11731 Flags, CE->getLinkage(), CE->getVarName());
11732 else
11733 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11734 Flags, CE->getLinkage());
11735
11736 } else {
11737 llvm_unreachable("Unsupported entry kind.");
11738 }
11739 }
11740
11741 // Emit requires directive globals to a special entry so the runtime can
11742 // register them when the device image is loaded.
11743 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
11744 // entries should be redesigned to better suit this use-case.
11745 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
11749 ".requires", /*Size=*/0,
11751 Config.getRequiresFlags());
11752}
11753
11754 void TargetRegionEntryInfo::getTargetRegionEntryFnName(
11755 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
11756 unsigned FileID, unsigned Line, unsigned Count) {
11757 raw_svector_ostream OS(Name);
11758 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
11759 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
11760 if (Count)
11761 OS << "_" << Count;
11762}
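// For example (values hypothetical, assuming the usual "__omp_offloading_"
// KernelNamePrefix): DeviceID 0x2b, FileID 0x1234, ParentName "foo" and
// Line 7 produce "__omp_offloading_2b_1234_foo_l7"; a non-zero Count appends
// a "_<count>" suffix.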
11763
11765 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
11766 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
11767 TargetRegionEntryInfo::getTargetRegionEntryFnName(
11768 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
11769 EntryInfo.Line, NewCount);
11770}
11771
11772 TargetRegionEntryInfo
11773 OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
11774 vfs::FileSystem &VFS,
11775 StringRef ParentName) {
11776 sys::fs::UniqueID ID(0xdeadf17e, 0);
11777 auto FileIDInfo = CallBack();
11778 uint64_t FileID = 0;
11779 if (ErrorOr<vfs::Status> Status = VFS.status(std::get<0>(FileIDInfo))) {
11780 ID = Status->getUniqueID();
11781 FileID = Status->getUniqueID().getFile();
11782 } else {
11783 // If the inode ID could not be determined, create a hash of the
11784 // current file name and use that as an ID.
11785 FileID = hash_value(std::get<0>(FileIDInfo));
11786 }
11787
11788 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
11789 std::get<1>(FileIDInfo));
11790}
11791
11792 unsigned OpenMPIRBuilder::getFlagMemberOffset() {
11793 unsigned Offset = 0;
11794 for (uint64_t Remain =
11795 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11796 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
11797 !(Remain & 1); Remain = Remain >> 1)
11798 Offset++;
11799 return Offset;
11800}
11801
11802 omp::OpenMPOffloadMappingFlags
11803 OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
11804 // Shift left by getFlagMemberOffset() bits.
11805 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
11806 << getFlagMemberOffset());
11807}
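// As a worked example (assuming OMP_MAP_MEMBER_OF occupies the high 16 bits,
// i.e. 0xFFFF000000000000): getFlagMemberOffset() counts 48 trailing zero
// bits, so getMemberOfFlag(0) yields 1ULL << 48, encoding MEMBER_OF == 1 for
// the first component of the parent mapping.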
11808
11809 void OpenMPIRBuilder::setCorrectMemberOfFlag(
11810 omp::OpenMPOffloadMappingFlags &Flags,
11811 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
11812 // If the entry is PTR_AND_OBJ but has not been marked with the special
11813 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
11814 // marked as MEMBER_OF.
11815 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11817 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11820 return;
11821
11822 // Entries with ATTACH are not members-of anything. They are handled
11823 // separately by the runtime after other maps have been handled.
11824 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11825 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_ATTACH))
11826 return;
11827
11828 // Reset the placeholder value to prepare the flag for the assignment of the
11829 // proper MEMBER_OF value.
11830 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
11831 Flags |= MemberOfFlag;
11832}
11833
11834 Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
11835 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
11836 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
11837 bool IsDeclaration, bool IsExternallyVisible,
11838 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
11839 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
11840 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
11841 std::function<Constant *()> GlobalInitializer,
11842 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
11843 // TODO: convert this to utilise the IRBuilder Config rather than
11844 // a passed down argument.
11845 if (OpenMPSIMD)
11846 return nullptr;
11847
11848 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
11849 ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
11850 CaptureClause ==
11851 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
11852 Config.hasRequiresUnifiedSharedMemory())) {
11853 SmallString<64> PtrName;
11854 {
11855 raw_svector_ostream OS(PtrName);
11856 OS << MangledName;
11857 if (!IsExternallyVisible)
11858 OS << format("_%x", EntryInfo.FileID);
11859 OS << "_decl_tgt_ref_ptr";
11860 }
11861
11862 Value *Ptr = M.getNamedValue(PtrName);
11863
11864 if (!Ptr) {
11865 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
11866 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
11867
11868 auto *GV = cast<GlobalVariable>(Ptr);
11869 GV->setLinkage(GlobalValue::WeakAnyLinkage);
11870
11871 if (!Config.isTargetDevice()) {
11872 if (GlobalInitializer)
11873 GV->setInitializer(GlobalInitializer());
11874 else
11875 GV->setInitializer(GlobalValue);
11876 }
11877
11878 registerTargetGlobalVariable(
11879 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
11880 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
11881 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
11882 }
11883
11884 return cast<Constant>(Ptr);
11885 }
11886
11887 return nullptr;
11888}
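// Sketch of the effect (variable name assumed): for a non-externally-visible
// global "var" in file 0xbeef this creates a weak pointer-sized global named
// "var_beef_decl_tgt_ref_ptr"; on the host it is initialized to point at the
// original global, and device code accesses "var" through this indirection.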
11889
11890 void OpenMPIRBuilder::registerTargetGlobalVariable(
11891 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
11892 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
11893 bool IsDeclaration, bool IsExternallyVisible,
11894 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
11895 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
11896 std::vector<Triple> TargetTriple,
11897 std::function<Constant *()> GlobalInitializer,
11898 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
11899 Constant *Addr) {
11900 if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
11901 (TargetTriple.empty() && !Config.isTargetDevice()))
11902 return;
11903
11904 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
11905 StringRef VarName;
11906 int64_t VarSize;
11907 GlobalValue::LinkageTypes Linkage;
11908
11909 if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
11910 CaptureClause ==
11911 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
11912 !Config.hasRequiresUnifiedSharedMemory()) {
11913 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
11914 VarName = MangledName;
11915 GlobalValue *LlvmVal = M.getNamedValue(VarName);
11916
11917 if (!IsDeclaration)
11918 VarSize = divideCeil(
11919 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
11920 else
11921 VarSize = 0;
11922 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
11923
11924 // This is a workaround carried over from Clang which prevents undesired
11925 // optimisation of internal variables.
11926 if (Config.isTargetDevice() &&
11927 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
11928 // Do not create a "ref-variable" if the original is not also available
11929 // on the host.
11930 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
11931 return;
11932
11933 std::string RefName = createPlatformSpecificName({VarName, "ref"});
11934
11935 if (!M.getNamedValue(RefName)) {
11936 Constant *AddrRef =
11937 getOrCreateInternalVariable(Addr->getType(), RefName);
11938 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
11939 GvAddrRef->setConstant(true);
11940 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
11941 GvAddrRef->setInitializer(Addr);
11942 GeneratedRefs.push_back(GvAddrRef);
11943 }
11944 }
11945 } else {
11946 if (Config.hasRequiresUnifiedSharedMemory())
11947 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
11948 else
11949 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
11950
11951 if (Config.isTargetDevice()) {
11952 VarName = (Addr) ? Addr->getName() : "";
11953 Addr = nullptr;
11954 } else {
11955 Addr = getAddrOfDeclareTargetVar(
11956 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
11957 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
11958 LlvmPtrTy, GlobalInitializer, VariableLinkage);
11959 VarName = (Addr) ? Addr->getName() : "";
11960 }
11961 VarSize = M.getDataLayout().getPointerSize();
11962 Linkage = GlobalValue::WeakAnyLinkage;
11963 }
11964
11965 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
11966 Flags, Linkage);
11967}
11968
11969/// Loads all the offload entries information from the host IR
11970/// metadata.
11971 void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
11972 // If we are in target mode, load the metadata from the host IR. This code has
11973 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
11974
11975 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
11976 if (!MD)
11977 return;
11978
11979 for (MDNode *MN : MD->operands()) {
11980 auto &&GetMDInt = [MN](unsigned Idx) {
11981 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
11982 return cast<ConstantInt>(V->getValue())->getZExtValue();
11983 };
11984
11985 auto &&GetMDString = [MN](unsigned Idx) {
11986 auto *V = cast<MDString>(MN->getOperand(Idx));
11987 return V->getString();
11988 };
11989
11990 switch (GetMDInt(0)) {
11991 default:
11992 llvm_unreachable("Unexpected metadata!");
11993 break;
11994 case OffloadEntriesInfoManager::OffloadEntryInfo::
11995 OffloadingEntryInfoTargetRegion: {
11996 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
11997 /*DeviceID=*/GetMDInt(1),
11998 /*FileID=*/GetMDInt(2),
11999 /*Line=*/GetMDInt(4),
12000 /*Count=*/GetMDInt(5));
12001 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
12002 /*Order=*/GetMDInt(6));
12003 break;
12004 }
12005 case OffloadEntriesInfoManager::OffloadEntryInfo::
12006 OffloadingEntryInfoDeviceGlobalVar:
12007 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
12008 /*MangledName=*/GetMDString(1),
12009 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
12010 /*Flags=*/GetMDInt(2)),
12011 /*Order=*/GetMDInt(3));
12012 break;
12013 }
12014 }
12015}
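// Illustrative shape of the named-metadata operands decoded above:
//   target region (kind 0):
//     !{i32 0, i32 DeviceID, i32 FileID, !"ParentName", i32 Line, i32 Count,
//       i32 Order}
//   device global variable (kind 1):
//     !{i32 1, !"MangledName", i32 Flags, i32 Order}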
12016
12017 void OpenMPIRBuilder::loadOffloadInfoMetadata(vfs::FileSystem &VFS,
12018 StringRef HostFilePath) {
12019 if (HostFilePath.empty())
12020 return;
12021
12022 auto Buf = VFS.getBufferForFile(HostFilePath);
12023 if (std::error_code Err = Buf.getError()) {
12024 report_fatal_error(("error opening host file from host file path inside of "
12025 "OpenMPIRBuilder: " +
12026 Err.message())
12027 .c_str());
12028 }
12029
12030 LLVMContext Ctx;
12031 auto M = expectedToErrorOrAndEmitErrors(
12032 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
12033 if (std::error_code Err = M.getError()) {
12035 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
12036 .c_str());
12037 }
12038
12039 loadOffloadInfoMetadata(*M.get());
12040}
12041
12044 llvm::StringRef Name) {
12045 Builder.restoreIP(Loc.IP);
12046
12047 BasicBlock *CurBB = Builder.GetInsertBlock();
12048 assert(CurBB &&
12049 "expected a valid insertion block for creating an iterator loop");
12050 Function *F = CurBB->getParent();
12051
12052 InsertPointTy SplitIP = Builder.saveIP();
12053 if (SplitIP.getPoint() == CurBB->end())
12054 if (Instruction *Terminator = CurBB->getTerminatorOrNull())
12055 SplitIP = InsertPointTy(CurBB, Terminator->getIterator());
12056
12057 BasicBlock *ContBB =
12058 splitBB(SplitIP, /*CreateBranch=*/false,
12059 Builder.getCurrentDebugLocation(), "omp.it.cont");
12060
12061 CanonicalLoopInfo *CLI =
12062 createLoopSkeleton(Builder.getCurrentDebugLocation(), TripCount, F,
12063 /*PreInsertBefore=*/ContBB,
12064 /*PostInsertBefore=*/ContBB, Name);
12065
12066 // Enter loop from original block.
12067 redirectTo(CurBB, CLI->getPreheader(), Builder.getCurrentDebugLocation());
12068
12069 // Remove the unconditional branch inserted by createLoopSkeleton in the body.
12070 if (Instruction *T = CLI->getBody()->getTerminatorOrNull())
12071 T->eraseFromParent();
12072
12073 InsertPointTy BodyIP = CLI->getBodyIP();
12074 if (llvm::Error Err = BodyGen(BodyIP, CLI->getIndVar()))
12075 return Err;
12076
12077 // Body must either fall through to the latch or branch directly to it.
12078 if (Instruction *BodyTerminator = CLI->getBody()->getTerminatorOrNull()) {
12079 auto *BodyBr = dyn_cast<UncondBrInst>(BodyTerminator);
12080 if (!BodyBr || BodyBr->getSuccessor() != CLI->getLatch()) {
12082 "iterator bodygen must terminate the canonical body with an "
12083 "unconditional branch to the loop latch",
12085 }
12086 } else {
12087 // Ensure we end the loop body by jumping to the latch.
12088 Builder.SetInsertPoint(CLI->getBody());
12089 Builder.CreateBr(CLI->getLatch());
12090 }
12091
12092 // Link After -> ContBB
12093 Builder.SetInsertPoint(CLI->getAfter(), CLI->getAfter()->begin());
12094 if (!CLI->getAfter()->hasTerminator())
12095 Builder.CreateBr(ContBB);
12096
12097 return InsertPointTy{ContBB, ContBB->begin()};
12098}
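// Resulting control flow, matching the block roles CanonicalLoopInfo asserts
// in assertOK() further below: CurBB -> Preheader -> Header -> Cond;
// Cond -> Body or Exit; Body -> Latch -> Header; Exit -> After -> ContBB.
// BodyGen populates Body, and the returned insert point starts ContBB.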
12099
12100/// Mangle the parameter part of the vector function name according to
12101/// their OpenMP classification. The mangling function is defined in
12102/// section 4.5 of the AAVFABI(2021Q1).
12103 static std::string mangleVectorParameters(
12104 ArrayRef<llvm::OpenMPIRBuilder::DeclareSimdAttrTy> ParamAttrs) {
12105 SmallString<256> Buffer;
12106 llvm::raw_svector_ostream Out(Buffer);
12107 for (const auto &ParamAttr : ParamAttrs) {
12108 switch (ParamAttr.Kind) {
12110 Out << 'l';
12111 break;
12113 Out << 'R';
12114 break;
12116 Out << 'U';
12117 break;
12119 Out << 'L';
12120 break;
12122 Out << 'u';
12123 break;
12125 Out << 'v';
12126 break;
12127 }
12128 if (ParamAttr.HasVarStride)
12129 Out << "s" << ParamAttr.StrideOrArg;
12130 else if (ParamAttr.Kind ==
12132 ParamAttr.Kind ==
12134 ParamAttr.Kind ==
12136 ParamAttr.Kind ==
12138 // Don't print the step value if it is not present or if it is
12139 // equal to 1.
12140 if (ParamAttr.StrideOrArg < 0)
12141 Out << 'n' << -ParamAttr.StrideOrArg;
12142 else if (ParamAttr.StrideOrArg != 1)
12143 Out << ParamAttr.StrideOrArg;
12144 }
12145
12146 if (!!ParamAttr.Alignment)
12147 Out << 'a' << ParamAttr.Alignment;
12148 }
12149
12150 return std::string(Out.str());
12151}
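// Example derived from the switch above (parameter kinds assumed): a uniform
// parameter followed by a linear parameter with constant step 4 and 8-byte
// alignment mangles to "ul4a8" ('u' uniform; 'l' linear, non-unit step "4";
// "a8" alignment).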
12152
12153 static void emitX86DeclareSimdFunction(
12154 llvm::Function *Fn, unsigned NumElts, const llvm::APSInt &VLENVal,
12156 struct ISADataTy {
12157 char ISA;
12158 unsigned VecRegSize;
12159 };
12160 ISADataTy ISAData[] = {
12161 {'b', 128}, // SSE
12162 {'c', 256}, // AVX
12163 {'d', 256}, // AVX2
12164 {'e', 512}, // AVX512
12165 };
12166 SmallVector<char, 2> Masked;
12167 switch (Branch) {
12169 Masked.push_back('N');
12170 Masked.push_back('M');
12171 break;
12173 Masked.push_back('N');
12174 break;
12176 Masked.push_back('M');
12177 break;
12178 }
12179 for (char Mask : Masked) {
12180 for (const ISADataTy &Data : ISAData) {
12181 SmallString<256> Buffer;
12182 llvm::raw_svector_ostream Out(Buffer);
12183 Out << "_ZGV" << Data.ISA << Mask;
12184 if (!VLENVal) {
12185 assert(NumElts && "Non-zero simdlen/cdtsize expected");
12186 Out << llvm::APSInt::getUnsigned(Data.VecRegSize / NumElts);
12187 } else {
12188 Out << VLENVal;
12189 }
12190 Out << mangleVectorParameters(ParamAttrs);
12191 Out << '_' << Fn->getName();
12192 Fn->addFnAttr(Out.str());
12193 }
12194 }
12195}
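// Example attribute produced by the loops above (hypothetical function "foo",
// simdlen 4, parameters mangled as "vl"): the SSE entry (ISA 'b') with an
// unmasked variant is "_ZGVbN4vl_foo"; matching AVX ('c'), AVX2 ('d') and
// AVX512 ('e') names are emitted as well.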
12196
12197// Function used to add the attribute. The parameter `VLEN` is templated to
12198// allow the use of `x` when targeting scalable functions for SVE.
12199template <typename T>
12200static void addAArch64VectorName(T VLEN, StringRef LMask, StringRef Prefix,
12201 char ISA, StringRef ParSeq,
12202 StringRef MangledName, bool OutputBecomesInput,
12203 llvm::Function *Fn) {
12204 SmallString<256> Buffer;
12205 llvm::raw_svector_ostream Out(Buffer);
12206 Out << Prefix << ISA << LMask << VLEN;
12207 if (OutputBecomesInput)
12208 Out << 'v';
12209 Out << ParSeq << '_' << MangledName;
12210 Fn->addFnAttr(Out.str());
12211}
12212
12213// Helper function to generate the Advanced SIMD names depending on the value
12214// of the NDS when simdlen is not present.
12215static void addAArch64AdvSIMDNDSNames(unsigned NDS, StringRef Mask,
12216 StringRef Prefix, char ISA,
12217 StringRef ParSeq, StringRef MangledName,
12218 bool OutputBecomesInput,
12219 llvm::Function *Fn) {
12220 switch (NDS) {
12221 case 8:
12222 addAArch64VectorName(8, Mask, Prefix, ISA, ParSeq, MangledName,
12223 OutputBecomesInput, Fn);
12224 addAArch64VectorName(16, Mask, Prefix, ISA, ParSeq, MangledName,
12225 OutputBecomesInput, Fn);
12226 break;
12227 case 16:
12228 addAArch64VectorName(4, Mask, Prefix, ISA, ParSeq, MangledName,
12229 OutputBecomesInput, Fn);
12230 addAArch64VectorName(8, Mask, Prefix, ISA, ParSeq, MangledName,
12231 OutputBecomesInput, Fn);
12232 break;
12233 case 32:
12234 addAArch64VectorName(2, Mask, Prefix, ISA, ParSeq, MangledName,
12235 OutputBecomesInput, Fn);
12236 addAArch64VectorName(4, Mask, Prefix, ISA, ParSeq, MangledName,
12237 OutputBecomesInput, Fn);
12238 break;
12239 case 64:
12240 case 128:
12241 addAArch64VectorName(2, Mask, Prefix, ISA, ParSeq, MangledName,
12242 OutputBecomesInput, Fn);
12243 break;
12244 default:
12245 llvm_unreachable("Scalar type is too wide.");
12246 }
12247}
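// The two names per case correspond to 64-bit and 128-bit Advanced SIMD
// vectors: e.g. for a narrowest data size of 32 bits, VLEN 2 (64-bit) and
// VLEN 4 (128-bit) variants are added; 64- and 128-bit data sizes only get
// the VLEN-2 variant.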
12248
12249/// Emit vector function attributes for AArch64, as defined in the AAVFABI.
12250 static void emitAArch64DeclareSimdFunction(
12251 llvm::Function *Fn, unsigned UserVLEN,
12253 char ISA, unsigned NarrowestDataSize, bool OutputBecomesInput) {
12254 assert((ISA == 'n' || ISA == 's') && "Expected ISA either 's' or 'n'.");
12255
12256 // Sort out parameter sequence.
12257 const std::string ParSeq = mangleVectorParameters(ParamAttrs);
12258 StringRef Prefix = "_ZGV";
12259 StringRef MangledName = Fn->getName();
12260
12261 // Generate simdlen from user input (if any).
12262 if (UserVLEN) {
12263 if (ISA == 's') {
12264 // SVE generates only a masked function.
12265 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
12266 OutputBecomesInput, Fn);
12267 return;
12268 }
12269
12270 switch (Branch) {
12272 addAArch64VectorName(UserVLEN, "N", Prefix, ISA, ParSeq, MangledName,
12273 OutputBecomesInput, Fn);
12274 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
12275 OutputBecomesInput, Fn);
12276 break;
12278 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
12279 OutputBecomesInput, Fn);
12280 break;
12282 addAArch64VectorName(UserVLEN, "N", Prefix, ISA, ParSeq, MangledName,
12283 OutputBecomesInput, Fn);
12284 break;
12285 }
12286 return;
12287 }
12288
12289 if (ISA == 's') {
12290 // SVE, section 3.4.1, item 1.
12291 addAArch64VectorName("x", "M", Prefix, ISA, ParSeq, MangledName,
12292 OutputBecomesInput, Fn);
12293 return;
12294 }
12295
12296 switch (Branch) {
12298 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "N", Prefix, ISA, ParSeq,
12299 MangledName, OutputBecomesInput, Fn);
12300 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "M", Prefix, ISA, ParSeq,
12301 MangledName, OutputBecomesInput, Fn);
12302 break;
12304 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "M", Prefix, ISA, ParSeq,
12305 MangledName, OutputBecomesInput, Fn);
12306 break;
12308 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "N", Prefix, ISA, ParSeq,
12309 MangledName, OutputBecomesInput, Fn);
12310 break;
12311 }
12312}
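// Example (hypothetical function "foo" with one vector parameter, "v"):
// without a user simdlen, the SVE path emits the scalable masked variant
// "_ZGVsMxv_foo" ('s' SVE, 'M' masked, 'x' scalable length), while the
// Advanced SIMD path ('n') emits fixed-length names via the NDS helper above.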
12313
12314//===----------------------------------------------------------------------===//
12315// OffloadEntriesInfoManager
12316//===----------------------------------------------------------------------===//
12317
12318 bool OffloadEntriesInfoManager::empty() const {
12319 return OffloadEntriesTargetRegion.empty() &&
12320 OffloadEntriesDeviceGlobalVar.empty();
12321}
12322
12323unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
12324 const TargetRegionEntryInfo &EntryInfo) const {
12325 auto It = OffloadEntriesTargetRegionCount.find(
12326 getTargetRegionEntryCountKey(EntryInfo));
12327 if (It == OffloadEntriesTargetRegionCount.end())
12328 return 0;
12329 return It->second;
12330}
12331
12332void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
12333 const TargetRegionEntryInfo &EntryInfo) {
12334 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
12335 EntryInfo.Count + 1;
12336}
12337
12338/// Initialize target region entry.
12339 void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
12340 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
12341 OffloadEntriesTargetRegion[EntryInfo] =
12342 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
12343 OMPTargetRegionEntryTargetRegion);
12344 ++OffloadingEntriesNum;
12345}
12346
12347 void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
12348 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
12349 OMPTargetRegionEntryKind Flags) {
12350 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
12351
12352 // Update the EntryInfo with the next available count for this location.
12353 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
12354
12355 // If we are emitting code for a target, the entry is already initialized,
12356 // only has to be registered.
12357 if (OMPBuilder->Config.isTargetDevice()) {
12358 // This could happen if the device compilation is invoked standalone.
12359 if (!hasTargetRegionEntryInfo(EntryInfo)) {
12360 return;
12361 }
12362 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
12363 Entry.setAddress(Addr);
12364 Entry.setID(ID);
12365 Entry.setFlags(Flags);
12366 } else {
12367 if (Flags == OMPTargetRegionEntryTargetRegion &&
12368 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
12369 return;
12370 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
12371 "Target region entry already registered!");
12372 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
12373 OffloadEntriesTargetRegion[EntryInfo] = Entry;
12374 ++OffloadingEntriesNum;
12375 }
12376 incrementTargetRegionEntryInfoCount(EntryInfo);
12377}
12378
12379 bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
12380 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
12381
12382 // Update the EntryInfo with the next available count for this location.
12383 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
12384
12385 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
12386 if (It == OffloadEntriesTargetRegion.end()) {
12387 return false;
12388 }
12389 // Fail if this entry is already registered.
12390 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
12391 return false;
12392 return true;
12393}
12394
12395 void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
12396 const OffloadTargetRegionEntryInfoActTy &Action) {
12397 // Scan all target region entries and perform the provided action.
12398 for (const auto &It : OffloadEntriesTargetRegion) {
12399 Action(It.first, It.second);
12400 }
12401}
12402
12403 void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
12404 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
12405 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
12406 ++OffloadingEntriesNum;
12407}
12408
12409 void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
12410 StringRef VarName, Constant *Addr, int64_t VarSize,
12411 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
12412 if (OMPBuilder->Config.isTargetDevice()) {
12413 // This could happen if the device compilation is invoked standalone.
12414 if (!hasDeviceGlobalVarEntryInfo(VarName))
12415 return;
12416 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
12417 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
12418 if (Entry.getVarSize() == 0) {
12419 Entry.setVarSize(VarSize);
12420 Entry.setLinkage(Linkage);
12421 }
12422 return;
12423 }
12424 Entry.setVarSize(VarSize);
12425 Entry.setLinkage(Linkage);
12426 Entry.setAddress(Addr);
12427 } else {
12428 if (hasDeviceGlobalVarEntryInfo(VarName)) {
12429 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
12430 assert(Entry.isValid() && Entry.getFlags() == Flags &&
12431 "Entry not initialized!");
12432 if (Entry.getVarSize() == 0) {
12433 Entry.setVarSize(VarSize);
12434 Entry.setLinkage(Linkage);
12435 }
12436 return;
12437 }
12438 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
12439 Flags ==
12440 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
12441 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
12442 Addr, VarSize, Flags, Linkage,
12443 VarName.str());
12444 else
12445 OffloadEntriesDeviceGlobalVar.try_emplace(
12446 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
12447 ++OffloadingEntriesNum;
12448 }
12449}
12450
12451 void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
12452 const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
12453 // Scan all device global variable entries and perform the provided action.
12454 for (const auto &E : OffloadEntriesDeviceGlobalVar)
12455 Action(E.getKey(), E.getValue());
12456}
12457
12458//===----------------------------------------------------------------------===//
12459// CanonicalLoopInfo
12460//===----------------------------------------------------------------------===//
12461
12462void CanonicalLoopInfo::collectControlBlocks(
12463 SmallVectorImpl<BasicBlock *> &BBs) {
12464 // We only count those BBs as control blocks for which we do not need to
12465 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
12466 // flow. For consistency, this also means we do not add the Body block, which
12467 // is just the entry to the body code.
12468 BBs.reserve(BBs.size() + 6);
12469 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
12470}
12471
12472 BasicBlock *CanonicalLoopInfo::getPreheader() const {
12473 assert(isValid() && "Requires a valid canonical loop");
12474 for (BasicBlock *Pred : predecessors(Header)) {
12475 if (Pred != Latch)
12476 return Pred;
12477 }
12478 llvm_unreachable("Missing preheader");
12479}
12480
12481void CanonicalLoopInfo::setTripCount(Value *TripCount) {
12482 assert(isValid() && "Requires a valid canonical loop");
12483
12484 Instruction *CmpI = &getCond()->front();
12485 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
12486 CmpI->setOperand(1, TripCount);
12487
12488#ifndef NDEBUG
12489 assertOK();
12490#endif
12491}
12492
12493void CanonicalLoopInfo::mapIndVar(
12494 llvm::function_ref<Value *(Instruction *)> Updater) {
12495 assert(isValid() && "Requires a valid canonical loop");
12496
12497 Instruction *OldIV = getIndVar();
12498
12499 // Record all uses excluding those introduced by the updater. Uses by the
12500 // CanonicalLoopInfo itself to keep track of the number of iterations are
12501 // excluded.
12502 SmallVector<Use *> ReplacableUses;
12503 for (Use &U : OldIV->uses()) {
12504 auto *User = dyn_cast<Instruction>(U.getUser());
12505 if (!User)
12506 continue;
12507 if (User->getParent() == getCond())
12508 continue;
12509 if (User->getParent() == getLatch())
12510 continue;
12511 ReplacableUses.push_back(&U);
12512 }
12513
12514 // Run the updater that may introduce new uses
12515 Value *NewIV = Updater(OldIV);
12516
12517 // Replace the old uses with the value returned by the updater.
12518 for (Use *U : ReplacableUses)
12519 U->set(NewIV);
12520
12521#ifndef NDEBUG
12522 assertOK();
12523#endif
12524}
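// Minimal usage sketch (caller code assumed; Start/Step are hypothetical
// values defined before the loop): lowering the canonical 0..N IV onto a
// strided user IV, similar to what workshare-loop lowering does:
//   CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
//     Builder.SetInsertPoint(CLI->getBody(),
//                            CLI->getBody()->getFirstInsertionPt());
//     return Builder.CreateAdd(Builder.CreateMul(OldIV, Step), Start);
//   });
// Uses of the old IV in the condition and latch increment are kept intact.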
12525
12527#ifndef NDEBUG
12528 // No constraints if this object currently does not describe a loop.
12529 if (!isValid())
12530 return;
12531
12532 BasicBlock *Preheader = getPreheader();
12533 BasicBlock *Body = getBody();
12534 BasicBlock *After = getAfter();
12535
12536 // Verify standard control-flow we use for OpenMP loops.
12537 assert(Preheader);
12538 assert(isa<UncondBrInst>(Preheader->getTerminator()) &&
12539 "Preheader must terminate with unconditional branch");
12540 assert(Preheader->getSingleSuccessor() == Header &&
12541 "Preheader must jump to header");
12542
12543 assert(Header);
12544 assert(isa<UncondBrInst>(Header->getTerminator()) &&
12545 "Header must terminate with unconditional branch");
12546 assert(Header->getSingleSuccessor() == Cond &&
12547 "Header must jump to exiting block");
12548
12549 assert(Cond);
12550 assert(Cond->getSinglePredecessor() == Header &&
12551 "Exiting block only reachable from header");
12552
12553 assert(isa<CondBrInst>(Cond->getTerminator()) &&
12554 "Exiting block must terminate with conditional branch");
12555 assert(cast<CondBrInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
12556 "Exiting block's first successor jump to the body");
12557 assert(cast<CondBrInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
12558 "Exiting block's second successor must exit the loop");
12559
12560 assert(Body);
12561 assert(Body->getSinglePredecessor() == Cond &&
12562 "Body only reachable from exiting block");
12563 assert(!isa<PHINode>(Body->front()));
12564
12565 assert(Latch);
12566 assert(isa<UncondBrInst>(Latch->getTerminator()) &&
12567 "Latch must terminate with unconditional branch");
12568 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
12569 // TODO: To support simple redirecting of body code that ends in multiple
12570 // blocks, introduce another auxiliary basic block like preheader and after.
12571 assert(Latch->getSinglePredecessor() != nullptr);
12572 assert(!isa<PHINode>(Latch->front()));
12573
12574 assert(Exit);
12575 assert(isa<UncondBrInst>(Exit->getTerminator()) &&
12576 "Exit block must terminate with unconditional branch");
12577 assert(Exit->getSingleSuccessor() == After &&
12578 "Exit block must jump to after block");
12579
12580 assert(After);
12581 assert(After->getSinglePredecessor() == Exit &&
12582 "After block only reachable from exit block");
12583 assert(After->empty() || !isa<PHINode>(After->front()));
12584
12585 Instruction *IndVar = getIndVar();
12586 assert(IndVar && "Canonical induction variable not found?");
12587 assert(isa<IntegerType>(IndVar->getType()) &&
12588 "Induction variable must be an integer");
12589 assert(cast<PHINode>(IndVar)->getParent() == Header &&
12590 "Induction variable must be a PHI in the loop header");
12591 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
12592 assert(
12593 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
12594 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
12595
12596 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
12597 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
12598 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
12599 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
12600 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
12601 ->isOne());
12602
12603 Value *TripCount = getTripCount();
12604 assert(TripCount && "Loop trip count not found?");
12605 assert(IndVar->getType() == TripCount->getType() &&
12606 "Trip count and induction variable must have the same type");
12607
12608 auto *CmpI = cast<CmpInst>(&Cond->front());
12609 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
12610 "Exit condition must be a signed less-than comparison");
12611 assert(CmpI->getOperand(0) == IndVar &&
12612 "Exit condition must compare the induction variable");
12613 assert(CmpI->getOperand(1) == TripCount &&
12614 "Exit condition must compare with the trip count");
12615#endif
12616}
12617
12618 void CanonicalLoopInfo::invalidate() {
12619 Header = nullptr;
12620 Cond = nullptr;
12621 Latch = nullptr;
12622 Exit = nullptr;
12623}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
@ ParamAttr
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static Value * removeASCastIfPresent(Value *V)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn, bool NoLoop)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true, bool Is64Bit=false)
static Function * createTargetParallelWrapper(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn)
Create wrapper function used to gather the outlined function's argument structure from a shared buffe...
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI, LLVMContext &Ctx, Loop *Loop, LoopInfo &LoopInfo, SmallVector< Metadata * > &LoopMDList)
static void addAArch64VectorName(T VLEN, StringRef LMask, StringRef Prefix, char ISA, StringRef ParSeq, StringRef MangledName, bool OutputBecomesInput, llvm::Function *Fn)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static std::string mangleVectorParameters(ArrayRef< llvm::OpenMPIRBuilder::DeclareSimdAttrTy > ParamAttrs)
Mangle the parameter part of the vector function name according to their OpenMP classification.
static bool isGenericKernel(Function &Fn)
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType, bool NoLoop)
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasDistScheduleChunks)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause, bool HasDistScheduleChunks)
Determine the schedule type using schedule and ordering clause arguments.
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static std::optional< omp::OMPTgtExecModeFlags > getTargetKernelExecMode(Function &Kernel)
Given a function, if it represents the entry point of a target kernel, this returns the execution mod...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, ArrayRef< BasicBlock * > DeallocBlocks, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const OpenMPIRBuilder::DependenciesInfo &Dependencies, bool HasNoWait, Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback)
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static void hoistNonEntryAllocasToEntryBlock(llvm::BasicBlock &Block)
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static bool hasGridValue(const Triple &T)
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static void addAArch64AdvSIMDNDSNames(unsigned NDS, StringRef Mask, StringRef Prefix, char ISA, StringRef ParSeq, StringRef MangledName, bool OutputBecomesInput, llvm::Function *Fn)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
std::unordered_set< BasicBlock * > BlockSet
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getUnsigned(uint64_t X)
Definition APSInt.h:349
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
LLVM_ABI bool isArrayAllocation() const
Return true if there is an allocation size parameter to the allocation instruction that is not 1.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
iterator end() const
Definition ArrayRef.h:131
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
Class to represent array types.
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
LLVM_ABI std::pair< LoadInst *, AllocaInst * > EmitAtomicLoadLibcall(AtomicOrdering AO)
Definition Atomic.cpp:109
LLVM_ABI void EmitAtomicStoreLibcall(AtomicOrdering AO, Value *Source)
Definition Atomic.cpp:150
an instruction that atomically reads a memory location, combines it with another value,...
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMaximumNum
*p = maximumnum(old, v) maximumnum matches the behavior of llvm.maximumnum.
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ FMinimumNum
*p = minimumnum(old, v) minimumnum matches the behavior of llvm.minimumnum.
@ Nand
*p = ~(old & v)
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:407
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
iterator end()
Definition BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:461
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:477
bool hasTerminator() const LLVM_READONLY
Returns whether the block has a terminator.
Definition BasicBlock.h:232
bool empty() const
Definition BasicBlock.h:483
const Instruction & back() const
Definition BasicBlock.h:486
LLVM_ABI BasicBlock * splitBasicBlockBefore(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction and insert the new basic blo...
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI void insertDbgRecordBefore(DbgRecord *DR, InstListType::iterator Here)
Insert a DbgRecord into a block at the position given by Here.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:484
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
const Instruction * getTerminatorOrNull() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:248
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:479
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:388
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:659
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
void setDoesNotThrow()
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Class to represented the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
LLVM_ABI void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
void setLastIter(Value *IterVar)
Sets the last iteration variable for this loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
LLVM_ABI void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
LLVM_ABI BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:537
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:859
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true, bool ByteString=false)
This method constructs a CDS and initializes it with a text string.
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:579
Record of a variable value-assignment, aka a non instruction representation of the dbg....
A debug info location.
Definition DebugLoc.h:123
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
Represents either an error or a value T.
Definition ErrorOr.h:56
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
reference get()
Returns a reference to the stored T value.
Definition Error.h:582
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:638
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:168
const BasicBlock & getEntryBlock() const
Definition Function.h:809
Argument * arg_iterator
Definition Function.h:73
bool empty() const
Definition Function.h:859
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:445
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:763
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:354
const Function & getFunction() const
Definition Function.h:166
iterator begin()
Definition Function.h:853
arg_iterator arg_begin()
Definition Function.h:868
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:357
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:666
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:755
size_t arg_size() const
Definition Function.h:901
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:216
iterator end()
Definition Function.h:855
void setCallingConv(CallingConv::ID CC)
Definition Function.h:276
Argument * getArg(unsigned i) const
Definition Function.h:886
bool hasMetadata() const
Return true if this GlobalObject has any metadata attached to it.
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
InsertPoint - A saved insertion point.
Definition IRBuilder.h:298
BasicBlock * getBlock() const
Definition IRBuilder.h:313
bool isSet() const
Returns true if this insert point is set.
Definition IRBuilder.h:311
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:314
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
InsertPoint saveIP() const
Returns the current insert point.
Definition IRBuilder.h:318
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition IRBuilder.h:330
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2847
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
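A short sketch of how these unlink/re-insert primitives compose; the helper is hypothetical, and the caller must ensure the operands of I still dominate the destination:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Hoist I to the front of Dest, or delete it outright if nothing uses it.
static void hoistOrErase(Instruction *I, BasicBlock *Dest) {
  if (I->use_empty()) {
    I->eraseFromParent();                     // unlink and delete
    return;
  }
  I->moveBefore(Dest->getFirstInsertionPt()); // unlink and re-insert
}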
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:589
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:996
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1080
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1580
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1442
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:614
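These factories are the building blocks for the loop metadata emitted by this file. The classic self-referential loop-ID pattern, as a hedged sketch:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"

using namespace llvm;

// Build a distinct loop ID whose first operand points back at itself,
// followed by property nodes such as !{!"llvm.loop.unroll.enable"}.
static MDNode *makeLoopID(LLVMContext &Ctx, ArrayRef<Metadata *> Props) {
  SmallVector<Metadata *, 4> Ops;
  Ops.push_back(nullptr);                // reserve slot 0 for the self-ref
  Ops.append(Props.begin(), Props.end());
  MDNode *LoopID = MDNode::getDistinct(Ctx, Ops);
  LoopID->replaceOperandWith(0, LoopID); // close the self-reference
  return LoopID;
}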
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type size() const
Definition MapVector.h:56
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
A tuple of MDNodes.
Definition Metadata.h:1760
iterator_range< op_iterator > operands()
Definition Metadata.h:1856
LLVM_ABI void addOperand(MDNode *M)
Class that manages information about offload code regions and data.
function_ref< void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)> OffloadDeviceGlobalVarEntryInfoActTy
Applies action Action on all registered entries.
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions. NOTE: Currently not used as a part o...
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
LLVM_ABI void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
LLVM_ABI void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
LLVM_ABI void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
LLVM_ABI bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
LLVM_ABI void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
LLVM_ABI void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
LLVM_ABI void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
@ OMPTargetGlobalVarEntryLink
Mark the entry as a declare target link.
@ OMPTargetGlobalVarEntryTo
Mark the entry as a declare target to.
@ OMPTargetGlobalVarEntryIndirectVTable
Mark the entry as a declare target indirect vtable.
function_ref< void(const TargetRegionEntryInfo &EntryInfo, const OffloadEntryInfoTargetRegion &)> OffloadTargetRegionEntryInfoActTy
Applies action Action on all registered entries.
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
LLVM_ABI bool empty() const
Return true if there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
std::optional< bool > IsGPU
Flag for specifying if the compilation is done for an accelerator.
LLVM_ABI int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
std::optional< bool > OpenMPOffloadMandatory
Flag for specifying if offloading is mandatory.
LLVM_ABI void setHasRequiresReverseOffload(bool Value)
LLVM_ABI bool hasRequiresUnifiedSharedMemory() const
LLVM_ABI void setHasRequiresUnifiedSharedMemory(bool Value)
unsigned getDefaultTargetAS() const
LLVM_ABI bool hasRequiresDynamicAllocators() const
LLVM_ABI void setHasRequiresUnifiedAddress(bool Value)
LLVM_ABI void setHasRequiresDynamicAllocators(bool Value)
LLVM_ABI bool hasRequiresReverseOffload() const
LLVM_ABI bool hasRequiresUnifiedAddress() const
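A minimal host-side configuration sketch using these flags; construction of the OpenMPIRBuilder itself is elided and the chosen values are illustrative:

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"

using namespace llvm;

static void configureForHost(OpenMPIRBuilder &OMPBuilder) {
  OpenMPIRBuilderConfig Config;
  Config.IsTargetDevice = false; // generate code for the host side
  Config.IsGPU = false;          // not an accelerator build
  Config.setHasRequiresUnifiedSharedMemory(false);
  OMPBuilder.setConfig(Config);
  OMPBuilder.initialize();       // set up struct types and other helpers
}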
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
LLVM_ABI InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for 'omp ordered [threads | simd]'.
LLVM_ABI void emitAArch64DeclareSimdFunction(llvm::Function *Fn, unsigned VLENVal, llvm::ArrayRef< DeclareSimdAttrTy > ParamAttrs, DeclareSimdBranch Branch, char ISA, unsigned NarrowestDataSize, bool OutputBecomesInput)
Emit AArch64 vector-function ABI attributes for a declare simd function.
LLVM_ABI Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
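These two entry points combine into the boilerplate that precedes nearly every runtime call in this file. A sketch that emits an explicit __kmpc_barrier, assuming OMPBuilder, Builder, and M are in scope as elsewhere in this file:

uint32_t SrcLocStrSize;
Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
Constant *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
Value *ThreadID = OMPBuilder.getOrCreateThreadID(Ident);
FunctionCallee BarrierFn =
    OMPBuilder.getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_barrier);
Builder.CreateCall(BarrierFn, {Ident, ThreadID}); // __kmpc_barrier(&loc, tid)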
LLVM_ABI InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for 'omp cancel'.
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
LLVM_ABI CallInst * createOMPAllocShared(const LocationDescription &Loc, Value *Size, const Twine &Name=Twine(""))
Create a runtime call for kmpc_alloc_shared.
ReductionGenCBKind
Enum class for the ReductionGen callback type to be used.
LLVM_ABI CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
LLVM_ABI void createTaskyield(const LocationDescription &Loc)
Generator for 'omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
LLVM_ABI void emitBranch(BasicBlock *Target)
LLVM_ABI Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective)
Generate control flow and cleanup for cancellation.
static LLVM_ABI void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
LLVM_ABI Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attributes of the function. Returns the FunctionID.
LLVM_ABI GlobalVariable * emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode)
Emit the kernel execution mode.
LLVM_ABI void initialize()
Initialize the internal state; this will put structure types and potentially other helpers into the ...
LLVM_ABI void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
LLVM_ABI InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic write for: X = Expr (only scalar data types).
LLVM_ABI void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
function_ref< MapInfosTy &(InsertPointTy CodeGenIP)> GenMapInfoCallbackTy
Callback type for creating the map infos for the kernel parameters.
LLVM_ABI Error emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
LLVM_ABI void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
function_ref< Error(InsertPointTy CodeGenIP, Value *IndVar)> LoopBodyGenCallbackTy
Callback type for loop body code generation.
LLVM_ABI InsertPointOrErrorTy emitScanReduction(const LocationDescription &Loc, ArrayRef< llvm::OpenMPIRBuilder::ReductionInfo > ReductionInfos, ScanInfo *ScanRedInfo)
This function performs the scan reduction of the values updated in the input phase.
LLVM_ABI void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
LLVM_ABI InsertPointOrErrorTy createScope(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait)
Generator for 'omp scope'.
static LLVM_ABI std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
LLVM_ABI CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
LLVM_ABI void emitUsed(StringRef Name, ArrayRef< llvm::WeakTrackingVH > List)
Emit the llvm.used metadata.
LLVM_ABI InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for 'omp single'.
LLVM_ABI InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for #omp teams
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be freed.
LLVM_ABI void createTaskwait(const LocationDescription &Loc)
Generator for 'omp taskwait'.
LLVM_ABI llvm::StructType * getKmpTaskAffinityInfoTy()
Return the LLVM struct type matching runtime kmp_task_affinity_info_t.
LLVM_ABI CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
LLVM_ABI std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform-specific separators.
LLVM_ABI FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static LLVM_ABI void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
LLVM_ABI InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, ArrayRef< BasicBlock * > DeallocBlocks, TargetDataInfo &Info, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, CustomMapperCallbackTy CustomMapperCB, const DependenciesInfo &Dependencies={}, bool HasNowait=false, Value *DynCGroupMem=nullptr, omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback=omp::OMPDynGroupprivateFallbackType::Abort)
Generator for 'omp target'.
LLVM_ABI void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
LLVM_ABI omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
LLVM_ABI void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
LLVM_ABI void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the alloca instructions used in calls to mapper functions.
LLVM_ABI Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
LLVM_ABI Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
LLVM_ABI InsertPointOrErrorTy createIteratorLoop(LocationDescription Loc, llvm::Value *TripCount, IteratorBodyGenTy BodyGen, llvm::StringRef Name="iterator")
Create a canonical iterator loop at the current insertion point.
LLVM_ABI Expected< SmallVector< llvm::CanonicalLoopInfo * > > createCanonicalScanLoops(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo)
Generator for the control flow structure of OpenMP canonical loops if the parent directive has an ...
LLVM_ABI FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< BasicBlock * > DeallocBlocks)> TargetBodyGenCallbackTy
LLVM_ABI void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
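For example, partially unrolling by a fixed factor while retrieving the remainder loop for further transformations (OMPBuilder, DL, and CLI are assumed to be in scope):

CanonicalLoopInfo *UnrolledCLI = nullptr;
OMPBuilder.unrollLoopPartial(DL, CLI, /*Factor=*/4, &UnrolledCLI);
// UnrolledCLI now describes the loop that steps over the unrolled body.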
function_ref< Error(Value *DeviceID, Value *RTLoc, IRBuilderBase::InsertPoint TargetTaskAllocaIP)> TargetTaskBodyCallbackTy
Callback type for generating the bodies of device directives that require outer target tasks (e....
Expected< MapInfosTy & > MapInfosOrErrorTy
LLVM_ABI void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
LLVM_ABI void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
LLVM_ABI InsertPointOrErrorTy createDistribute(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< BasicBlock * > DeallocBlocks, BodyGenCallbackTy BodyGenCB)
Generator for #omp distribute
LLVM_ABI InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< BasicBlock * > DeallocBlocks, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, const DependenciesInfo &Dependencies={}, const AffinityData &Affinities={}, bool Mergeable=false, Value *EventHandle=nullptr, Value *Priority=nullptr)
Generator for #omp task
function_ref< Expected< Function * >(unsigned int)> CustomMapperCallbackTy
LLVM_ABI InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs (only scalar data types): cond-expr-stmt: x = x ordop expr ? ...
LLVM_ABI InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for 'omp ordered depend (source | sink)'.
LLVM_ABI InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original, Value &Inner, Value *&ReplVal)> PrivatizeCallbackTy
Callback type for variable privatization (think copy & default constructor).
LLVM_ABI bool isFinalized()
Check whether the finalize function has already run.
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
LLVM_ABI std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
LLVM_ABI CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
LLVM_ABI Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={}, ArrayRef< BasicBlock * > DeallocBlocks={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy. Here is the logic: if (Cond) { Th...
LLVM_ABI void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
LLVM_ABI Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
void addOutlineInfo(std::unique_ptr< OutlineInfo > &&OI)
Add a new region that will be outlined later.
LLVM_ABI InsertPointTy createTargetInit(const LocationDescription &Loc, const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs)
Create a runtime call for kmpc_target_init.
LLVM_ABI InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false)
Generator for 'omp reduction'.
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
LLVM_ABI CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
LLVM_ABI FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_alloc.
LLVM_ABI void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
LLVM_ABI InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp section'.
LLVM_ABI InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< BasicBlock * > DeallocBlocks, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
LLVM_ABI InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< BasicBlock * > DeallocBlocks, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for 'omp parallel'.
function_ref< InsertPointOrErrorTy(InsertPointTy)> EmitFallbackCallbackTy
Callback function type for functions emitting the host fallback code that is executed when the kernel...
static LLVM_ABI TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, vfs::FileSystem &VFS, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number.
LLVM_ABI void emitTaskDependency(IRBuilderBase &Builder, Value *Entry, const DependData &Dep)
Store one kmp_depend_info entry at the given Entry pointer.
LLVM_ABI void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
LLVM_ABI Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
LLVM_ABI InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp master'.
LLVM_ABI InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< BasicBlock * > DeallocBlocks, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for 'omp target data'.
CallInst * createRuntimeFunctionCall(FunctionCallee Callee, ArrayRef< Value * > Args, StringRef Name="")
LLVM_ABI InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
LLVM_ABI InsertPointOrErrorTy createCancellationPoint(const LocationDescription &Loc, omp::Directive CanceledDirective)
Generator for 'omp cancellation point'.
LLVM_ABI CallInst * createOMPAlignedAlloc(const LocationDescription &Loc, Value *Align, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_align_alloc.
LLVM_ABI FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI InsertPointOrErrorTy createScan(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< llvm::Value * > ScanVars, ArrayRef< llvm::Type * > ScanVarsType, bool IsInclusive, ScanInfo *ScanRedInfo)
This directive splits and directs the control flow to input phase blocks or scan phase blocks based on...
LLVM_ABI CallInst * createOMPFreeShared(const LocationDescription &Loc, Value *Addr, Value *Size, const Twine &Name=Twine(""))
Create a runtime call for kmpc_free_shared.
LLVM_ABI CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
LLVM_ABI GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, std::optional< unsigned > AddressSpace={})
Gets (if a variable with the given name already exists) or creates an internal global variable with the spe...
LLVM_ABI GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
std::forward_list< ScanInfo > ScanInfos
Collection of owned ScanInfo objects that eventually need to be freed.
static LLVM_ABI void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI Value * calculateCanonicalLoopTripCount(const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, const Twine &Name="loop")
Calculate the trip count of a canonical loop.
LLVM_ABI InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Generator for 'omp barrier'.
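A hedged usage sketch that emits a barrier at the builder's current position and resumes afterwards (assumes an enclosing function that can propagate an Error):

OpenMPIRBuilder::LocationDescription Loc(Builder);
OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
    OMPBuilder.createBarrier(Loc, omp::Directive::OMPD_barrier);
if (!AfterIP)
  return AfterIP.takeError(); // hand the failure to the caller
Builder.restoreIP(*AfterIP);  // continue emitting after the barrier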
LLVM_ABI void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
LLVM_ABI Error emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
LLVM_ABI Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
LLVM_ABI InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for 'omp critical'.
LLVM_ABI void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static LLVM_ABI unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
LLVM_ABI unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop, bool NoLoop=false, bool HasDistSchedule=false, Value *DistScheduleChunkSize=nullptr)
Modifies the canonical loop to be a workshare loop.
LLVM_ABI InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs (only scalar data types): V = X; X = X BinOp Expr, ...
LLVM_ABI void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
LLVM_ABI void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
SmallVector< std::unique_ptr< OutlineInfo >, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
LLVM_ABI InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X. For complex operations: X = ...
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
LLVM_ABI InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
LLVM_ABI GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
LLVM_ABI CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
LLVM_ABI GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
LLVM_ABI void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
LLVM_ABI InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for 'omp masked'.
LLVM_ABI Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
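Putting the body callback type to work: a sketch that builds a loop over [0, TripCount) whose result can later be handed to applyWorkshareLoop (Loc and TripCount are assumed in scope; error handling abbreviated):

auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy CodeGenIP,
                     Value *IV) -> Error {
  Builder.restoreIP(CodeGenIP);
  // ... emit the loop body here, using IV as the induction variable ...
  return Error::success();
};
Expected<CanonicalLoopInfo *> CLI =
    OMPBuilder.createCanonicalLoop(Loc, BodyGenCB, TripCount);
if (!CLI)
  return CLI.takeError();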
function_ref< Expected< InsertPointTy >( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DestPtr, Value *SrcPtr)> TaskDupCallbackTy
Callback type for task duplication function code generation.
LLVM_ABI Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
llvm::function_ref< llvm::Error( InsertPointTy BodyIP, llvm::Value *LinearIV)> IteratorBodyGenTy
LLVM_ABI InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
LLVM_ABI Expected< Function * > emitUserDefinedMapper(function_ref< MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB)
Emit the user-defined mapper function.
LLVM_ABI FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
LLVM_ABI void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
BodyGenTy
Type of BodyGen to use for region codegen.
LLVM_ABI CanonicalLoopInfo * fuseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops)
Fuse a sequence of loops.
LLVM_ABI void emitX86DeclareSimdFunction(llvm::Function *Fn, unsigned NumElements, const llvm::APSInt &VLENVal, llvm::ArrayRef< DeclareSimdAttrTy > ParamAttrs, DeclareSimdBranch Branch)
Emit x86 vector-function ABI attributes for a declare simd function.
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions whose constant allocas we will attempt to raise on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static LLVM_ABI std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write bounds on teams for Kernel.
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
LLVM_ABI InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
LLVM_ABI InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for 'omp sections'.
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
function_ref< InsertPointOrErrorTy( Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< InsertPointTy > DeallocIPs)> TargetGenArgAccessorsCallbackTy
LLVM_ABI Expected< ScanInfo * > scanInfoInitialize()
Allocates a ScanInfo object and returns a pointer to it.
LLVM_ABI InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const DependenciesInfo &Dependencies, const TargetDataRTArgs &RTArgs, bool HasNoWait)
Generate a target-task for the target construct.
LLVM_ABI InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic read for: V = X (only scalar data types).
function_ref< Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< BasicBlock * > DeallocBlocks)> BodyGenCallbackTy
Callback type for body (=inner region) code generation.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
LLVM_ABI void createFlush(const LocationDescription &Loc)
Generator for 'omp flush'.
LLVM_ABI Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
ScanInfo holds the information to assist in lowering of Scan reduction.
llvm::SmallDenseMap< llvm::Value *, llvm::Value * > * ScanBuffPtrs
Maps the private reduction variable to the pointer of the temporary buffer.
llvm::BasicBlock * OMPScanLoopExit
Exit block of loop body.
llvm::Value * IV
Keeps track of value of iteration variable for input/scan loop to be used for Scan directive lowering...
llvm::BasicBlock * OMPAfterScanBlock
Dominates the body of the loop after the scan directive.
llvm::BasicBlock * OMPScanInit
Block before loop body where scan initializations are done.
llvm::BasicBlock * OMPBeforeScanBlock
Dominates the body of the loop before the scan directive.
llvm::BasicBlock * OMPScanFinish
Block after loop body where scan finalizations are done.
llvm::Value * Span
Stores the span of canonical loop being lowered to be used for temporary buffer allocation or Finaliz...
bool OMPFirstScanLoop
If true, the input phase is being lowered; otherwise, the scan phase is being lowered.
llvm::BasicBlock * OMPScanDispatch
Controls the flow to before or after scan blocks.
A vector that has set insertion semantics.
Definition SetVector.h:57
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:230
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
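The growth-friendly interface above at a glance, in a trivial self-contained example:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"

using namespace llvm;

static SmallVector<int, 8> collectPlusSentinel(ArrayRef<int> In) {
  SmallVector<int, 8> Out;
  Out.reserve(In.size() + 1);       // at most one heap allocation
  Out.append(In.begin(), In.end()); // bulk-copy the input range
  Out.push_back(-1);                // illustrative sentinel value
  return Out;
}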
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
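Emitting an atomic store with these setters, roughly as the atomic helpers in this file do; a minimal sketch:

#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/AtomicOrdering.h"

using namespace llvm;

static void emitAtomicFlagStore(IRBuilderBase &Builder, Value *Ptr) {
  StoreInst *SI = Builder.CreateStore(Builder.getInt32(1), Ptr);
  SI->setAlignment(Align(4));               // explicit 4-byte alignment
  SI->setAtomic(AtomicOrdering::Monotonic); // relaxed atomic semantics
}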
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition StringMap.h:260
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition StringRef.h:222
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:140
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:143
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:471
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:270
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:636
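A tiny example of these allocation-free StringRef operations:

#include "llvm/ADT/StringRef.h"

using namespace llvm;

// Strip a trailing ".so" if present; returns a view, never a copy.
static StringRef stripSharedSuffix(StringRef Name) {
  return Name.ends_with(".so") ? Name.drop_back(3) : Name;
}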
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:483
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:689
Type * getElementType(unsigned N) const
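Creating an identified struct with these factories, similar in spirit to the runtime descriptor types this file builds; the name and layout are illustrative:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"

using namespace llvm;

static StructType *makeDescriptorTy(LLVMContext &Ctx) {
  StructType *Ty = StructType::create(Ctx, "my.descriptor"); // named struct
  Ty->setBody({Type::getInt32Ty(Ctx), PointerType::getUnqual(Ctx)});
  return Ty; // lays out as { i32, ptr }
}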
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
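A small sketch wiring up a two-case switch; Builder, Cond, and the destination blocks are assumed to exist:

SwitchInst *SI = Builder.CreateSwitch(Cond, DefaultBB, /*NumCases=*/2);
SI->addCase(Builder.getInt32(0), CaseZeroBB); // taken when Cond == 0
SI->addCase(Builder.getInt32(1), CaseOneBB);  // taken when Cond == 1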
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1047
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1107
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1121
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
LLVM_ABI unsigned getIntegerBitWidth() const
LLVM_ABI Type * getStructElementType(unsigned N) const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:286
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
Unconditional Branch instruction.
static UncondBrInst * Create(BasicBlock *Target, InsertPosition InsertBefore=nullptr)
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:151
LLVM_ABI bool canUnroll(OptimizationRemarkEmitter *ORE=nullptr, const Loop *L=nullptr) const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:173
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:393
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
User * user_back()
Definition Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:964
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:146
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:184
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition Value.cpp:709
bool use_empty() const
Definition Value.h:346
user_iterator user_end()
Definition Value.h:410
LLVM_ABI bool replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition Value.cpp:557
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
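The use-list APIs above compose naturally. For instance, a hypothetical helper that redirects only the uses sitting inside one function:

#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"

using namespace llvm;

static void replaceUsesInFunction(Value *Old, Value *New, Function *F) {
  Old->replaceUsesWithIf(New, [F](Use &U) {
    auto *I = dyn_cast<Instruction>(U.getUser());
    return I && I->getFunction() == F; // only rewrite uses inside F
  });
}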
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A raw_ostream that writes to an SmallVector or SmallString.
StringRef str() const
Return a StringRef for the vector contents.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
virtual llvm::ErrorOr< Status > status(const Twine &Path)=0
Get the status of the entry at Path, if one exists.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
Flag
These should be considered private to the implementation of the MCInstrDesc class.
constexpr StringLiteral MaxNTID("nvvm.maxntid")
constexpr StringLiteral MaxClusterRank("nvvm.maxclusterrank")
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
LLVM_ABI GlobalVariable * emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr)
Definition Utility.cpp:105
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
OMPDynGroupprivateFallbackType
The fallback types for the dyn_groupprivate clause.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
@ OMP_TGT_EXEC_MODE_SPMD_NO_LOOP
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
LLVM_ABI BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition DWP.cpp:557
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:830
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:840
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:378
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
auto successors(const MachineBasicBlock *BB)
LLVM_ABI std::error_code inconvertibleErrorCode()
The value returned by this function can be returned from convertToErrorCode for Error values where no...
Definition Error.cpp:94
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ABI BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, DebugLoc DL, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
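Typical use, mirroring the region-emission helpers in this file: split at the builder's current position and keep emitting in the continuation block:

BasicBlock *ContBB =
    splitBB(Builder.saveIP(), /*CreateBranch=*/true,
            Builder.getCurrentDebugLocation(), "omp.region.cont");
Builder.SetInsertPoint(ContBB); // subsequent code goes into the continuation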
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
std::string utostr(uint64_t X, bool isNeg=false)
void * PointerTy
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
Error make_error(ArgTs &&... Args)
Make a Error instance representing failure using the given error info type.
Definition Error.h:340
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
TargetTransformInfo TTI
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
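A sketch, assuming getThing() is a hypothetical producer whose failure would be a programmer error at this call site:
  llvm::Expected<int> ExpVal = getThing();     // hypothetical producer
  int Val = llvm::cantFail(std::move(ExpVal)); // aborts if it holds an error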
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
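A hedged sketch (BB an assumed BasicBlock* with a single, unconditional predecessor):
  if (llvm::MergeBlockIntoPredecessor(BB)) {
    // BB was spliced into its predecessor and erased; BB is now dangling.
  }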
@ Mul
Product of integers.
@ Add
Sum of integers.
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
ArrayRef(const T &OneElt) -> ArrayRef< T >
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1884
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
LLVM_ABI void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch, DebugLoc DL)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto predecessors(const MachineBasicBlock *BB)
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ Continue
Definition DWP.h:27
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
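For illustration:
  // Requires llvm/ADT/Sequence.h.
  for (int I : llvm::seq(0, 5))
    (void)I; // visits 0, 1, 2, 3, 4; the End value 5 is excluded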
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks BBs from their parent function.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0, the radix is auto-detected from the string's prefix (e.g. "0x" for hexadecimal).
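For illustration:
  // Requires llvm/ADT/StringExtras.h.
  unsigned N = 0;
  bool OK = llvm::to_integer(llvm::StringRef("0x1f"), N, /*Base=*/0);
  // OK == true, N == 31; the "0x" prefix selected hexadecimal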
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
LLVM_ABI void computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
A struct to pack the relevant information for an OpenMP affinity clause.
A struct to pack relevant information while generating atomic Ops.
A struct to pack the relevant information for an OpenMP depend clause.
omp::RTLDependenceKindTy DepKind
A struct to pack static and dynamic dependency information for a task.
Error mergeFiniBB(IRBuilderBase &Builder, BasicBlock *ExistingFiniBB)
For cases where there is an unavoidable existing finalization block (e.g.
Expected< BasicBlock * > getFiniBB(IRBuilderBase &Builder)
The basic block to which control should be transferred to implement the FiniCB.
Description of an LLVM-IR insertion point (IP) and a debug/source location (filename, line, column, ...).
This structure contains combined information generated for mappable clauses, including base pointers, pointers, sizes, map types, user-defined mappers, and non-contiguous information.
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
LLVM_ABI void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
virtual LLVM_ABI std::unique_ptr< CodeExtractor > createCodeExtractor(ArrayRef< BasicBlock * > Blocks, bool ArgsInZeroAddressSpace, Twine Suffix=Twine(""))
Create a CodeExtractor instance based on the information stored in this structure,...
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
ReductionGenDataPtrPtrCBTy DataPtrPtrGen
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * MapTypesArrayEnd
The array of map types passed to the runtime library for the end of the region, or nullptr if there are no separate map types for the end of the region.
Value * BasePointersArray
The array of base pointer passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region, or for the entire region if there are no separate map types for the end of the region.
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
Value * DynCGroupMem
The size of the dynamic shared memory.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback
The fallback mechanism for the shared memory.
Container to pass the default attributes with which a kernel must be launched, used to set kernel attributes and populate associated static structures.
Container to pass LLVM IR runtime values or constants related to the number of teams and threads with which the kernel must be launched, as well as the trip count of the loop, if it is an SPMD or Generic-SPMD kernel.
Value * DeviceID
Device ID value used in the kernel launch.
Value * MaxThreads
'parallel' construct 'num_threads' clause value, if present and it is an SPMD kernel.
Value * LoopTripCount
Total number of iterations of the SPMD or Generic-SPMD kernel or null if it is a generic kernel.
Data structure to contain the information needed to uniquely identify a target entry.
static LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static constexpr const char * KernelNamePrefix
The prefix used for kernel names.
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loop body). When set to 0, the transformation selects an unrolling factor based on the current cost threshold and other heuristics.
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin), device RTL, and clang.