LLVM 23.0.0git
OMPIRBuilder.cpp
Go to the documentation of this file.
1//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file implements the OpenMPIRBuilder class, which is used as a
11/// convenient way to create LLVM instructions for OpenMP directives.
12///
13//===----------------------------------------------------------------------===//
14
17#include "llvm/ADT/SmallSet.h"
20#include "llvm/ADT/StringRef.h"
31#include "llvm/IR/Attributes.h"
32#include "llvm/IR/BasicBlock.h"
33#include "llvm/IR/CFG.h"
34#include "llvm/IR/CallingConv.h"
35#include "llvm/IR/Constant.h"
36#include "llvm/IR/Constants.h"
37#include "llvm/IR/DIBuilder.h"
40#include "llvm/IR/Function.h"
42#include "llvm/IR/IRBuilder.h"
45#include "llvm/IR/LLVMContext.h"
46#include "llvm/IR/MDBuilder.h"
47#include "llvm/IR/Metadata.h"
49#include "llvm/IR/PassManager.h"
51#include "llvm/IR/Value.h"
54#include "llvm/Support/Error.h"
66
67#include <cstdint>
68#include <optional>
69
70#define DEBUG_TYPE "openmp-ir-builder"
71
72using namespace llvm;
73using namespace omp;
74
// Hidden boolean command-line option "openmp-ir-builder-optimistic-attributes".
// It is off by default (cl::init(false)).
75static cl::opt<bool>
76 OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
77 cl::desc("Use optimistic attributes describing "
78 "'as-if' properties of runtime calls."),
79 cl::init(false));
80
82 "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
83 cl::desc("Factor for the unroll threshold to account for code "
84 "simplifications still taking place"),
85 cl::init(1.5));
86
88 "openmp-ir-builder-use-default-max-threads", cl::Hidden,
89 cl::desc("Use a default max threads if none is provided."), cl::init(true));
90
91#ifndef NDEBUG
92/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
93/// at position IP1 may change the meaning of IP2 or vice-versa. This is because
94/// an InsertPoint stores the instruction before which something is inserted. For
95/// instance, if both point to the same instruction, two IRBuilders alternately
96/// creating instructions will cause the instructions to be interleaved.
///
/// If either insert point is unset the result is false; otherwise the two
/// conflict exactly when they share both the block and the position in it.
/// NOTE(review): the signature line(s) of this function are missing from this
/// listing (the embedded numbering jumps from 96 to 99).
99 if (!IP1.isSet() || !IP2.isSet())
100 return false;
101 return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
102}
103
// Validity check for a schedule type value `SchedType`:
//  1. with the monotonicity bits masked off, the value must be one of the
//     enumerators listed in the switch below;
//  2. the two monotonicity bits must not both be set.
// Returns true only when both checks pass.
// NOTE(review): the signature line of this function (embedded line 104) is
// missing from this listing.
105 // Valid ordered/unordered and base algorithm combinations.
106 switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
107 case OMPScheduleType::UnorderedStaticChunked:
108 case OMPScheduleType::UnorderedStatic:
109 case OMPScheduleType::UnorderedDynamicChunked:
110 case OMPScheduleType::UnorderedGuidedChunked:
111 case OMPScheduleType::UnorderedRuntime:
112 case OMPScheduleType::UnorderedAuto:
113 case OMPScheduleType::UnorderedTrapezoidal:
114 case OMPScheduleType::UnorderedGreedy:
115 case OMPScheduleType::UnorderedBalanced:
116 case OMPScheduleType::UnorderedGuidedIterativeChunked:
117 case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
118 case OMPScheduleType::UnorderedSteal:
119 case OMPScheduleType::UnorderedStaticBalancedChunked:
120 case OMPScheduleType::UnorderedGuidedSimd:
121 case OMPScheduleType::UnorderedRuntimeSimd:
122 case OMPScheduleType::OrderedStaticChunked:
123 case OMPScheduleType::OrderedStatic:
124 case OMPScheduleType::OrderedDynamicChunked:
125 case OMPScheduleType::OrderedGuidedChunked:
126 case OMPScheduleType::OrderedRuntime:
127 case OMPScheduleType::OrderedAuto:
128 case OMPScheduleType::OrderdTrapezoidal:
129 case OMPScheduleType::NomergeUnorderedStaticChunked:
130 case OMPScheduleType::NomergeUnorderedStatic:
131 case OMPScheduleType::NomergeUnorderedDynamicChunked:
132 case OMPScheduleType::NomergeUnorderedGuidedChunked:
133 case OMPScheduleType::NomergeUnorderedRuntime:
134 case OMPScheduleType::NomergeUnorderedAuto:
135 case OMPScheduleType::NomergeUnorderedTrapezoidal:
136 case OMPScheduleType::NomergeUnorderedGreedy:
137 case OMPScheduleType::NomergeUnorderedBalanced:
138 case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
139 case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
140 case OMPScheduleType::NomergeUnorderedSteal:
141 case OMPScheduleType::NomergeOrderedStaticChunked:
142 case OMPScheduleType::NomergeOrderedStatic:
143 case OMPScheduleType::NomergeOrderedDynamicChunked:
144 case OMPScheduleType::NomergeOrderedGuidedChunked:
145 case OMPScheduleType::NomergeOrderedRuntime:
146 case OMPScheduleType::NomergeOrderedAuto:
147 case OMPScheduleType::NomergeOrderedTrapezoidal:
148 case OMPScheduleType::OrderedDistributeChunked:
149 case OMPScheduleType::OrderedDistribute:
150 break;
151 default:
// Anything not listed above is rejected.
152 return false;
153 }
154
155 // Must not set both monotonicity modifiers at the same time.
156 OMPScheduleType MonotonicityFlags =
157 SchedType & OMPScheduleType::MonotonicityMask;
158 if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
159 return false;
160
161 return true;
162}
163#endif
164
165/// This is a wrapper over IRBuilderBase::restoreIP that also restores the current
166/// debug location to the last instruction in the specified basic block if the
167/// insert point points to the end of the block.
/// NOTE(review): the signature line(s) of this function are missing from this
/// listing (the embedded numbering jumps from 167 to 170).
170 Builder.restoreIP(IP);
171 llvm::BasicBlock *BB = Builder.GetInsertBlock();
172 llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
// Only a non-empty block whose insert point is end() takes its debug location
// from the block's last instruction; otherwise nothing more is done here.
173 if (!BB->empty() && I == BB->end())
174 Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
175}
176
177static bool hasGridValue(const Triple &T) {
178 return T.isAMDGPU() || T.isNVPTX() || T.isSPIRV();
179}
180
/// Select the omp::GV grid-value description for the triple \p T.
///
/// For AMDGPU the "target-features" attribute string of \p Kernel is searched
/// for "+wavefrontsize64". NVPTX and SPIR-V triples are tested next. Any other
/// triple reaches llvm_unreachable.
/// NOTE(review): the statements guarded by the conditions below (embedded
/// lines 186-187, 190 and 192) are missing from this listing.
181static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
182 if (T.isAMDGPU()) {
183 StringRef Features =
184 Kernel->getFnAttribute("target-features").getValueAsString();
185 if (Features.count("+wavefrontsize64"))
188 }
189 if (T.isNVPTX())
191 if (T.isSPIRV())
193 llvm_unreachable("No grid value available for this architecture!");
194}
195
196/// Determine which scheduling algorithm to use, determined from schedule clause
197/// arguments.
198static OMPScheduleType
199getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
200 bool HasSimdModifier, bool HasDistScheduleChunks) {
201 // Currently, the default schedule it static.
202 switch (ClauseKind) {
203 case OMP_SCHEDULE_Default:
204 case OMP_SCHEDULE_Static:
205 return HasChunks ? OMPScheduleType::BaseStaticChunked
206 : OMPScheduleType::BaseStatic;
207 case OMP_SCHEDULE_Dynamic:
208 return OMPScheduleType::BaseDynamicChunked;
209 case OMP_SCHEDULE_Guided:
210 return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
211 : OMPScheduleType::BaseGuidedChunked;
212 case OMP_SCHEDULE_Auto:
214 case OMP_SCHEDULE_Runtime:
215 return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
216 : OMPScheduleType::BaseRuntime;
217 case OMP_SCHEDULE_Distribute:
218 return HasDistScheduleChunks ? OMPScheduleType::BaseDistributeChunked
219 : OMPScheduleType::BaseDistribute;
220 }
221 llvm_unreachable("unhandled schedule clause argument");
222}
223
224/// Adds ordering modifier flags to schedule type.
///
/// The incoming base type must carry no modifier bits (asserted below).
/// ModifierOrdered or ModifierUnordered is OR-ed in according to
/// HasOrderedClause. The two ordered+simd combinations are not passed through;
/// they are replaced by OrderedGuidedChunked and OrderedRuntime respectively.
/// NOTE(review): the signature line carrying the function name (embedded line
/// 226) is missing from this listing.
225static OMPScheduleType
227 bool HasOrderedClause) {
228 assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
229 OMPScheduleType::None &&
230 "Must not have ordering nor monotonicity flags already set");
231
232 OMPScheduleType OrderingModifier = HasOrderedClause
233 ? OMPScheduleType::ModifierOrdered
234 : OMPScheduleType::ModifierUnordered;
235 OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;
236
237 // Unsupported combinations
238 if (OrderingScheduleType ==
239 (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
240 return OMPScheduleType::OrderedGuidedChunked;
241 else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
242 OMPScheduleType::ModifierOrdered))
243 return OMPScheduleType::OrderedRuntime;
244
245 return OrderingScheduleType;
246}
247
248/// Adds monotonicity modifier flags to schedule type.
///
/// The incoming type must carry no monotonicity bits, and HasMonotonic and
/// HasNonmonotonic must not both be true (both asserted). An explicit modifier
/// is OR-ed in directly. With no explicit modifier, static schedules and loops
/// with an ordered clause are returned unchanged, and everything else gets
/// ModifierNonmonotonic.
/// NOTE(review): the signature line carrying the function name (embedded line
/// 250) is missing from this listing.
249static OMPScheduleType
251 bool HasSimdModifier, bool HasMonotonic,
252 bool HasNonmonotonic, bool HasOrderedClause) {
253 assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
254 OMPScheduleType::None &&
255 "Must not have monotonicity flags already set");
256 assert((!HasMonotonic || !HasNonmonotonic) &&
257 "Monotonic and Nonmonotonic are contradicting each other");
258
259 if (HasMonotonic) {
260 return ScheduleType | OMPScheduleType::ModifierMonotonic;
261 } else if (HasNonmonotonic) {
262 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
263 } else {
264 // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
265 // If the static schedule kind is specified or if the ordered clause is
266 // specified, and if the nonmonotonic modifier is not specified, the
267 // effect is as if the monotonic modifier is specified. Otherwise, unless
268 // the monotonic modifier is specified, the effect is as if the
269 // nonmonotonic modifier is specified.
// Strip every modifier bit so only the base algorithm is compared.
270 OMPScheduleType BaseScheduleType =
271 ScheduleType & ~OMPScheduleType::ModifierMask;
272 if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
273 (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
274 HasOrderedClause) {
275 // The monotonic is used by default in openmp runtime library, so no need
276 // to set it.
277 return ScheduleType;
278 } else {
279 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
280 }
281 }
282}
283
284/// Determine the schedule type using schedule and ordering clause arguments.
///
/// The visible steps form a pipeline: a base schedule is derived from the
/// clause arguments, getOpenMPOrderingScheduleType() adds the ordering
/// modifier, and a final step takes the ordered schedule together with the
/// simd/monotonic/nonmonotonic/ordered flags to produce `Result`.
/// NOTE(review): embedded lines 290, 294 and 298 are missing from this
/// listing, so the declarations of `BaseSchedule` and `Result` (and whatever
/// statement precedes the return) are not visible here.
285static OMPScheduleType
286computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
287 bool HasSimdModifier, bool HasMonotonicModifier,
288 bool HasNonmonotonicModifier, bool HasOrderedClause,
289 bool HasDistScheduleChunks) {
291 ClauseKind, HasChunks, HasSimdModifier, HasDistScheduleChunks);
292 OMPScheduleType OrderedSchedule =
293 getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
295 OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
296 HasNonmonotonicModifier, HasOrderedClause);
297
299 return Result;
300}
301
302/// Given a function, if it represents the entry point of a target kernel, this
303/// returns the execution mode flags associated with that kernel.
///
/// Only the entry block of Kernel is scanned. std::nullopt is returned when it
/// contains no call whose callee is named "__kmpc_target_init".
/// NOTE(review): the signature line (embedded line 305) is missing from this
/// listing.
304static std::optional<omp::OMPTgtExecModeFlags>
306 CallInst *TargetInitCall = nullptr;
307 for (Instruction &Inst : Kernel.getEntryBlock()) {
308 if (auto *Call = dyn_cast<CallInst>(&Inst)) {
// NOTE(review): the result of getCalledFunction() is dereferenced without a
// null check; confirm that an indirect call can never appear in a kernel
// entry block, otherwise this is a null dereference.
309 if (Call->getCalledFunction()->getName() == "__kmpc_target_init") {
310 TargetInitCall = Call;
311 break;
312 }
313 }
314 }
315
316 if (!TargetInitCall)
317 return std::nullopt;
318
319 // Get the kernel mode information from the global variable associated to the
320 // first argument to the call to __kmpc_target_init. Refer to
321 // createTargetInit() to see how this is initialized.
322 Value *InitOperand = TargetInitCall->getArgOperand(0);
323 GlobalVariable *KernelEnv = nullptr;
// The operand is either the global itself or a constant expression whose
// first operand is the global.
324 if (auto *Cast = dyn_cast<ConstantExpr>(InitOperand))
325 KernelEnv = cast<GlobalVariable>(Cast->getOperand(0));
326 else
327 KernelEnv = cast<GlobalVariable>(InitOperand);
// Walk initializer -> operand 0 (a struct) -> operand 2 (an integer); the
// cast<> calls assert on any other shape.
328 auto *KernelEnvInit = cast<ConstantStruct>(KernelEnv->getInitializer());
329 auto *ConfigEnv = cast<ConstantStruct>(KernelEnvInit->getOperand(0));
330 auto *KernelMode = cast<ConstantInt>(ConfigEnv->getOperand(2));
331 return static_cast<OMPTgtExecModeFlags>(KernelMode->getZExtValue());
332}
333
/// Returns true when no execution mode is available in `ExecMode`, or when
/// the mode has the OMP_TGT_EXEC_MODE_GENERIC bit set.
/// NOTE(review): the initializer of `ExecMode` (embedded line 336) is missing
/// from this listing.
334static bool isGenericKernel(Function &Fn) {
335 std::optional<omp::OMPTgtExecModeFlags> ExecMode =
337 return !ExecMode || (*ExecMode & OMP_TGT_EXEC_MODE_GENERIC);
338}
339
340/// Make \p Source branch to \p Target.
341///
342/// Handles two situations:
343/// * \p Source already has an unconditional branch.
344/// * \p Source is a degenerate block (no terminator because the BB is
345/// the current head of the IR construction).
///
/// NOTE(review): the signature line (embedded line 346) is missing from this
/// listing.
347 if (Instruction *Term = Source->getTerminatorOrNull()) {
// An existing terminator must be an unconditional branch (cast<> asserts).
// The old successor drops Source as predecessor before the branch is
// retargeted.
348 auto *Br = cast<UncondBrInst>(Term);
349 BasicBlock *Succ = Br->getSuccessor();
350 Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
351 Br->setSuccessor(Target);
352 return;
353 }
354
// No terminator yet: append a new branch and give it the debug location DL.
355 auto *NewBr = UncondBrInst::Create(Target, Source);
356 NewBr->setDebugLoc(DL);
357}
358
// Moves the instructions from the insert point IP up to the end of IP's block
// into the start of `New`. When CreateBranch is set, a branch to `New`
// carrying debug location DL is then appended to the old block.
// NOTE(review): the first signature line (embedded line 359) is missing from
// this listing.
360 bool CreateBranch, DebugLoc DL) {
361 assert(New->getFirstInsertionPt() == New->begin() &&
362 "Target BB must not have PHI nodes");
363
364 // Move instructions to new block.
365 BasicBlock *Old = IP.getBlock();
366 // If the `Old` block is empty then there are no instructions to move. But in
367 // the new debug scheme, it could have trailing debug records which will be
368 // moved to `New` in `spliceDebugInfoEmptyBlock`. We don't want that for 2
369 // reasons:
370 // 1. If `New` is also empty, `BasicBlock::splice` crashes.
371 // 2. Even if `New` is not empty, the rationale to move those records to `New`
372 // (in `spliceDebugInfoEmptyBlock`) does not apply here. That function
373 // assumes that `Old` is optimized out and is going away. This is not the case
374 // here. The `Old` block is still being used e.g. a branch instruction is
375 // added to it later in this function.
376 // So we call `BasicBlock::splice` only when `Old` is not empty.
377 if (!Old->empty())
378 New->splice(New->begin(), Old, IP.getPoint(), Old->end());
379
380 if (CreateBranch) {
381 auto *NewBr = UncondBrInst::Create(New, Old);
382 NewBr->setDebugLoc(DL);
383 }
384}
385
386void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
387 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
388 BasicBlock *Old = Builder.GetInsertBlock();
389
390 spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
391 if (CreateBranch)
392 Builder.SetInsertPoint(Old->getTerminator());
393 else
394 Builder.SetInsertPoint(Old);
395
396 // SetInsertPoint also updates the Builder's debug location, but we want to
397 // keep the one the Builder was configured to use.
398 Builder.SetCurrentDebugLocation(DebugLoc);
399}
400
// Creates a block `New` in the function of IP's block, positioned before that
// block's next node and named `Name` (or after the old block when `Name` is
// trivially empty). The tail starting at IP is spliced into it, and PHI uses
// of the old block in the successors are rewritten to refer to `New`.
// NOTE(review): embedded lines 401 and 404 (first signature line and the
// start of the `New` declaration) are missing from this listing.
402 DebugLoc DL, llvm::Twine Name) {
403 BasicBlock *Old = IP.getBlock();
405 Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
406 Old->getParent(), Old->getNextNode());
407 spliceBB(IP, New, CreateBranch, DL);
408 New->replaceSuccessorsPhiUsesWith(Old, New);
409 return New;
410}
411
412BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
413 llvm::Twine Name) {
414 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
415 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
416 if (CreateBranch)
417 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
418 else
419 Builder.SetInsertPoint(Builder.GetInsertBlock());
420 // SetInsertPoint also updates the Builder's debug location, but we want to
421 // keep the one the Builder was configured to use.
422 Builder.SetCurrentDebugLocation(DebugLoc);
423 return New;
424}
425
426BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
427 llvm::Twine Name) {
428 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
429 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
430 if (CreateBranch)
431 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
432 else
433 Builder.SetInsertPoint(Builder.GetInsertBlock());
434 // SetInsertPoint also updates the Builder's debug location, but we want to
435 // keep the one the Builder was configured to use.
436 Builder.SetCurrentDebugLocation(DebugLoc);
437 return New;
438}
439
// Splits at the builder's insert point, naming the new block after the
// builder's current insert block with `Suffix` appended.
// NOTE(review): the first signature line (embedded line 440) is missing from
// this listing.
441 llvm::Twine Suffix) {
442 BasicBlock *Old = Builder.GetInsertBlock();
443 return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
444}
445
446// This function creates a fake integer value and a fake use for the integer
447// value. It returns the fake value created. This is useful in modeling the
448// extra arguments to the outlined functions.
//
// The value is an alloca of i32 (or i64 when Is64Bit) created at
// OuterAllocaIP. With AsPtr the alloca itself is the fake value; otherwise a
// load from it is. The fake use is created at InnerAllocaIP. Every
// instruction created here is appended to ToBeDeleted.
// NOTE(review): embedded lines 449 and 451 (parts of the signature) are
// missing from this listing.
450 OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
452 OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
453 const Twine &Name = "", bool AsPtr = true,
454 bool Is64Bit = false) {
455 Builder.restoreIP(OuterAllocaIP);
456 IntegerType *IntTy = Is64Bit ? Builder.getInt64Ty() : Builder.getInt32Ty();
457 Instruction *FakeVal;
458 AllocaInst *FakeValAddr =
459 Builder.CreateAlloca(IntTy, nullptr, Name + ".addr");
460 ToBeDeleted.push_back(FakeValAddr);
461
462 if (AsPtr) {
463 FakeVal = FakeValAddr;
464 } else {
465 FakeVal = Builder.CreateLoad(IntTy, FakeValAddr, Name + ".val");
466 ToBeDeleted.push_back(FakeVal);
467 }
468
469 // Generate a fake use of this value
470 Builder.restoreIP(InnerAllocaIP);
471 Instruction *UseFakeVal;
// The use is a load through the pointer when AsPtr is set, otherwise an add
// of the constant 10 to the loaded value.
472 if (AsPtr) {
473 UseFakeVal = Builder.CreateLoad(IntTy, FakeVal, Name + ".use");
474 } else {
475 UseFakeVal = cast<BinaryOperator>(Builder.CreateAdd(
476 FakeVal, Is64Bit ? Builder.getInt64(10) : Builder.getInt32(10)));
477 }
478 ToBeDeleted.push_back(UseFakeVal);
479 return FakeVal;
480}
481
482//===----------------------------------------------------------------------===//
483// OpenMPIRBuilderConfig
484//===----------------------------------------------------------------------===//
485
486namespace {
488/// Values for bit flags for marking which requires clauses have been used.
489enum OpenMPOffloadingRequiresDirFlags {
490 /// flag undefined.
491 OMP_REQ_UNDEFINED = 0x000,
492 /// no requires directive present.
493 OMP_REQ_NONE = 0x001,
494 /// reverse_offload clause.
495 OMP_REQ_REVERSE_OFFLOAD = 0x002,
496 /// unified_address clause.
497 OMP_REQ_UNIFIED_ADDRESS = 0x004,
498 /// unified_shared_memory clause.
499 OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
500 /// dynamic_allocators clause.
501 OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
502 LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
503};
504
505class OMPCodeExtractor : public CodeExtractor {
506public:
507 OMPCodeExtractor(OpenMPIRBuilder &OMPBuilder, ArrayRef<BasicBlock *> BBs,
508 DominatorTree *DT = nullptr, bool AggregateArgs = false,
509 BlockFrequencyInfo *BFI = nullptr,
510 BranchProbabilityInfo *BPI = nullptr,
511 AssumptionCache *AC = nullptr, bool AllowVarArgs = false,
512 bool AllowAlloca = false,
513 BasicBlock *AllocationBlock = nullptr,
514 ArrayRef<BasicBlock *> DeallocationBlocks = {},
515 std::string Suffix = "", bool ArgsInZeroAddressSpace = false)
516 : CodeExtractor(BBs, DT, AggregateArgs, BFI, BPI, AC, AllowVarArgs,
517 AllowAlloca, AllocationBlock, DeallocationBlocks, Suffix,
518 ArgsInZeroAddressSpace),
519 OMPBuilder(OMPBuilder) {}
520
521 virtual ~OMPCodeExtractor() = default;
522
523protected:
524 OpenMPIRBuilder &OMPBuilder;
525};
526
527class DeviceSharedMemCodeExtractor : public OMPCodeExtractor {
528public:
529 using OMPCodeExtractor::OMPCodeExtractor;
530 virtual ~DeviceSharedMemCodeExtractor() = default;
531
532protected:
533 virtual Instruction *
534 allocateVar(IRBuilder<>::InsertPoint AllocaIP, Type *VarType,
535 const Twine &Name = Twine(""),
536 AddrSpaceCastInst **CastedAlloc = nullptr) override {
537 return OMPBuilder.createOMPAllocShared(AllocaIP, VarType, Name);
538 }
539
540 virtual Instruction *deallocateVar(IRBuilder<>::InsertPoint DeallocIP,
541 Value *Var, Type *VarType) override {
542 return OMPBuilder.createOMPFreeShared(DeallocIP, Var, VarType);
543 }
544};
545
546/// Helper storing information about regions to outline using device shared
547/// memory for intermediate allocations.
548struct DeviceSharedMemOutlineInfo : public OpenMPIRBuilder::OutlineInfo {
549 OpenMPIRBuilder &OMPBuilder;
550
551 DeviceSharedMemOutlineInfo(OpenMPIRBuilder &OMPBuilder)
552 : OMPBuilder(OMPBuilder) {}
553 virtual ~DeviceSharedMemOutlineInfo() = default;
554
555 virtual std::unique_ptr<CodeExtractor>
556 createCodeExtractor(ArrayRef<BasicBlock *> Blocks,
557 bool ArgsInZeroAddressSpace,
558 Twine Suffix = Twine("")) override;
559};
560
561} // anonymous namespace
562
// NOTE(review): every member definition in this run has lost its signature
// line(s) in this listing; only initializer lists and bodies are visible. The
// comments below describe the visible bodies only.

// Constructor fragment: RequiresFlags starts out as OMP_REQ_UNDEFINED.
564 : RequiresFlags(OMP_REQ_UNDEFINED) {}
565
// Constructor fragment: starts from OMP_REQ_UNDEFINED and sets one flag bit
// for each Has* argument that is true.
568 bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
569 bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
572 RequiresFlags(OMP_REQ_UNDEFINED) {
573 if (HasRequiresReverseOffload)
574 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
575 if (HasRequiresUnifiedAddress)
576 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
577 if (HasRequiresUnifiedSharedMemory)
578 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
579 if (HasRequiresDynamicAllocators)
580 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
581}
582
// Query fragment: tests the reverse_offload bit.
584 return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
585}
586
// Query fragment: tests the unified_address bit.
588 return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
589}
590
// Query fragment: tests the unified_shared_memory bit.
592 return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
593}
594
// Query fragment: tests the dynamic_allocators bit.
596 return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
597}
598
// Query fragment: yields RequiresFlags when hasRequiresFlags() is true,
// otherwise OMP_REQ_NONE.
600 return hasRequiresFlags() ? RequiresFlags
601 : static_cast<int64_t>(OMP_REQ_NONE);
602}
603
// Setter fragment: sets or clears the reverse_offload bit according to Value.
605 if (Value)
606 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
607 else
608 RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
609}
610
// Setter fragment: sets or clears the unified_address bit according to Value.
612 if (Value)
613 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
614 else
615 RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
616}
617
// Setter fragment: sets or clears the unified_shared_memory bit according to
// Value.
619 if (Value)
620 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
621 else
622 RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
623}
624
// Setter fragment: sets or clears the dynamic_allocators bit according to
// Value.
626 if (Value)
627 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
628 else
629 RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
630}
631
632//===----------------------------------------------------------------------===//
633// OpenMPIRBuilder
634//===----------------------------------------------------------------------===//
635
// Fills ArgsVector with thirteen values taken from KernelArgs, in the order
// given by the braced list at the end of the function.
// NOTE(review): embedded lines 636, 637 and 639 are missing from this listing,
// so the start of the signature and the definition of `Version` are not
// visible here.
638 SmallVector<Value *> &ArgsVector) {
640 Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
641 auto Int32Ty = Type::getInt32Ty(Builder.getContext());
642 constexpr size_t MaxDim = 3;
643 Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
644
// Flags word: HasNoWait is used as-is, DynCGroupMemFallback is shifted left by
// two bits, and the two are OR-ed together.
645 Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait);
646
647 Value *DynCGroupMemFallbackFlag =
648 Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback));
649 DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2);
650 Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag);
651
652 assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());
653
// Both arrays start as three zeros. Element 0 is always filled in; elements 1
// and 2 only when the corresponding vector has that many entries (capped at
// MaxDim).
654 Value *NumTeams3D =
655 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
656 Value *NumThreads3D =
657 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
658 for (unsigned I :
659 seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
660 NumTeams3D =
661 Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
662 for (unsigned I :
663 seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
664 NumThreads3D =
665 Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});
666
667 ArgsVector = {Version,
668 PointerNum,
669 KernelArgs.RTArgs.BasePointersArray,
670 KernelArgs.RTArgs.PointersArray,
671 KernelArgs.RTArgs.SizesArray,
672 KernelArgs.RTArgs.MapTypesArray,
673 KernelArgs.RTArgs.MapNamesArray,
674 KernelArgs.RTArgs.MappersArray,
675 KernelArgs.NumIterations,
676 Flags,
677 NumTeams3D,
678 NumThreads3D,
679 KernelArgs.DynCGroupMem};
680}
681
// Merges the attribute sets that OMPKinds.def associates with the runtime
// function FnID into the attributes Fn already has, then installs the merged
// list on Fn. A FnID without an entry leaves Fn unchanged.
// NOTE(review): embedded lines 682 and 689 (the signature and the declaration
// of `ArgAttrs`) are missing from this listing.
683 LLVMContext &Ctx = Fn.getContext();
684
685 // Get the function's current attributes.
686 auto Attrs = Fn.getAttributes();
687 auto FnAttrs = Attrs.getFnAttrs();
688 auto RetAttrs = Attrs.getRetAttrs();
690 for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
691 ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));
692
693 // Add AS to FnAS while taking special care with integer extensions.
// A sign/zero-extension attribute is not copied verbatim: the attribute to
// add is obtained from TargetLibraryInfo (parameter or return flavour) and is
// added only when that lookup yields one. Any other set is added as a whole.
694 auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
695 bool Param = true) -> void {
696 bool HasSignExt = AS.hasAttribute(Attribute::SExt);
697 bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
698 if (HasSignExt || HasZeroExt) {
699 assert(AS.getNumAttributes() == 1 &&
700 "Currently not handling extension attr combined with others.");
701 if (Param) {
702 if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
703 FnAS = FnAS.addAttribute(Ctx, AK);
704 } else if (auto AK =
705 TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
706 FnAS = FnAS.addAttribute(Ctx, AK);
707 } else {
708 FnAS = FnAS.addAttributes(Ctx, AS);
709 }
710 };
711
// First inclusion: each OMP_ATTRS_SET entry becomes a local AttributeSet.
712#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
713#include "llvm/Frontend/OpenMP/OMPKinds.def"
714
715 // Add attributes to the function declaration.
// Second inclusion: each OMP_RTL_ATTRS entry becomes one case of the switch.
716 switch (FnID) {
717#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets) \
718 case Enum: \
719 FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet); \
720 addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false); \
721 for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo) \
722 addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]); \
723 Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs)); \
724 break;
725#include "llvm/Frontend/OpenMP/OMPKinds.def"
726 default:
727 // Attributes are optional.
728 break;
729 }
730}
731
// Looks up the runtime function identified by FnID in module M by name and
// creates an external declaration when it is absent. Returns the pair
// {function type, function}.
// NOTE(review): embedded lines 732, 733 and 770 (the signature and part of the
// callback metadata construction) are missing from this listing.
734 FunctionType *FnTy = nullptr;
735 Function *Fn = nullptr;
736
737 // Try to find the declaration in the module first.
738 switch (FnID) {
739#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...) \
740 case Enum: \
741 FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__}, \
742 IsVarArg); \
743 Fn = M.getFunction(Str); \
744 break;
745#include "llvm/Frontend/OpenMP/OMPKinds.def"
746 }
747
748 if (!Fn) {
749 // Create a new declaration if we need one.
750 switch (FnID) {
751#define OMP_RTL(Enum, Str, ...) \
752 case Enum: \
753 Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M); \
754 break;
755#include "llvm/Frontend/OpenMP/OMPKinds.def"
756 }
// Calling convention, callback metadata and attributes are applied only to
// declarations created here, not to functions already present in the module.
757 Fn->setCallingConv(Config.getRuntimeCC());
758 // Add information if the runtime function takes a callback function
759 if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
760 if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
761 LLVMContext &Ctx = Fn->getContext();
762 MDBuilder MDB(Ctx);
763 // Annotate the callback behavior of the runtime function:
764 // - The callback callee is argument number 2 (microtask).
765 // - The first two arguments of the callback callee are unknown (-1).
766 // - All variadic arguments to the runtime function are passed to the
767 // callback callee.
768 Fn->addMetadata(
769 LLVMContext::MD_callback,
771 2, {-1, -1}, /* VarArgsArePassed */ true)}));
772 }
773 }
774
775 LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
776 << " with type " << *Fn->getFunctionType() << "\n");
777 addAttributes(FnID, *Fn);
778
779 } else {
780 LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
781 << " with type " << *Fn->getFunctionType() << "\n");
782 }
783
784 assert(Fn && "Failed to create OpenMP runtime function");
785
786 return {FnTy, Fn};
787}
788
// Returns FiniBB, creating it on first use: a block named ".fini" is added to
// the function containing the builder's insert block, the builder is moved
// into it, and FiniCB is run there. An error from FiniCB is returned instead.
// NOTE(review): embedded lines 789, 790 and 793 (the signature and one
// statement) are missing from this listing.
791 if (!FiniBB) {
792 Function *ParentFunc = Builder.GetInsertBlock()->getParent();
794 FiniBB = BasicBlock::Create(Builder.getContext(), ".fini", ParentFunc);
795 Builder.SetInsertPoint(FiniBB);
796 // FiniCB adds the branch to the exit stub.
797 if (Error Err = FiniCB(Builder.saveIP()))
798 return Err;
799 }
800 return FiniBB;
801}
802
// Makes OtherFiniBB the finalization block. If no FiniBB exists yet, FiniCB is
// emitted directly at the first non-PHI position of OtherFiniBB. Otherwise the
// contents of the existing FiniBB are moved into OtherFiniBB and FiniBB is
// replaced by it.
// NOTE(review): the first signature line (embedded line 803) is missing from
// this listing.
804 BasicBlock *OtherFiniBB) {
805 // Simple case: FiniBB does not exist yet: re-use OtherFiniBB.
806 if (!FiniBB) {
807 FiniBB = OtherFiniBB;
808
809 Builder.SetInsertPoint(FiniBB->getFirstNonPHIIt());
810 if (Error Err = FiniCB(Builder.saveIP()))
811 return Err;
812
813 return Error::success();
814 }
815
816 // Move instructions from FiniBB to the start of OtherFiniBB.
// A trailing terminator, if present, is excluded from the moved range.
817 auto EndIt = FiniBB->end();
818 if (FiniBB->size() >= 1)
819 if (auto Prev = std::prev(EndIt); Prev->isTerminator())
820 EndIt = Prev;
821 OtherFiniBB->splice(OtherFiniBB->getFirstNonPHIIt(), FiniBB, FiniBB->begin(),
822 EndIt);
823
// Redirect every use of the old block, delete it, and adopt the new block.
824 FiniBB->replaceAllUsesWith(OtherFiniBB);
825 FiniBB->eraseFromParent();
826 FiniBB = OtherFiniBB;
827 return Error::success();
828}
829
// Returns the callee of `RTLFn` as an llvm::Function. A callee that is not a
// Function trips the assertion; with assertions disabled a null pointer would
// be returned.
// NOTE(review): embedded lines 830-831 (the signature and the definition of
// `RTLFn`) are missing from this listing.
832 auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
833 assert(Fn && "Failed to create OpenMP runtime function pointer");
834 return Fn;
835}
836
// Emits a call to `Callee` with `Args` through the builder and gives the call
// the calling convention returned by Config.getRuntimeCC().
// NOTE(review): the leading signature lines (embedded lines 837-838) are
// missing from this listing.
839 StringRef Name) {
840 CallInst *Call = Builder.CreateCall(Callee, Args, Name);
841 Call->setCallingConv(Config.getRuntimeCC());
842 return Call;
843}
844
845void OpenMPIRBuilder::initialize() { initializeTypes(M); }
846
// Walks every block of `Function` except the entry block and moves selected
// allocas to the first non-PHI position of the entry block.
// NOTE(review): embedded lines 847-848, 858 and 860 are missing from this
// listing. The signature, the test that selects an instruction as an alloca
// and the condition guarding `continue` are therefore not visible here.
849 BasicBlock &EntryBlock = Function->getEntryBlock();
850 BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();
851
852 // Loop over blocks looking for constant allocas, skipping the entry block
853 // as any allocas there are already in the desired location.
854 for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
855 Block++) {
// The iterator is advanced before the instruction is moved, so moving it does
// not disturb the traversal.
856 for (auto Inst = Block->getReverseIterator()->begin();
857 Inst != Block->getReverseIterator()->end();) {
859 Inst++;
861 continue;
862 AllocaInst->moveBeforePreserving(MoveLocInst);
863 } else {
864 Inst++;
865 }
866 }
867 }
868}
869
872
// Collects the allocas of `Block` accepted by ShouldHoistAlloca into
// AllocasToMove, then visits them with the entry-block terminator of the
// enclosing function as the insertion position.
// NOTE(review): embedded lines 870-871, 877, 881 and 889 are missing from this
// listing. The signature, the predicate's return expression, the alloca test
// in the first loop and the body of the final loop are therefore not visible
// here.
873 auto ShouldHoistAlloca = [](const llvm::AllocaInst &AllocaInst) {
874 // TODO: For now, we support simple static allocations, we might need to
875 // move non-static ones as well. However, this will need further analysis to
876 // move the length arguments as well.
878 };
879
880 for (llvm::Instruction &Inst : Block)
882 if (ShouldHoistAlloca(*AllocaInst))
883 AllocasToMove.push_back(AllocaInst);
884
885 auto InsertPoint =
886 Block.getParent()->getEntryBlock().getTerminator()->getIterator();
887
888 for (llvm::Instruction *AllocaInst : AllocasToMove)
890}
891
// Builds a post-dominator tree for `Func` and visits every block that
// properly post-dominates the entry block.
// NOTE(review): embedded lines 892 and 896 (the signature and the statement
// executed for each such block) are missing from this listing.
893 PostDominatorTree PostDomTree(*Func);
894 for (llvm::BasicBlock &BB : *Func)
895 if (PostDomTree.properlyDominates(&BB, &Func->getEntryBlock()))
897}
898
900 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
902 SmallVector<std::unique_ptr<OutlineInfo>, 16> DeferredOutlines;
903 for (std::unique_ptr<OutlineInfo> &OI : OutlineInfos) {
904 // Skip functions that have not finalized yet; may happen with nested
905 // function generation.
906 if (Fn && OI->getFunction() != Fn) {
907 DeferredOutlines.push_back(std::move(OI));
908 continue;
909 }
910
911 ParallelRegionBlockSet.clear();
912 Blocks.clear();
913 OI->collectBlocks(ParallelRegionBlockSet, Blocks);
914
915 Function *OuterFn = OI->getFunction();
916 CodeExtractorAnalysisCache CEAC(*OuterFn);
917 // If we generate code for the target device, we need to allocate
918 // struct for aggregate params in the device default alloca address space.
919 // OpenMP runtime requires that the params of the extracted functions are
920 // passed as zero address space pointers. This flag ensures that
921 // CodeExtractor generates correct code for extracted functions
922 // which are used by OpenMP runtime.
923 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
924 std::unique_ptr<CodeExtractor> Extractor =
925 OI->createCodeExtractor(Blocks, ArgsInZeroAddressSpace, ".omp_par");
926
927 LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
928 LLVM_DEBUG(dbgs() << "Entry " << OI->EntryBB->getName()
929 << " Exit: " << OI->ExitBB->getName() << "\n");
930 assert(Extractor->isEligible() &&
931 "Expected OpenMP outlining to be possible!");
932
933 for (auto *V : OI->ExcludeArgsFromAggregate)
934 Extractor->excludeArgFromAggregate(V);
935
936 Function *OutlinedFn =
937 Extractor->extractCodeRegion(CEAC, OI->Inputs, OI->Outputs);
938
939 // Forward target-cpu, target-features attributes to the outlined function.
940 auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
941 if (TargetCpuAttr.isStringAttribute())
942 OutlinedFn->addFnAttr(TargetCpuAttr);
943
944 auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
945 if (TargetFeaturesAttr.isStringAttribute())
946 OutlinedFn->addFnAttr(TargetFeaturesAttr);
947
948 LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
949 LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
950 assert(OutlinedFn->getReturnType()->isVoidTy() &&
951 "OpenMP outlined functions should not return a value!");
952
953 // For compatibility with the clang CG we move the outlined function after the
954 // one with the parallel region.
955 OutlinedFn->removeFromParent();
956 M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);
957
958 // Remove the artificial entry introduced by the extractor right away, we
959 // made our own entry block after all.
960 {
961 BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
962 assert(ArtificialEntry.getUniqueSuccessor() == OI->EntryBB);
963 assert(OI->EntryBB->getUniquePredecessor() == &ArtificialEntry);
964 // Move instructions from the to-be-deleted ArtificialEntry to the entry
965 // basic block of the parallel region. CodeExtractor generates
966 // instructions to unwrap the aggregate argument and may sink
967 // allocas/bitcasts for values that are solely used in the outlined region
968 // and do not escape.
969 assert(!ArtificialEntry.empty() &&
970 "Expected instructions to add in the outlined region entry");
971 for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
972 End = ArtificialEntry.rend();
973 It != End;) {
974 Instruction &I = *It;
975 It++;
976
977 if (I.isTerminator()) {
978 // Absorb any debug value that terminator may have
979 if (Instruction *TI = OI->EntryBB->getTerminatorOrNull())
980 TI->adoptDbgRecords(&ArtificialEntry, I.getIterator(), false);
981 continue;
982 }
983
984 I.moveBeforePreserving(*OI->EntryBB,
985 OI->EntryBB->getFirstInsertionPt());
986 }
987
988 OI->EntryBB->moveBefore(&ArtificialEntry);
989 ArtificialEntry.eraseFromParent();
990 }
991 assert(&OutlinedFn->getEntryBlock() == OI->EntryBB);
992 assert(OutlinedFn && OutlinedFn->hasNUses(1));
993
994 // Run a user callback, e.g. to add attributes.
995 if (OI->PostOutlineCB)
996 OI->PostOutlineCB(*OutlinedFn);
997
998 if (OI->FixUpNonEntryAllocas)
1000 }
1001
1002 // Remove work items that have been completed.
1003 OutlineInfos = std::move(DeferredOutlines);
1004
1005 // The createTarget functions embeds user written code into
1006 // the target region which may inject allocas which need to
1007 // be moved to the entry block of our target or risk malformed
1008 // optimisations by later passes, this is only relevant for
1009 // the device pass which appears to be a little more delicate
1010 // when it comes to optimisations (however, we do not block on
1011 // that here, it's up to the inserter to the list to do so).
1012 // This notably has to occur after the OutlinedInfo candidates
1013 // have been extracted so we have an end product that will not
1014 // be implicitly adversely affected by any raises unless
1015 // intentionally appended to the list.
1016 // NOTE: This only does so for ConstantData, it could be extended
1017 // to ConstantExpr's with further effort, however, they should
1018 // largely be folded when they get here. Extending it to runtime
1019 // defined/read+writeable allocation sizes would be non-trivial
1020 // (need to factor in movement of any stores to variables the
1021 // allocation size depends on, as well as the usual loads,
1022 // otherwise it'll yield the wrong result after movement) and
1023 // likely be more suitable as an LLVM optimisation pass.
1026
1027 EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
1028 [](EmitMetadataErrorKind Kind,
1029 const TargetRegionEntryInfo &EntryInfo) -> void {
1030 errs() << "Error of kind: " << Kind
1031 << " when emitting offload entries and metadata during "
1032 "OMPIRBuilder finalization \n";
1033 };
1034
1035 if (!OffloadInfoManager.empty())
1037
1038 if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
1039 std::vector<WeakTrackingVH> LLVMCompilerUsed = {
1040 M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
1041 emitUsed("llvm.compiler.used", LLVMCompilerUsed);
1042 }
1043
1044 IsFinalized = true;
1045}
1046
/// Report whether this builder has been finalized. Simply returns the
/// IsFinalized member, which the routine defined just above sets to true as
/// its final step.
1047bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }
1048
// NOTE(review): the signature line of this definition is missing from this
// listing (presumably the OpenMPIRBuilder destructor — confirm upstream).
// The body only checks, in assert-enabled builds, that no outlining work is
// still queued in OutlineInfos.
1050 assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
1051}
1052
// NOTE(review): the signature line of this definition is missing from this
// listing; `Value` and `Name` are used below and are presumably its
// parameters — confirm upstream.
//
// Creates a constant i32 global variable in module M, initialised to Value
// and named Name, with weak_odr linkage and hidden visibility, and returns it.
1054 IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
1055 auto *GV =
1056 new GlobalVariable(M, I32Ty,
1057 /* isConstant = */ true, GlobalValue::WeakODRLinkage,
1058 ConstantInt::get(I32Ty, Value), Name);
1059 GV->setVisibility(GlobalValue::HiddenVisibility);
1060
1061 return GV;
1062}
1063
// NOTE(review): the signature line and two body lines (the declaration of
// UsedArray and the head of the assignment inside the loop) are missing from
// this listing; `Name` and `List` are presumably parameters — confirm.
//
// Emits an appending-linkage global array named Name, placed in the
// "llvm.metadata" section, whose elements are built from the entries of List.
// Nothing is emitted when List is empty.
1065 if (List.empty())
1066 return;
1067
1068 // Convert List to what ConstantArray needs.
1070 UsedArray.resize(List.size());
1071 for (unsigned I = 0, E = List.size(); I != E; ++I)
1073 cast<Constant>(&*List[I]), Builder.getPtrTy());
1074
// UsedArray was resized to List.size(), which is non-zero at this point.
1075 if (UsedArray.empty())
1076 return;
1077 ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
1078
1079 auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
1080 ConstantArray::get(ATy, UsedArray), Name);
1081
1082 GV->setSection("llvm.metadata");
1083}
1084
1087 OMPTgtExecModeFlags Mode) {
// NOTE(review): the first lines of this signature are missing from this
// listing; `KernelName` is used below and is presumably a parameter.
//
// Emits a constant i8 global named "<KernelName>_exec_mode" that holds Mode,
// with weak linkage and protected visibility, and returns it.
1088 auto *Int8Ty = Builder.getInt8Ty();
1089 auto *GVMode = new GlobalVariable(
1090 M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
1091 ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
1092 GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
1093 return GVMode;
1094}
1095
1097 uint32_t SrcLocStrSize,
1098 IdentFlag LocFlags,
1099 unsigned Reserve2Flags) {
// NOTE(review): the first signature line (return type, name and the
// SrcLocStr parameter) and two interior lines (1114, 1132) are missing from
// this listing.
//
// Returns a pointer (cast to IdentPtr) to a constant ident struct describing
// the source location string SrcLocStr with the given flags. Results are
// cached in IdentMap; on a cache miss an existing module global with an
// identical initializer is reused, otherwise a new private global is created.
1100 // Enable "C-mode".
1101 LocFlags |= OMP_IDENT_FLAG_KMPC;
1102
// Cache key: the location string plus both flag words packed into 64 bits.
// NOTE(review): LocFlags is shifted by 31, not 32, so bit 31 of
// Reserve2Flags would share a bit with bit 0 of LocFlags — confirm intended.
1103 Constant *&Ident =
1104 IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
1105 if (!Ident) {
// Field order of the initializer: null i32, LocFlags, Reserve2Flags,
// SrcLocStrSize, SrcLocStr.
1106 Constant *I32Null = ConstantInt::getNullValue(Int32);
1107 Constant *IdentData[] = {I32Null,
1108 ConstantInt::get(Int32, uint32_t(LocFlags)),
1109 ConstantInt::get(Int32, Reserve2Flags),
1110 ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
1111
// The string pointer is address-space-cast to the struct's field type when
// the condition below holds (part of the condition is missing here;
// presumably it compares the two pointer address spaces — confirm).
1112 size_t SrcLocStrArgIdx = 4;
1113 if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
1115 IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
1116 IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
1117 SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
1118 Constant *Initializer =
1119 ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);
1120
1121 // Look for existing encoding of the location + flags, not needed but
1122 // minimizes the difference to the existing solution while we transition.
// The loop does not break, so the last matching global wins.
1123 for (GlobalVariable &GV : M.globals())
1124 if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
1125 if (GV.getInitializer() == Initializer)
1126 Ident = &GV;
1127
1128 if (!Ident) {
// No reusable global found: create an unnamed, constant, private one in the
// default globals address space, 8-byte aligned.
1129 auto *GV = new GlobalVariable(
1130 M, OpenMPIRBuilder::Ident,
1131 /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
1133 M.getDataLayout().getDefaultGlobalsAddressSpace());
1134 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
1135 GV->setAlignment(Align(8));
1136 Ident = GV;
1137 }
1138 }
1139
1140 return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
1141}
1142
1144 uint32_t &SrcLocStrSize) {
// NOTE(review): the first signature line is missing from this listing;
// `LocStr` is used below and is presumably the first parameter.
//
// Returns the constant holding the source-location string LocStr and writes
// its length to SrcLocStrSize. Results are cached in SrcLocStrMap.
1145 SrcLocStrSize = LocStr.size();
1146 Constant *&SrcLocStr = SrcLocStrMap[LocStr];
1147 if (!SrcLocStr) {
1148 Constant *Initializer =
1149 ConstantDataArray::getString(M.getContext(), LocStr);
1150
1151 // Look for existing encoding of the location, not needed but minimizes the
1152 // difference to the existing solution while we transition.
// The first constant global with an identical initializer is cached (via the
// SrcLocStr reference) and returned immediately.
1153 for (GlobalVariable &GV : M.globals())
1154 if (GV.isConstant() && GV.hasInitializer() &&
1155 GV.getInitializer() == Initializer)
1156 return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);
1157
// Otherwise create a new unnamed global string in the default globals
// address space.
1158 SrcLocStr = Builder.CreateGlobalString(
1159 LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
1160 &M);
1161 }
1162 return SrcLocStr;
1163}
1164
1166 StringRef FileName,
1167 unsigned Line, unsigned Column,
1168 uint32_t &SrcLocStrSize) {
// NOTE(review): the first signature line is missing from this listing;
// `FunctionName` is used below and is presumably the first parameter.
//
// Builds the location string ";<FileName>;<FunctionName>;<Line>;<Column>;;"
// and forwards it to the string-based getOrCreateSrcLocStr overload.
1169 SmallString<128> Buffer;
1170 Buffer.push_back(';');
1171 Buffer.append(FileName);
1172 Buffer.push_back(';');
1173 Buffer.append(FunctionName);
1174 Buffer.push_back(';');
1175 Buffer.append(std::to_string(Line));
1176 Buffer.push_back(';');
1177 Buffer.append(std::to_string(Column));
1178 Buffer.push_back(';');
1179 Buffer.push_back(';');
1180 return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
1181}
1182
// Returns the constant for the placeholder location string
// ";unknown;unknown;0;0;;" and writes its length to SrcLocStrSize.
// NOTE(review): the line carrying the function name and parameter list is
// missing from this listing; the name is inferred from the call site further
// down in this file — confirm.
1183Constant *
1185 StringRef UnknownLoc = ";unknown;unknown;0;0;;";
1186 return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
1187}
1188
1190 uint32_t &SrcLocStrSize,
1191 Function *F) {
// NOTE(review): the first signature line is missing from this listing; `DL`
// is used below and is presumably the first parameter (a debug location).
//
// Derives the source-location string from debug info. Falls back to the
// default "unknown" string when DL carries no DILocation.
1192 DILocation *DIL = DL.get();
1193 if (!DIL)
1194 return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
// Use the module name when the debug location has no file name.
1195 StringRef FileName =
1196 !DIL->getFilename().empty() ? DIL->getFilename() : M.getName();
// NOTE(review): the result of getSubprogram() is dereferenced without a null
// check — confirm it cannot be null here.
1197 StringRef Function = DIL->getScope()->getSubprogram()->getName();
// Fall back to the IR function's name when the subprogram name is empty.
1198 if (Function.empty() && F)
1199 Function = F->getName();
1200 return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
1201 DIL->getColumn(), SrcLocStrSize);
1202}
1203
1205 uint32_t &SrcLocStrSize) {
// NOTE(review): the first signature line is missing from this listing; `Loc`
// is used below and is presumably the first parameter.
//
// Convenience overload: forwards Loc's debug location together with the
// function that contains Loc's insertion block.
1206 return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
1207 Loc.IP.getBlock()->getParent());
1208}
1209
// NOTE(review): the signature and the head of the return statement are
// missing from this listing. What remains shows a call that involves the
// __kmpc_global_thread_num runtime function, `Ident`, and the value name
// "omp_global_thread_num"; presumably this is getOrCreateThreadID — confirm.
1212 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
1213 "omp_global_thread_num");
1214}
1215
1218 bool ForceSimpleCall, bool CheckCancelFlag) {
// NOTE(review): the first signature lines are missing from this listing;
// `Loc` and `Kind` are used below and are presumably parameters.
//
// Emits a barrier runtime call at Loc and returns the insertion point after
// it, or an error from the cancellation check. If updateToLocation(Loc)
// fails, nothing is emitted and Loc.IP is returned unchanged.
1219 if (!updateToLocation(Loc))
1220 return Loc.IP;
1221
1222 // Build call __kmpc_cancel_barrier(loc, thread_id) or
1223 // __kmpc_barrier(loc, thread_id);
1224
// Pick the ident flag that records which directive caused the barrier.
1225 IdentFlag BarrierLocFlags;
1226 switch (Kind) {
1227 case OMPD_for:
1228 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
1229 break;
1230 case OMPD_sections:
1231 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
1232 break;
1233 case OMPD_single:
1234 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
1235 break;
1236 case OMPD_barrier:
1237 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
1238 break;
1239 default:
1240 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
1241 break;
1242 }
1243
// The barrier call gets an ident carrying the barrier flags; the thread-id
// lookup uses a separate ident created without them.
1244 uint32_t SrcLocStrSize;
1245 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1246 Value *Args[] = {
1247 getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
1248 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
1249
1250 // If we are in a cancellable parallel region, barriers are cancellation
1251 // points.
1252 // TODO: Check why we would force simple calls or to ignore the cancel flag.
1253 bool UseCancelBarrier =
1254 !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);
1255
// NOTE(review): the line that starts this call (and presumably declares
// `Result`, used below) is missing from this listing.
1257 getOrCreateRuntimeFunctionPtr(UseCancelBarrier
1258 ? OMPRTL___kmpc_cancel_barrier
1259 : OMPRTL___kmpc_barrier),
1260 Args);
1261
// Only the cancel-barrier variant's result is checked for cancellation.
1262 if (UseCancelBarrier && CheckCancelFlag)
1263 if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
1264 return Err;
1265
1266 return Builder.saveIP();
1267}
1268
1271 Value *IfCondition,
1272 omp::Directive CanceledDirective) {
// NOTE(review): the first signature lines and two call heads (1288, 1311)
// are missing from this listing; `Loc` is presumably the first parameter.
//
// Emits a __kmpc_cancel runtime call for CanceledDirective, guarded by
// IfCondition when one is given, followed by the shared cancellation check.
// Returns the resulting insertion point or an error. If
// updateToLocation(Loc) fails, Loc.IP is returned unchanged.
1273 if (!updateToLocation(Loc))
1274 return Loc.IP;
1275
1276 // LLVM utilities like blocks with terminators.
1277 auto *UI = Builder.CreateUnreachable();
1278
1279 Instruction *ThenTI = UI, *ElseTI = nullptr;
1280 if (IfCondition) {
1281 SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
1282
1283 // Even if the if condition evaluates to false, this should count as a
1284 // cancellation point
1285 Builder.SetInsertPoint(ElseTI);
1286 auto ElseIP = Builder.saveIP();
1287
// NOTE(review): the head of this statement, which presumably declares
// `IPOrErr` from a call taking the else-branch location, is missing here.
1289 LocationDescription{ElseIP, Loc.DL}, CanceledDirective);
1290 if (!IPOrErr)
1291 return IPOrErr;
1292 }
1293
1294 Builder.SetInsertPoint(ThenTI);
1295
// Map the directive to its integer cancel kind using the OMP_CANCEL_KIND
// entries of OMPKinds.def.
1296 Value *CancelKind = nullptr;
1297 switch (CanceledDirective) {
1298#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1299 case DirectiveEnum: \
1300 CancelKind = Builder.getInt32(Value); \
1301 break;
1302#include "llvm/Frontend/OpenMP/OMPKinds.def"
1303 default:
1304 llvm_unreachable("Unknown cancel kind!");
1305 }
1306
1307 uint32_t SrcLocStrSize;
1308 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1309 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1310 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
// NOTE(review): the head of this call (presumably declaring `Result`, used
// below) is missing from this listing.
1312 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
1313
1314 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1315 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
1316 return Err;
1317
1318 // Update the insertion point and remove the terminator we introduced.
1319 Builder.SetInsertPoint(UI->getParent());
1320 UI->eraseFromParent();
1321
1322 return Builder.saveIP();
1323}
1324
1327 omp::Directive CanceledDirective) {
// NOTE(review): the first signature lines and the head of the runtime call
// (1350) are missing from this listing; `Loc` is presumably a parameter.
//
// Emits a __kmpc_cancellationpoint runtime call for CanceledDirective,
// followed by the shared cancellation check. Returns the resulting insertion
// point or an error. If updateToLocation(Loc) fails, Loc.IP is returned
// unchanged.
1328 if (!updateToLocation(Loc))
1329 return Loc.IP;
1330
1331 // LLVM utilities like blocks with terminators.
1332 auto *UI = Builder.CreateUnreachable();
1333 Builder.SetInsertPoint(UI);
1334
// Map the directive to its integer cancel kind using the OMP_CANCEL_KIND
// entries of OMPKinds.def.
1335 Value *CancelKind = nullptr;
1336 switch (CanceledDirective) {
1337#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1338 case DirectiveEnum: \
1339 CancelKind = Builder.getInt32(Value); \
1340 break;
1341#include "llvm/Frontend/OpenMP/OMPKinds.def"
1342 default:
1343 llvm_unreachable("Unknown cancel kind!");
1344 }
1345
1346 uint32_t SrcLocStrSize;
1347 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1348 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1349 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
// NOTE(review): the head of this call (presumably declaring `Result`, used
// below) is missing from this listing.
1351 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);
1352
1353 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1354 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
1355 return Err;
1356
1357 // Update the insertion point and remove the terminator we introduced.
1358 Builder.SetInsertPoint(UI->getParent());
1359 UI->eraseFromParent();
1360
1361 return Builder.saveIP();
1362}
1363
1365 const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
1366 Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
1367 Value *HostPtr, ArrayRef<Value *> KernelArgs) {
// NOTE(review): the first signature line and two body lines (1374, 1387)
// are missing from this listing.
//
// Allocates a "kernel_args" struct at AllocaIP, stores each element of
// KernelArgs into the matching struct field, and emits a call to
// __tgt_target_kernel with Ident, DeviceID, NumTeams, NumThreads, HostPtr and
// the struct pointer. Returns the insertion point after the call. If
// updateToLocation(Loc) fails, Loc.IP is returned unchanged.
1368 if (!updateToLocation(Loc))
1369 return Loc.IP;
1370
1371 Builder.restoreIP(AllocaIP);
1372 auto *KernelArgsPtr =
1373 Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
// NOTE(review): a statement is missing here (line 1374); presumably it moves
// the builder back to Loc before the stores are emitted — confirm.
1375
// Store argument I into field I, using the preferred alignment of its type.
1376 for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
1377 llvm::Value *Arg =
1378 Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
1379 Builder.CreateAlignedStore(
1380 KernelArgs[I], Arg,
1381 M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
1382 }
1383
1384 SmallVector<Value *> OffloadingArgs{Ident, DeviceID, NumTeams,
1385 NumThreads, HostPtr, KernelArgsPtr};
1386
// NOTE(review): the head of this call is missing; presumably its result is
// assigned to the `Return` out-parameter — confirm.
1388 getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
1389 OffloadingArgs);
1390
1391 return Builder.saveIP();
1392}
1393
1395 const LocationDescription &Loc, Value *OutlinedFnID,
1396 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
1397 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {
// NOTE(review): the first signature line is missing from this listing.
//
// Emits the kernel launch through emitTargetKernel and then branches on its
// result: a non-null result goes to "omp_offload.failed", where
// EmitTargetCallFallbackCB emits the fallback code; both paths join in
// "omp_offload.cont". Returns the insertion point in the continuation block,
// or the error reported by the fallback callback. If updateToLocation(Loc)
// fails, Loc.IP is returned unchanged.
1398
1399 if (!updateToLocation(Loc))
1400 return Loc.IP;
1401
1402 // On top of the arrays that were filled up, the target offloading call
1403 // takes as arguments the device id as well as the host pointer. The host
1404 // pointer is used by the runtime library to identify the current target
1405 // region, so it only has to be unique and not necessarily point to
1406 // anything. It could be the pointer to the outlined function that
1407 // implements the target region, but we aren't using that so that the
1408 // compiler doesn't need to keep that, and could therefore inline the host
1409 // function if proven worthwhile during optimization.
1410
1411 // From this point on, we need to have an ID of the target region defined.
1412 assert(OutlinedFnID && "Invalid outlined function ID!");
1413 (void)OutlinedFnID;
1414
1415 // Return value of the runtime offloading call.
1416 Value *Return = nullptr;
1417
1418 // Arguments for the target kernel.
1419 SmallVector<Value *> ArgsVector;
1420 getKernelArgsVector(Args, Builder, ArgsVector);
1421
1422 // The target region is an outlined function launched by the runtime
1423 // via calls to __tgt_target_kernel().
1424 //
1425 // Note that on the host and CPU targets, the runtime implementation of
1426 // these calls simply call the outlined function without forking threads.
1427 // The outlined functions themselves have runtime calls to
1428 // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
1429 // the compiler in emitTeamsCall() and emitParallelCall().
1430 //
1431 // In contrast, on the NVPTX target, the implementation of
1432 // __tgt_target_teams() launches a GPU kernel with the requested number
1433 // of teams and threads so no additional calls to the runtime are required.
1434 // Check the error code and execute the host version if required.
// Only the first entries of Args.NumTeams and Args.NumThreads are passed on.
1435 Builder.restoreIP(emitTargetKernel(
1436 Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
1437 Args.NumThreads.front(), OutlinedFnID, ArgsVector));
1438
// Both blocks are created detached; emitBlock() below places them in CurFn.
1439 BasicBlock *OffloadFailedBlock =
1440 BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
1441 BasicBlock *OffloadContBlock =
1442 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
1443 Value *Failed = Builder.CreateIsNotNull(Return);
1444 Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);
1445
1446 auto CurFn = Builder.GetInsertBlock()->getParent();
1447 emitBlock(OffloadFailedBlock, CurFn);
1448 InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
1449 if (!AfterIP)
1450 return AfterIP.takeError();
1451 Builder.restoreIP(*AfterIP);
1452 emitBranch(OffloadContBlock);
1453 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
1454 return Builder.saveIP();
1455}
1456
1458 Value *CancelFlag, omp::Directive CanceledDirective) {
// NOTE(review): the first signature line and one body line (1472) are
// missing from this listing.
//
// Emits a conditional branch on CancelFlag: a null flag continues in a
// ".cont" block, anything else jumps to a ".cncl" block that branches to the
// finalization block of the innermost FinalizationStack entry. On success the
// builder is left at the start of the continuation block. Returns the error
// from getFiniBB if obtaining the finalization block fails.
1459 assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
1460 "Unexpected cancellation!");
1461
1462 // For a cancel barrier we create two new blocks.
1463 BasicBlock *BB = Builder.GetInsertBlock();
1464 BasicBlock *NonCancellationBlock;
1465 if (Builder.GetInsertPoint() == BB->end()) {
1466 // TODO: This branch will not be needed once we moved to the
1467 // OpenMPIRBuilder codegen completely.
1468 NonCancellationBlock = BasicBlock::Create(
1469 BB->getContext(), BB->getName() + ".cont", BB->getParent());
1470 } else {
1471 NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
// NOTE(review): one statement is missing here (line 1472); presumably it
// removes the branch that SplitBlock left at the end of BB — confirm.
1473 Builder.SetInsertPoint(BB);
1474 }
1475 BasicBlock *CancellationBlock = BasicBlock::Create(
1476 BB->getContext(), BB->getName() + ".cncl", BB->getParent());
1477
1478 // Jump to them based on the return value.
1479 Value *Cmp = Builder.CreateIsNull(CancelFlag);
1480 Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
1481 /* TODO weight */ nullptr, nullptr);
1482
1483 // From the cancellation block we finalize all variables and go to the
1484 // post finalization block that is known to the FiniCB callback.
1485 auto &FI = FinalizationStack.back();
1486 Expected<BasicBlock *> FiniBBOrErr = FI.getFiniBB(Builder);
1487 if (!FiniBBOrErr)
1488 return FiniBBOrErr.takeError();
1489 Builder.SetInsertPoint(CancellationBlock);
1490 Builder.CreateBr(*FiniBBOrErr);
1491
1492 // The continuation block is where code generation continues.
1493 Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
1494 return Error::success();
1495}
1496
1497/// Create wrapper function used to gather the outlined function's argument
1498/// structure from a shared buffer and to forward them to it when running in
1499/// Generic mode.
1500///
1501/// The outlined function is expected to receive 2 integer arguments followed by
1502/// an optional pointer argument to an argument structure holding the rest.
///
/// The wrapper has type void(i16, i32), is named "<OutlinedFn>.wrapper", and
/// is returned to the caller. The builder's insertion point is restored on
/// exit by the InsertPointGuard.
1504 Function &OutlinedFn) {
// NOTE(review): the first signature line and the head of the WrapperFn
// creation call (1516) are missing from this listing.
1505 size_t NumArgs = OutlinedFn.arg_size();
1506 assert((NumArgs == 2 || NumArgs == 3) &&
1507 "expected a 2-3 argument parallel outlined function");
1508 bool UseArgStruct = NumArgs == 3;
1509
1510 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1511 IRBuilder<>::InsertPointGuard IPG(Builder);
1512 auto *FnTy = FunctionType::get(Builder.getVoidTy(),
1513 {Builder.getInt16Ty(), Builder.getInt32Ty()},
1514 /*isVarArg=*/false);
1515 auto *WrapperFn =
1517 OutlinedFn.getName() + ".wrapper", OMPIRBuilder->M);
1518
1519 WrapperFn->addParamAttr(0, Attribute::NoUndef);
1520 WrapperFn->addParamAttr(0, Attribute::ZExt);
1521 WrapperFn->addParamAttr(1, Attribute::NoUndef);
1522
1523 BasicBlock *EntryBB =
1524 BasicBlock::Create(OMPIRBuilder->M.getContext(), "entry", WrapperFn);
1525 Builder.SetInsertPoint(EntryBB);
1526
1527 // Allocation.
// Each alloca is cast to an address-space-0 pointer before use.
1528 Value *AddrAlloca = Builder.CreateAlloca(Builder.getInt32Ty(),
1529 /*ArraySize=*/nullptr, "addr");
1530 AddrAlloca = Builder.CreatePointerBitCastOrAddrSpaceCast(
1531 AddrAlloca, Builder.getPtrTy(/*AddrSpace=*/0),
1532 AddrAlloca->getName() + ".ascast");
1533
1534 Value *ZeroAlloca = Builder.CreateAlloca(Builder.getInt32Ty(),
1535 /*ArraySize=*/nullptr, "zero");
1536 ZeroAlloca = Builder.CreatePointerBitCastOrAddrSpaceCast(
1537 ZeroAlloca, Builder.getPtrTy(/*AddrSpace=*/0),
1538 ZeroAlloca->getName() + ".ascast");
1539
1540 Value *ArgsAlloca = nullptr;
1541 if (UseArgStruct) {
1542 ArgsAlloca = Builder.CreateAlloca(Builder.getPtrTy(),
1543 /*ArraySize=*/nullptr, "global_args");
1544 ArgsAlloca = Builder.CreatePointerBitCastOrAddrSpaceCast(
1545 ArgsAlloca, Builder.getPtrTy(/*AddrSpace=*/0),
1546 ArgsAlloca->getName() + ".ascast");
1547 }
1548
1549 // Initialization.
// "addr" receives the wrapper's second (i32) argument; "zero" is set to 0.
1550 Builder.CreateStore(WrapperFn->getArg(1), AddrAlloca);
1551 Builder.CreateStore(Builder.getInt32(0), ZeroAlloca);
1552 if (UseArgStruct) {
// Ask the runtime to write the shared-variables pointer into ArgsAlloca.
1553 Builder.CreateCall(
1554 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(
1555 llvm::omp::RuntimeFunction::OMPRTL___kmpc_get_shared_variables),
1556 {ArgsAlloca});
1557 }
1558
1559 SmallVector<Value *, 3> Args{AddrAlloca, ZeroAlloca};
1560
1561 // Load structArg from global_args.
1562 if (UseArgStruct) {
// Two loads: first the pointer stored in global_args, then its element 0,
// which is passed on as the struct argument.
1563 Value *StructArg = Builder.CreateLoad(Builder.getPtrTy(), ArgsAlloca);
1564 StructArg = Builder.CreateInBoundsGEP(Builder.getPtrTy(), StructArg,
1565 {Builder.getInt64(0)});
1566 StructArg = Builder.CreateLoad(Builder.getPtrTy(), StructArg, "structArg");
1567 Args.push_back(StructArg);
1568 }
1569
1570 // Call the outlined function holding the parallel body.
1571 Builder.CreateCall(&OutlinedFn, Args);
1572 Builder.CreateRetVoid();
1573
1574 return WrapperFn;
1575}
1576
1577// Callback used to create OpenMP runtime calls to support
1578// omp parallel clause for the device.
1579// We need to use this callback to replace call to the OutlinedFn in OuterFn
1580// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_60)
//
// The captured values passed to the original call are stored into a pointer
// array allocated in OuterAllocaBB, and that array is handed to the runtime.
// Afterwards the original call and every instruction in ToBeDeleted are
// erased.
// NOTE(review): the first signature line (presumably
// `static void targetParallelCallback(`) is missing from this listing.
1582 OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
1583 BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
1584 Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1585 Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
1586 assert(OutlinedFn.arg_size() >= 2 &&
1587 "Expected at least tid and bounded tid as arguments");
1588 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1589
1590 // Add some known attributes.
1591 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1592 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1593 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1594 OutlinedFn.addParamAttr(0, Attribute::NoUndef);
1595 OutlinedFn.addParamAttr(1, Attribute::NoUndef);
1596 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1597
// The call to replace is taken from the back of OutlinedFn's user list.
1598 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1599 assert(CI && "Expected call instruction to outlined function");
1600 CI->getParent()->setName("omp_parallel");
1601
1602 Builder.SetInsertPoint(CI);
1603 Type *PtrTy = OMPIRBuilder->VoidPtr;
1604
1605 // Add alloca for kernel args
1606 OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP();
1607 Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
1608 AllocaInst *ArgsAlloca =
1609 Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
1610 Value *Args = ArgsAlloca;
1611 // Add address space cast if array for storing arguments is not allocated
1612 // in address space 0
1613 if (ArgsAlloca->getAddressSpace())
1614 Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
1615 Builder.restoreIP(CurrentIP);
1616
1617 // Store captured vars which are used by kmpc_parallel_60
// Call arguments 0 and 1 are the tid and bound tid; captures start at 2.
1618 for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
1619 Value *V = *(CI->arg_begin() + 2 + Idx);
1620 Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
1621 ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
1622 Builder.CreateStore(V, StoreAddress);
1623 }
1624
// Without an if clause the condition argument is 1; without a num_threads
// value the thread-count argument is -1.
1625 Value *Cond =
1626 IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
1627 : Builder.getInt32(1);
1628 Value *NumThreadsArg =
1629 NumThreads ? Builder.CreateZExtOrTrunc(NumThreads, OMPIRBuilder->Int32)
1630 : Builder.getInt32(-1);
1631
1632 // If this is not a Generic kernel, we can skip generating the wrapper.
1633 Value *WrapperFn;
1634 if (isGenericKernel(*OuterFn))
1635 WrapperFn = createTargetParallelWrapper(OMPIRBuilder, OutlinedFn);
1636 else
1637 WrapperFn = Constant::getNullValue(PtrTy);
1638
1639 // Build kmpc_parallel_60 call
1640 Value *Parallel60CallArgs[] = {
1641 /* identifier*/ Ident,
1642 /* global thread num*/ ThreadID,
1643 /* if expression */ Cond,
1644 /* number of threads */ NumThreadsArg,
1645 /* Proc bind */ Builder.getInt32(-1),
1646 /* outlined function */ &OutlinedFn,
1647 /* wrapper function */ WrapperFn,
1648 /* arguments of the outlined function*/ Args,
1649 /* number of arguments */ Builder.getInt64(NumCapturedVars),
1650 /* strict for number of threads */ Builder.getInt32(0)};
1651
1652 FunctionCallee RTLFn =
1653 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_60);
1654
1655 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, Parallel60CallArgs);
1656
1657 LLVM_DEBUG(dbgs() << "With kmpc_parallel_60 placed: "
1658 << *Builder.GetInsertBlock()->getParent() << "\n");
1659
1660 // Initialize the local TID stack location with the argument value.
1661 Builder.SetInsertPoint(PrivTID);
1662 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1663 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1664 PrivTIDAddr);
1665
1666 // Remove redundant call to the outlined function.
1667 CI->eraseFromParent();
1668
1669 for (Instruction *I : ToBeDeleted) {
1670 I->eraseFromParent();
1671 }
1672}
1673
1674// Callback used to create OpenMP runtime calls to support
1675// omp parallel clause for the host.
1676// We need to use this callback to replace call to the OutlinedFn in OuterFn
1677// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if])
//
// __kmpc_fork_call_if is used when IfCondition is non-null, __kmpc_fork_call
// otherwise. Afterwards the original call and every instruction in
// ToBeDeleted are erased.
// NOTE(review): the signature line carrying the function name and first
// parameters (1679) and one line of the metadata construction (1702) are
// missing from this listing.
1678static void
1680 Function *OuterFn, Value *Ident, Value *IfCondition,
1681 Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1682 const SmallVector<Instruction *, 4> &ToBeDeleted) {
1683 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1684 FunctionCallee RTLFn;
1685 if (IfCondition) {
1686 RTLFn =
1687 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
1688 } else {
1689 RTLFn =
1690 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
1691 }
// Attach !callback metadata once per runtime function declaration.
1692 if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
1693 if (!F->hasMetadata(LLVMContext::MD_callback)) {
1694 LLVMContext &Ctx = F->getContext();
1695 MDBuilder MDB(Ctx);
1696 // Annotate the callback behavior of the __kmpc_fork_call:
1697 // - The callback callee is argument number 2 (microtask).
1698 // - The first two arguments of the callback callee are unknown (-1).
1699 // - All variadic arguments to the __kmpc_fork_call are passed to the
1700 // callback callee.
1701 F->addMetadata(LLVMContext::MD_callback,
1703 2, {-1, -1},
1704 /* VarArgsArePassed */ true)}));
1705 }
1706 }
1707 // Add some known attributes.
1708 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1709 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1710 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1711
1712 assert(OutlinedFn.arg_size() >= 2 &&
1713 "Expected at least tid and bounded tid as arguments");
1714 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1715
// The call to replace is taken from the back of OutlinedFn's user list.
1716 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1717 CI->getParent()->setName("omp_parallel");
1718 Builder.SetInsertPoint(CI);
1719
1720 // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
1721 Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
1722 &OutlinedFn};
1723
1724 SmallVector<Value *, 16> RealArgs;
1725 RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
// The _if variant takes the condition, converted to i32, as an extra
// argument before the captured values.
1726 if (IfCondition) {
1727 Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
1728 RealArgs.push_back(Cond);
1729 }
1730 RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
1731
1732 // __kmpc_fork_call_if always expects a void ptr as the last argument
1733 // If there are no arguments, pass a null pointer.
1734 auto PtrTy = OMPIRBuilder->VoidPtr;
1735 if (IfCondition && NumCapturedVars == 0) {
1736 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1737 RealArgs.push_back(NullPtrValue);
1738 }
1739
1740 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
1741
1742 LLVM_DEBUG(dbgs() << "With fork_call placed: "
1743 << *Builder.GetInsertBlock()->getParent() << "\n");
1744
1745 // Initialize the local TID stack location with the argument value.
1746 Builder.SetInsertPoint(PrivTID);
1747 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1748 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1749 PrivTIDAddr);
1750
1751 // Remove redundant call to the outlined function.
1752 CI->eraseFromParent();
1753
1754 for (Instruction *I : ToBeDeleted) {
1755 I->eraseFromParent();
1756 }
1757}
1758
1760 const LocationDescription &Loc, InsertPointTy OuterAllocIP,
1761 ArrayRef<BasicBlock *> OuterDeallocBlocks, BodyGenCallbackTy BodyGenCB,
1762 PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition,
1763 Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable) {
1764 assert(!isConflictIP(Loc.IP, OuterAllocIP) && "IPs must not be ambiguous");
1765
1766 if (!updateToLocation(Loc))
1767 return Loc.IP;
1768
1769 uint32_t SrcLocStrSize;
1770 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1771 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1772 const bool NeedThreadID = NumThreads || Config.isTargetDevice() ||
1773 (ProcBind != OMP_PROC_BIND_default);
1774 Value *ThreadID = NeedThreadID ? getOrCreateThreadID(Ident) : nullptr;
1775 // If we generate code for the target device, we need to allocate
1776 // struct for aggregate params in the device default alloca address space.
1777 // OpenMP runtime requires that the params of the extracted functions are
1778 // passed as zero address space pointers. This flag ensures that extracted
1779 // function arguments are declared in zero address space
1780 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1781
1782 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1783 // only if we compile for host side.
1784 if (NumThreads && !Config.isTargetDevice()) {
1785 Value *Args[] = {
1786 Ident, ThreadID,
1787 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1789 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1790 }
1791
1792 if (ProcBind != OMP_PROC_BIND_default) {
1793 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1794 Value *Args[] = {
1795 Ident, ThreadID,
1796 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1798 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1799 }
1800
1801 BasicBlock *InsertBB = Builder.GetInsertBlock();
1802 Function *OuterFn = InsertBB->getParent();
1803
1804 // Save the outer alloca block because the insertion iterator may get
1805 // invalidated and we still need this later.
1806 BasicBlock *OuterAllocaBlock = OuterAllocIP.getBlock();
1807
1808 // Vector to remember instructions we used only during the modeling but which
1809 // we want to delete at the end.
1811
1812 // Change the location to the outer alloca insertion point to create and
1813 // initialize the allocas we pass into the parallel region.
1814 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1815 Builder.restoreIP(NewOuter);
1816 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1817 AllocaInst *ZeroAddrAlloca =
1818 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1819 Instruction *TIDAddr = TIDAddrAlloca;
1820 Instruction *ZeroAddr = ZeroAddrAlloca;
1821 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1822 // Add additional casts to enforce pointers in zero address space
1823 TIDAddr = new AddrSpaceCastInst(
1824 TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast");
1825 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1826 ToBeDeleted.push_back(TIDAddr);
1827 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1828 PointerType ::get(M.getContext(), 0),
1829 "zero.addr.ascast");
1830 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1831 ToBeDeleted.push_back(ZeroAddr);
1832 }
1833
1834 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1835 // associated arguments in the outlined function, so we delete them later.
1836 ToBeDeleted.push_back(TIDAddrAlloca);
1837 ToBeDeleted.push_back(ZeroAddrAlloca);
1838
1839 // Create an artificial insertion point that will also ensure the blocks we
1840 // are about to split are not degenerated.
1841 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1842
1843 BasicBlock *EntryBB = UI->getParent();
1844 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1845 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1846 BasicBlock *PRegPreFiniBB =
1847 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1848 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1849
1850 auto FiniCBWrapper = [&](InsertPointTy IP) {
1851 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1852 // target to the region exit block.
1853 if (IP.getBlock()->end() == IP.getPoint()) {
1855 Builder.restoreIP(IP);
1856 Instruction *I = Builder.CreateBr(PRegExitBB);
1857 IP = InsertPointTy(I->getParent(), I->getIterator());
1858 }
1859 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1860 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1861 "Unexpected insertion point for finalization call!");
1862 return FiniCB(IP);
1863 };
1864
1865 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1866
1867 // Generate the privatization allocas in the block that will become the entry
1868 // of the outlined function.
1869 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1870 InsertPointTy InnerAllocaIP = Builder.saveIP();
1871
1872 AllocaInst *PrivTIDAddr =
1873 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1874 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1875
1876 // Add some fake uses for OpenMP provided arguments.
1877 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1878 Instruction *ZeroAddrUse =
1879 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1880 ToBeDeleted.push_back(ZeroAddrUse);
1881
1882 // EntryBB
1883 // |
1884 // V
1885 // PRegionEntryBB <- Privatization allocas are placed here.
1886 // |
1887 // V
1888 // PRegionBodyBB <- BodyGen is invoked here.
1889 // |
1890 // V
1891 // PRegPreFiniBB <- The block we will start finalization from.
1892 // |
1893 // V
1894 // PRegionExitBB <- A common exit to simplify block collection.
1895 //
1896
1897 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1898
1899 // Let the caller create the body.
1900 assert(BodyGenCB && "Expected body generation callback!");
1901 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1902 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP, PRegExitBB))
1903 return Err;
1904
1905 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1906
1907 // If OuterFn is a Generic kernel, we need to use device shared memory to
1908 // allocate argument structures. Otherwise, we use stack allocations as usual.
1909 bool UsesDeviceSharedMemory =
1910 Config.isTargetDevice() && isGenericKernel(*OuterFn);
1911 std::unique_ptr<OutlineInfo> OI =
1912 UsesDeviceSharedMemory
1913 ? std::make_unique<DeviceSharedMemOutlineInfo>(*this)
1914 : std::make_unique<OutlineInfo>();
1915
1916 if (Config.isTargetDevice()) {
1917 // Generate OpenMP target specific runtime call
1918 OI->PostOutlineCB = [=, ToBeDeletedVec =
1919 std::move(ToBeDeleted)](Function &OutlinedFn) {
1920 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1921 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1922 ThreadID, ToBeDeletedVec);
1923 };
1924 } else {
1925 // Generate OpenMP host runtime call
1926 OI->PostOutlineCB = [=, ToBeDeletedVec =
1927 std::move(ToBeDeleted)](Function &OutlinedFn) {
1928 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1929 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1930 };
1931 }
1932
1933 OI->FixUpNonEntryAllocas = true;
1934 OI->OuterAllocBB = OuterAllocaBlock;
1935 OI->EntryBB = PRegEntryBB;
1936 OI->ExitBB = PRegExitBB;
1937 OI->OuterDeallocBBs.reserve(OuterDeallocBlocks.size());
1938 copy(OuterDeallocBlocks, OI->OuterDeallocBBs.end());
1939
1940 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1942 OI->collectBlocks(ParallelRegionBlockSet, Blocks);
1943
1944 CodeExtractorAnalysisCache CEAC(*OuterFn);
1945 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1946 /* AggregateArgs */ false,
1947 /* BlockFrequencyInfo */ nullptr,
1948 /* BranchProbabilityInfo */ nullptr,
1949 /* AssumptionCache */ nullptr,
1950 /* AllowVarArgs */ true,
1951 /* AllowAlloca */ true,
1952 /* AllocationBlock */ OuterAllocaBlock,
1953 /* DeallocationBlocks */ {},
1954 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1955
1956 // Find inputs to, outputs from the code region.
1957 BasicBlock *CommonExit = nullptr;
1958 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1959 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1960
1961 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1962 /*CollectGlobalInputs=*/true);
1963
1964 Inputs.remove_if([&](Value *I) {
1966 return GV->getValueType() == OpenMPIRBuilder::Ident;
1967
1968 return false;
1969 });
1970
1971 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1972
1973 FunctionCallee TIDRTLFn =
1974 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1975
1976 auto PrivHelper = [&](Value &V) -> Error {
1977 if (&V == TIDAddr || &V == ZeroAddr) {
1978 OI->ExcludeArgsFromAggregate.push_back(&V);
1979 return Error::success();
1980 }
1981
1983 for (Use &U : V.uses())
1984 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1985 if (ParallelRegionBlockSet.count(UserI->getParent()))
1986 Uses.insert(&U);
1987
1988 // __kmpc_fork_call expects extra arguments as pointers. If the input
1989 // already has a pointer type, everything is fine. Otherwise, store the
1990 // value onto stack and load it back inside the to-be-outlined region. This
1991 // will ensure only the pointer will be passed to the function.
1992 // FIXME: if there are more than 15 trailing arguments, they must be
1993 // additionally packed in a struct.
1994 Value *Inner = &V;
1995 if (!V.getType()->isPointerTy()) {
1997 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1998
1999 Builder.restoreIP(OuterAllocIP);
2000 Value *Ptr;
2001 if (UsesDeviceSharedMemory) {
2002 // Use device shared memory instead, if needed.
2003 Ptr = createOMPAllocShared(OuterAllocIP, V.getType(),
2004 V.getName() + ".reloaded");
2005 for (BasicBlock *DeallocBlock : OuterDeallocBlocks)
2007 InsertPointTy(DeallocBlock, DeallocBlock->getFirstInsertionPt()),
2008 Ptr, V.getType());
2009 } else {
2010 Ptr = Builder.CreateAlloca(V.getType(), nullptr,
2011 V.getName() + ".reloaded");
2012 }
2013
2014 // Store to stack at end of the block that currently branches to the entry
2015 // block of the to-be-outlined region.
2016 Builder.SetInsertPoint(InsertBB,
2017 InsertBB->getTerminator()->getIterator());
2018 Builder.CreateStore(&V, Ptr);
2019
2020 // Load back next to allocations in the to-be-outlined region.
2021 Builder.restoreIP(InnerAllocaIP);
2022 Inner = Builder.CreateLoad(V.getType(), Ptr);
2023 }
2024
2025 Value *ReplacementValue = nullptr;
2026 CallInst *CI = dyn_cast<CallInst>(&V);
2027 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
2028 ReplacementValue = PrivTID;
2029 } else {
2030 InsertPointOrErrorTy AfterIP =
2031 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
2032 if (!AfterIP)
2033 return AfterIP.takeError();
2034 Builder.restoreIP(*AfterIP);
2035 InnerAllocaIP = {
2036 InnerAllocaIP.getBlock(),
2037 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
2038
2039 assert(ReplacementValue &&
2040 "Expected copy/create callback to set replacement value!");
2041 if (ReplacementValue == &V)
2042 return Error::success();
2043 }
2044
2045 for (Use *UPtr : Uses)
2046 UPtr->set(ReplacementValue);
2047
2048 return Error::success();
2049 };
2050
2051 // Reset the inner alloca insertion as it will be used for loading the values
2052 // wrapped into pointers before passing them into the to-be-outlined region.
2053 // Configure it to insert immediately after the fake use of zero address so
2054 // that they are available in the generated body and so that the
2055 // OpenMP-related values (thread ID and zero address pointers) remain leading
2056 // in the argument list.
2057 InnerAllocaIP = IRBuilder<>::InsertPoint(
2058 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
2059
2060 // Reset the outer alloca insertion point to the entry of the relevant block
2061 // in case it was invalidated.
2062 OuterAllocIP = IRBuilder<>::InsertPoint(
2063 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
2064
2065 for (Value *Input : Inputs) {
2066 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
2067 if (Error Err = PrivHelper(*Input))
2068 return Err;
2069 }
2070 LLVM_DEBUG({
2071 for (Value *Output : Outputs)
2072 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
2073 });
2074 assert(Outputs.empty() &&
2075 "OpenMP outlining should not produce live-out values!");
2076
2077 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
2078 LLVM_DEBUG({
2079 for (auto *BB : Blocks)
2080 dbgs() << " PBR: " << BB->getName() << "\n";
2081 });
2082
2083 // Adjust the finalization stack, verify the adjustment, and call the
2084 // finalize function a last time to finalize values between the pre-fini
2085 // block and the exit block if we left the parallel "the normal way".
2086 auto FiniInfo = FinalizationStack.pop_back_val();
2087 (void)FiniInfo;
2088 assert(FiniInfo.DK == OMPD_parallel &&
2089 "Unexpected finalization stack state!");
2090
2091 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
2092
2093 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
2094 Expected<BasicBlock *> FiniBBOrErr = FiniInfo.getFiniBB(Builder);
2095 if (!FiniBBOrErr)
2096 return FiniBBOrErr.takeError();
2097 {
2099 Builder.restoreIP(PreFiniIP);
2100 Builder.CreateBr(*FiniBBOrErr);
2101 // There's currently a branch to omp.par.exit. Delete it. We will get there
2102 // via the fini block
2103 if (Instruction *Term = Builder.GetInsertBlock()->getTerminator())
2104 Term->eraseFromParent();
2105 }
2106
2107 // Register the outlined info.
2108 addOutlineInfo(std::move(OI));
2109
2110 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
2111 UI->eraseFromParent();
2112
2113 return AfterIP;
2114}
2115
2117 // Build call void __kmpc_flush(ident_t *loc)
2118 uint32_t SrcLocStrSize;
2119 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2120 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
2121
2123 Args);
2124}
2125
2127 if (!updateToLocation(Loc))
2128 return;
2129 emitFlush(Loc);
2130}
2131
2133 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
2134 // global_tid);
2135 uint32_t SrcLocStrSize;
2136 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2137 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2138 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
2139
2140 // Ignore return result until untied tasks are supported.
2142 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait), Args);
2143}
2144
2150
2152 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
2153 uint32_t SrcLocStrSize;
2154 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2155 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2156 Constant *I32Null = ConstantInt::getNullValue(Int32);
2157 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
2158
2160 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield), Args);
2161}
2162
2168
2170 const DependData &Dep) {
2171 // Store the pointer to the variable
2172 Value *Addr = Builder.CreateStructGEP(
2173 DependInfo, Entry,
2174 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
2175 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, SizeTy);
2176 Builder.CreateStore(DepValPtr, Addr);
2177 // Store the size of the variable
2178 Value *Size = Builder.CreateStructGEP(
2179 DependInfo, Entry, static_cast<unsigned int>(RTLDependInfoFields::Len));
2180 Builder.CreateStore(
2181 ConstantInt::get(SizeTy,
2182 M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
2183 Size);
2184 // Store the dependency kind
2185 Value *Flags = Builder.CreateStructGEP(
2186 DependInfo, Entry, static_cast<unsigned int>(RTLDependInfoFields::Flags));
2187 Builder.CreateStore(ConstantInt::get(Builder.getInt8Ty(),
2188 static_cast<unsigned int>(Dep.DepKind)),
2189 Flags);
2190}
2191
2192// Processes the dependencies in Dependencies and does the following
2193// - Allocates space on the stack of an array of DependInfo objects
2194// - Populates each DependInfo object with relevant information of
2195// the corresponding dependence.
2196// - All code is inserted in the entry block of the current function.
2198 OpenMPIRBuilder &OMPBuilder,
2200 // Early return if we have no dependencies to process
2201 if (Dependencies.empty())
2202 return nullptr;
2203
2204 // Given a vector of DependData objects, in this function we create an
2205 // array on the stack that holds kmp_depend_info objects corresponding
2206 // to each dependency. This is then passed to the OpenMP runtime.
2207 // For example, if there are 'n' dependencies then the following pseudo
2208 // code is generated. Assume the first dependence is on a variable 'a'.
2209 //
2210 // \code{c}
2211 // DepArray = alloc(n x sizeof(kmp_depend_info));
2212 // idx = 0;
2213 // DepArray[idx].base_addr = ptrtoint(&a);
2214 // DepArray[idx].len = 8;
2215 // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
2216 // ++idx;
2217 // DepArray[idx].base_addr = ...;
2218 // \endcode
2219
2220 IRBuilderBase &Builder = OMPBuilder.Builder;
2221 Type *DependInfo = OMPBuilder.DependInfo;
2222
2223 Value *DepArray = nullptr;
2224 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
2225 Builder.SetInsertPoint(
2227
2228 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
2229 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
2230
2231 Builder.restoreIP(OldIP);
2232
2233 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
2234 Value *Base =
2235 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
2236 OMPBuilder.emitTaskDependency(Builder, Base, Dep);
2237 }
2238 return DepArray;
2239}
2240
2241/// Create the task duplication function passed to kmpc_taskloop.
2242Expected<Value *> OpenMPIRBuilder::createTaskDuplicationFunction(
2243 Type *PrivatesTy, int32_t PrivatesIndex, TaskDupCallbackTy DupCB) {
2244 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
2245 if (!DupCB)
2247 PointerType::get(Builder.getContext(), ProgramAddressSpace));
2248
2249 // From OpenMP Runtime p_task_dup_t:
2250 // Routine optionally generated by the compiler for setting the lastprivate
2251 // flag and calling needed constructors for private/firstprivate objects (used
2252 // to form taskloop tasks from pattern task) Parameters: dest task, src task,
2253 // lastprivate flag.
2254 // typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
2255
2256 auto *VoidPtrTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
2257
2258 FunctionType *DupFuncTy = FunctionType::get(
2259 Builder.getVoidTy(), {VoidPtrTy, VoidPtrTy, Builder.getInt32Ty()},
2260 /*isVarArg=*/false);
2261
2262 Function *DupFunction = Function::Create(DupFuncTy, Function::InternalLinkage,
2263 "omp_taskloop_dup", M);
2264 Value *DestTaskArg = DupFunction->getArg(0);
2265 Value *SrcTaskArg = DupFunction->getArg(1);
2266 Value *LastprivateFlagArg = DupFunction->getArg(2);
2267 DestTaskArg->setName("dest_task");
2268 SrcTaskArg->setName("src_task");
2269 LastprivateFlagArg->setName("lastprivate_flag");
2270
2271 IRBuilderBase::InsertPointGuard Guard(Builder);
2272 Builder.SetInsertPoint(
2273 BasicBlock::Create(Builder.getContext(), "entry", DupFunction));
2274
2275 auto GetTaskContextPtrFromArg = [&](Value *Arg) -> Value * {
2276 Type *TaskWithPrivatesTy =
2277 StructType::get(Builder.getContext(), {Task, PrivatesTy});
2278 Value *TaskPrivates = Builder.CreateGEP(
2279 TaskWithPrivatesTy, Arg, {Builder.getInt32(0), Builder.getInt32(1)});
2280 Value *ContextPtr = Builder.CreateGEP(
2281 PrivatesTy, TaskPrivates,
2282 {Builder.getInt32(0), Builder.getInt32(PrivatesIndex)});
2283 return ContextPtr;
2284 };
2285
2286 Value *DestTaskContextPtr = GetTaskContextPtrFromArg(DestTaskArg);
2287 Value *SrcTaskContextPtr = GetTaskContextPtrFromArg(SrcTaskArg);
2288
2289 DestTaskContextPtr->setName("destPtr");
2290 SrcTaskContextPtr->setName("srcPtr");
2291
2292 InsertPointTy AllocaIP(&DupFunction->getEntryBlock(),
2293 DupFunction->getEntryBlock().begin());
2294 InsertPointTy CodeGenIP = Builder.saveIP();
2295 Expected<IRBuilderBase::InsertPoint> AfterIPOrError =
2296 DupCB(AllocaIP, CodeGenIP, DestTaskContextPtr, SrcTaskContextPtr);
2297 if (!AfterIPOrError)
2298 return AfterIPOrError.takeError();
2299 Builder.restoreIP(*AfterIPOrError);
2300
2301 Builder.CreateRetVoid();
2302
2303 return DupFunction;
2304}
2305
2306OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
2307 const LocationDescription &Loc, InsertPointTy AllocaIP,
2308 ArrayRef<BasicBlock *> DeallocBlocks, BodyGenCallbackTy BodyGenCB,
2309 llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
2310 Value *LBVal, Value *UBVal, Value *StepVal, bool Untied, Value *IfCond,
2311 Value *GrainSize, bool NoGroup, int Sched, Value *Final, bool Mergeable,
2312 Value *Priority, uint64_t NumOfCollapseLoops, TaskDupCallbackTy DupCB,
2313 Value *TaskContextStructPtrVal) {
2314
2315 if (!updateToLocation(Loc))
2316 return InsertPointTy();
2317
2318 uint32_t SrcLocStrSize;
2319 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2320 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2321
2322 BasicBlock *TaskloopExitBB =
2323 splitBB(Builder, /*CreateBranch=*/true, "taskloop.exit");
2324 BasicBlock *TaskloopBodyBB =
2325 splitBB(Builder, /*CreateBranch=*/true, "taskloop.body");
2326 BasicBlock *TaskloopAllocaBB =
2327 splitBB(Builder, /*CreateBranch=*/true, "taskloop.alloca");
2328
2329 InsertPointTy TaskloopAllocaIP =
2330 InsertPointTy(TaskloopAllocaBB, TaskloopAllocaBB->begin());
2331 InsertPointTy TaskloopBodyIP =
2332 InsertPointTy(TaskloopBodyBB, TaskloopBodyBB->begin());
2333
2334 if (Error Err = BodyGenCB(TaskloopAllocaIP, TaskloopBodyIP, TaskloopExitBB))
2335 return Err;
2336
2337 llvm::Expected<llvm::CanonicalLoopInfo *> result = LoopInfo();
2338 if (!result) {
2339 return result.takeError();
2340 }
2341
2342 llvm::CanonicalLoopInfo *CLI = result.get();
2343 auto OI = std::make_unique<OutlineInfo>();
2344 OI->EntryBB = TaskloopAllocaBB;
2345 OI->OuterAllocBB = AllocaIP.getBlock();
2346 OI->ExitBB = TaskloopExitBB;
2347 OI->OuterDeallocBBs.reserve(DeallocBlocks.size());
2348 copy(DeallocBlocks, OI->OuterDeallocBBs.end());
2349
2350 // Add the thread ID argument.
2351 SmallVector<Instruction *> ToBeDeleted;
2352 // dummy instruction to be used as a fake argument
2353 OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal(
2354 Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, "global.tid", false));
2355 Value *FakeLB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2356 TaskloopAllocaIP, "lb", false, true);
2357 Value *FakeUB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2358 TaskloopAllocaIP, "ub", false, true);
2359 Value *FakeStep = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2360 TaskloopAllocaIP, "step", false, true);
2361 // For Taskloop, we want to force the bounds being the first 3 inputs in the
2362 // aggregate struct
2363 OI->Inputs.insert(FakeLB);
2364 OI->Inputs.insert(FakeUB);
2365 OI->Inputs.insert(FakeStep);
2366 if (TaskContextStructPtrVal)
2367 OI->Inputs.insert(TaskContextStructPtrVal);
2368 assert(((TaskContextStructPtrVal && DupCB) ||
2369 (!TaskContextStructPtrVal && !DupCB)) &&
2370 "Task context struct ptr and duplication callback must be both set "
2371 "or both null");
2372
2373 // It isn't safe to run the duplication bodygen callback inside the post
2374 // outlining callback so this has to be run now before we know the real task
2375 // shareds structure type.
2376 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
2377 Type *PointerTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
2378 Type *FakeSharedsTy = StructType::get(
2379 Builder.getContext(),
2380 {FakeLB->getType(), FakeUB->getType(), FakeStep->getType(), PointerTy});
2381 Expected<Value *> TaskDupFnOrErr = createTaskDuplicationFunction(
2382 FakeSharedsTy,
2383 /*PrivatesIndex: the pointer after the three indices above*/ 3, DupCB);
2384 if (!TaskDupFnOrErr) {
2385 return TaskDupFnOrErr.takeError();
2386 }
2387 Value *TaskDupFn = *TaskDupFnOrErr;
2388
2389 OI->PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Untied,
2390 TaskloopAllocaBB, CLI, Loc, TaskDupFn, ToBeDeleted,
2391 IfCond, GrainSize, NoGroup, Sched, FakeLB, FakeUB,
2392 FakeStep, FakeSharedsTy, Final, Mergeable, Priority,
2393 NumOfCollapseLoops](Function &OutlinedFn) mutable {
2394 // Replace the Stale CI by appropriate RTL function call.
2395 assert(OutlinedFn.hasOneUse() &&
2396 "there must be a single user for the outlined function");
2397 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2398
2399 /* Create the casting for the Bounds Values that can be used when outlining
2400 * to replace the uses of the fakes with real values */
2401 BasicBlock *CodeReplBB = StaleCI->getParent();
2402 Builder.SetInsertPoint(CodeReplBB->getFirstInsertionPt());
2403 Value *CastedLBVal =
2404 Builder.CreateIntCast(LBVal, Builder.getInt64Ty(), true, "lb64");
2405 Value *CastedUBVal =
2406 Builder.CreateIntCast(UBVal, Builder.getInt64Ty(), true, "ub64");
2407 Value *CastedStepVal =
2408 Builder.CreateIntCast(StepVal, Builder.getInt64Ty(), true, "step64");
2409
2410 Builder.SetInsertPoint(StaleCI);
2411
2412 // Gather the arguments for emitting the runtime call for
2413 // @__kmpc_omp_task_alloc
2414 Function *TaskAllocFn =
2415 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2416
2417 Value *ThreadID = getOrCreateThreadID(Ident);
2418
2419 if (!NoGroup) {
2420 // Emit runtime call for @__kmpc_taskgroup
2421 Function *TaskgroupFn =
2422 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2423 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2424 }
2425
2426 // `flags` Argument Configuration
2427 // Task is tied if (Flags & 1) == 1.
2428 // Task is untied if (Flags & 1) == 0.
2429 // Task is final if (Flags & 2) == 2.
2430 // Task is not final if (Flags & 2) == 0.
2431 // Task is mergeable if (Flags & 4) == 4.
2432 // Task is not mergeable if (Flags & 4) == 0.
2433 // Task is priority if (Flags & 32) == 32.
2434 // Task is not priority if (Flags & 32) == 0.
2435 Value *Flags = Builder.getInt32(Untied ? 0 : 1);
2436 if (Final)
2437 Flags = Builder.CreateOr(Builder.getInt32(2), Flags);
2438 if (Mergeable)
2439 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2440 if (Priority)
2441 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2442
2443 Value *TaskSize = Builder.getInt64(
2444 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2445
2446 AllocaInst *ArgStructAlloca =
2448 assert(ArgStructAlloca &&
2449 "Unable to find the alloca instruction corresponding to arguments "
2450 "for extracted function");
2451 std::optional<TypeSize> ArgAllocSize =
2452 ArgStructAlloca->getAllocationSize(M.getDataLayout());
2453 assert(ArgAllocSize &&
2454 "Unable to determine size of arguments for extracted function");
2455 Value *SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
2456
2457 // Emit the @__kmpc_omp_task_alloc runtime call
2458 // The runtime call returns a pointer to an area where the task captured
2459 // variables must be copied before the task is run (TaskData)
2460 CallInst *TaskData = Builder.CreateCall(
2461 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2462 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2463 /*task_func=*/&OutlinedFn});
2464
2465 Value *Shareds = StaleCI->getArgOperand(1);
2466 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2467 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2468 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2469 SharedsSize);
2470 // Get the pointer to loop lb, ub, step from task ptr
2471 // and set up the lowerbound,upperbound and step values
2472 llvm::Value *Lb = Builder.CreateGEP(
2473 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(0)});
2474
2475 llvm::Value *Ub = Builder.CreateGEP(
2476 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(1)});
2477
2478 llvm::Value *Step = Builder.CreateGEP(
2479 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(2)});
2480 llvm::Value *Loadstep = Builder.CreateLoad(Builder.getInt64Ty(), Step);
2481
2482 // set up the arguments for emitting kmpc_taskloop runtime call
2483 // setting values for ifval, nogroup, sched, grainsize, task_dup
2484 Value *IfCondVal =
2485 IfCond ? Builder.CreateIntCast(IfCond, Builder.getInt32Ty(), true)
2486 : Builder.getInt32(1);
2487 // As __kmpc_taskgroup is called manually in OMPIRBuilder, NoGroupVal should
2488 // always be 1 when calling __kmpc_taskloop to ensure it is not called again
2489 Value *NoGroupVal = Builder.getInt32(1);
2490 Value *SchedVal = Builder.getInt32(Sched);
2491 Value *GrainSizeVal =
2492 GrainSize ? Builder.CreateIntCast(GrainSize, Builder.getInt64Ty(), true)
2493 : Builder.getInt64(0);
2494 Value *TaskDup = TaskDupFn;
2495
2496 Value *Args[] = {Ident, ThreadID, TaskData, IfCondVal, Lb, Ub,
2497 Loadstep, NoGroupVal, SchedVal, GrainSizeVal, TaskDup};
2498
2499 // taskloop runtime call
2500 Function *TaskloopFn =
2501 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskloop);
2502 Builder.CreateCall(TaskloopFn, Args);
2503
2504 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup if
2505 // nogroup is not defined
2506 if (!NoGroup) {
2507 Function *EndTaskgroupFn =
2508 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2509 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2510 }
2511
2512 StaleCI->eraseFromParent();
2513
2514 Builder.SetInsertPoint(TaskloopAllocaBB, TaskloopAllocaBB->begin());
2515
2516 LoadInst *SharedsOutlined =
2517 Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2518 OutlinedFn.getArg(1)->replaceUsesWithIf(
2519 SharedsOutlined,
2520 [SharedsOutlined](Use &U) { return U.getUser() != SharedsOutlined; });
2521
2522 Value *IV = CLI->getIndVar();
2523 Type *IVTy = IV->getType();
2524 Constant *One = ConstantInt::get(Builder.getInt64Ty(), 1);
2525
2526 // When outlining, CodeExtractor will create GEP's to the LowerBound and
2527 // UpperBound. These GEP's can be reused for loading the tasks respective
2528 // bounds.
2529 Value *TaskLB = nullptr;
2530 Value *TaskUB = nullptr;
2531 Value *TaskStep = nullptr;
2532 Value *LoadTaskLB = nullptr;
2533 Value *LoadTaskUB = nullptr;
2534 Value *LoadTaskStep = nullptr;
2535 for (Instruction &I : *TaskloopAllocaBB) {
2536 if (I.getOpcode() == Instruction::GetElementPtr) {
2537 GetElementPtrInst &Gep = cast<GetElementPtrInst>(I);
2538 if (ConstantInt *CI = dyn_cast<ConstantInt>(Gep.getOperand(2))) {
2539 switch (CI->getZExtValue()) {
2540 case 0:
2541 TaskLB = &I;
2542 break;
2543 case 1:
2544 TaskUB = &I;
2545 break;
2546 case 2:
2547 TaskStep = &I;
2548 break;
2549 }
2550 }
2551 } else if (I.getOpcode() == Instruction::Load) {
2552 LoadInst &Load = cast<LoadInst>(I);
2553 if (Load.getPointerOperand() == TaskLB) {
2554 assert(TaskLB != nullptr && "Expected value for TaskLB");
2555 LoadTaskLB = &I;
2556 } else if (Load.getPointerOperand() == TaskUB) {
2557 assert(TaskUB != nullptr && "Expected value for TaskUB");
2558 LoadTaskUB = &I;
2559 } else if (Load.getPointerOperand() == TaskStep) {
2560 assert(TaskStep != nullptr && "Expected value for TaskStep");
2561 LoadTaskStep = &I;
2562 }
2563 }
2564 }
2565
2566 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
2567
2568 assert(LoadTaskLB != nullptr && "Expected value for LoadTaskLB");
2569 assert(LoadTaskUB != nullptr && "Expected value for LoadTaskUB");
2570 assert(LoadTaskStep != nullptr && "Expected value for LoadTaskStep");
2571 Value *TripCountMinusOne = Builder.CreateSDiv(
2572 Builder.CreateSub(LoadTaskUB, LoadTaskLB), LoadTaskStep);
2573 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt");
2574 Value *CastedTripCount = Builder.CreateIntCast(TripCount, IVTy, true);
2575 Value *CastedTaskLB = Builder.CreateIntCast(LoadTaskLB, IVTy, true);
2576 // set the trip count in the CLI
2577 CLI->setTripCount(CastedTripCount);
2578
2579 Builder.SetInsertPoint(CLI->getBody(),
2580 CLI->getBody()->getFirstInsertionPt());
2581
2582 if (NumOfCollapseLoops > 1) {
2583 llvm::SmallVector<User *> UsersToReplace;
2584 // When using the collapse clause, the bounds of the loop have to be
2585 // adjusted to properly represent the iterator of the outer loop.
2586 Value *IVPlusTaskLB = Builder.CreateAdd(
2587 CLI->getIndVar(),
2588 Builder.CreateSub(CastedTaskLB, ConstantInt::get(IVTy, 1)));
2589 // To ensure every Use is correctly captured, we first want to record
2590 // which users to replace the value in, and then replace the value.
2591 for (auto IVUse = CLI->getIndVar()->uses().begin();
2592 IVUse != CLI->getIndVar()->uses().end(); IVUse++) {
2593 User *IVUser = IVUse->getUser();
2594 if (auto *Op = dyn_cast<BinaryOperator>(IVUser)) {
2595 if (Op->getOpcode() == Instruction::URem ||
2596 Op->getOpcode() == Instruction::UDiv) {
2597 UsersToReplace.push_back(IVUser);
2598 }
2599 }
2600 }
2601 for (User *User : UsersToReplace) {
2602 User->replaceUsesOfWith(CLI->getIndVar(), IVPlusTaskLB);
2603 }
2604 } else {
2605 // The canonical loop is generated with a fixed lower bound. We need to
2606 // update the index calculation code to use the task's lower bound. The
2607 // generated code looks like this:
2608 // %omp_loop.iv = phi ...
2609 // ...
2610 // %tmp = mul [type] %omp_loop.iv, step
2611 // %user_index = add [type] tmp, lb
2612 // OpenMPIRBuilder constructs canonical loops to have exactly three uses
2613 // of the normalised induction variable:
2614 // 1. This one: converting the normalised IV to the user IV
2615 // 2. The increment (add)
2616 // 3. The comparison against the trip count (icmp)
2617 // (1) is the only use that is a mul followed by an add so this cannot
2618 // match other IR.
2619 assert(CLI->getIndVar()->getNumUses() == 3 &&
2620 "Canonical loop should have exactly three uses of the ind var");
2621 for (User *IVUser : CLI->getIndVar()->users()) {
2622 if (auto *Mul = dyn_cast<BinaryOperator>(IVUser)) {
2623 if (Mul->getOpcode() == Instruction::Mul) {
2624 for (User *MulUser : Mul->users()) {
2625 if (auto *Add = dyn_cast<BinaryOperator>(MulUser)) {
2626 if (Add->getOpcode() == Instruction::Add) {
2627 Add->setOperand(1, CastedTaskLB);
2628 }
2629 }
2630 }
2631 }
2632 }
2633 }
2634 }
2635
2636 FakeLB->replaceAllUsesWith(CastedLBVal);
2637 FakeUB->replaceAllUsesWith(CastedUBVal);
2638 FakeStep->replaceAllUsesWith(CastedStepVal);
2639 for (Instruction *I : llvm::reverse(ToBeDeleted)) {
2640 I->eraseFromParent();
2641 }
2642 };
2643
2644 addOutlineInfo(std::move(OI));
2645 Builder.SetInsertPoint(TaskloopExitBB, TaskloopExitBB->begin());
2646 return Builder.saveIP();
2647}
2648
2651 M.getContext(), M.getDataLayout().getPointerSizeInBits());
// Result: a struct type with three fields { IntPtrTy, IntPtrTy, i32 }.
// NOTE(review): the signature and the start of the IntPtrTy declaration are
// not visible in this listing; IntPtrTy is presumably an integer as wide as a
// pointer in the module's data layout -- confirm.
2652 return llvm::StructType::get(IntPtrTy, IntPtrTy,
2653 llvm::Type::getInt32Ty(M.getContext()));
2654}
2655
2657 const LocationDescription &Loc, InsertPointTy AllocaIP,
2658 ArrayRef<BasicBlock *> DeallocBlocks, BodyGenCallbackTy BodyGenCB,
2659 bool Tied, Value *Final, Value *IfCondition,
2660 const DependenciesInfo &Dependencies, const AffinityData &Affinities,
2661 bool Mergeable, Value *EventHandle, Value *Priority) {
2662
// NOTE(review): the line carrying this definition's return type and name is
// not visible in this listing, and neither are a few interior lines.
//
// Visible behaviour: the current block is split into task.alloca, task.body
// and task.exit, BodyGenCB fills the body, and an OutlineInfo is registered
// whose PostOutlineCB replaces the call to the outlined function with the
// __kmpc_omp_task* runtime calls.
//
// Returns an empty insert point if updateToLocation(Loc) fails, the Error
// produced by BodyGenCB if it fails, and otherwise the insert point at the
// beginning of task.exit.
2663 if (!updateToLocation(Loc))
2664 return InsertPointTy();
2665
2666 uint32_t SrcLocStrSize;
2667 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2668 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2669 // The current basic block is split into four basic blocks. After outlining,
2670 // they will be mapped as follows:
2671 // ```
2672 // def current_fn() {
2673 // current_basic_block:
2674 // br label %task.exit
2675 // task.exit:
2676 // ; instructions after task
2677 // }
2678 // def outlined_fn() {
2679 // task.alloca:
2680 // br label %task.body
2681 // task.body:
2682 // ret void
2683 // }
2684 // ```
2685 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
2686 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
2687 BasicBlock *TaskAllocaBB =
2688 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
2689
2690 InsertPointTy TaskAllocaIP =
2691 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
2692 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
2693 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP, TaskExitBB))
2694 return Err;
2695
2696 auto OI = std::make_unique<OutlineInfo>();
2697 OI->EntryBB = TaskAllocaBB;
2698 OI->OuterAllocBB = AllocaIP.getBlock();
2699 OI->ExitBB = TaskExitBB;
// NOTE(review): reserve() only changes capacity, so OuterDeallocBBs.end() is
// still the end of the elements currently held. Copying to it writes past
// those elements and never increases size(), so the blocks are not recorded
// as members of the container. If the intent is to append DeallocBlocks, an
// appending form (e.g. an insert/append call or std::back_inserter) looks
// required. The declaration of OuterDeallocBBs is not visible here -- confirm.
2700 OI->OuterDeallocBBs.reserve(DeallocBlocks.size());
2701 copy(DeallocBlocks, OI->OuterDeallocBBs.end());
2702
2703 // Add the thread ID argument.
// NOTE(review): the declaration of ToBeDeleted is on a line that is not
// visible in this listing.
2705 OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal(
2706 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
2707
// The callback captures its inputs by copy (and `this`); it runs with the
// outlined function as argument.
2708 OI->PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
2709 Affinities, Mergeable, Priority, EventHandle,
2710 TaskAllocaBB,
2711 ToBeDeleted](Function &OutlinedFn) mutable {
2712 // Replace the Stale CI by appropriate RTL function call.
2713 assert(OutlinedFn.hasOneUse() &&
2714 "there must be a single user for the outlined function");
2715 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2716
2717 // HasShareds is true if any variables are captured in the outlined region,
2718 // false otherwise.
2719 bool HasShareds = StaleCI->arg_size() > 1;
2720 Builder.SetInsertPoint(StaleCI);
2721
2722 // Gather the arguments for emitting the runtime call for
2723 // @__kmpc_omp_task_alloc
2724 Function *TaskAllocFn =
2725 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2726
2727 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
2728 // call.
2729 Value *ThreadID = getOrCreateThreadID(Ident);
2730
2731 // Argument - `flags`
2732 // Task is tied iff (Flags & 1) == 1.
2733 // Task is untied iff (Flags & 1) == 0.
2734 // Task is final iff (Flags & 2) == 2.
2735 // Task is not final iff (Flags & 2) == 0.
2736 // Task is mergeable or merged-if0 iff (Flags & 4) == 4.
2737 // Task is neither mergeable nor merged-if0 iff (Flags & 4) == 0.
2738 // Task is detachable iff (Flags & 64) == 64.
2739 // Task is not detachable iff (Flags & 64) == 0.
2740 // Task is priority iff (Flags & 32) == 32.
2741 // Task is not priority iff (Flags & 32) == 0.
2742 // TODO: Handle the other flags.
// The bool Tied converts to 0 or 1 and therefore seeds bit 0 of the flags.
2743 Value *Flags = Builder.getInt32(Tied);
// An `if` condition that is a constant zero takes the "merged-if0" path: the
// mergeable bit is set and no then/else split is emitted further down.
2744 auto *ConstIfCondition = dyn_cast_or_null<ConstantInt>(IfCondition);
2745 bool UseMergedIf0Path = ConstIfCondition && ConstIfCondition->isZero();
2746 if (Final) {
2747 Value *FinalFlag =
2748 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2749 Flags = Builder.CreateOr(FinalFlag, Flags);
2750 }
2751
2752 if (Mergeable || UseMergedIf0Path)
2753 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2754 if (EventHandle)
2755 Flags = Builder.CreateOr(Builder.getInt32(64), Flags);
2756 if (Priority)
2757 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2758
2759 // Argument - `sizeof_kmp_task_t` (TaskSize)
2760 // Tasksize refers to the size in bytes of kmp_task_t data structure
2761 // including private vars accessed in task.
2762 // TODO: add kmp_task_t_with_privates (privates)
2763 Value *TaskSize = Builder.getInt64(
2764 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2765
2766 // Argument - `sizeof_shareds` (SharedsSize)
2767 // SharedsSize refers to the shareds array size in the kmp_task_t data
2768 // structure.
2769 Value *SharedsSize = Builder.getInt64(0);
2770 if (HasShareds) {
// NOTE(review): the initializer of ArgStructAlloca is on a line that is not
// visible in this listing.
2771 AllocaInst *ArgStructAlloca =
2773 assert(ArgStructAlloca &&
2774 "Unable to find the alloca instruction corresponding to arguments "
2775 "for extracted function");
2776 std::optional<TypeSize> ArgAllocSize =
2777 ArgStructAlloca->getAllocationSize(M.getDataLayout());
2778 assert(ArgAllocSize &&
2779 "Unable to determine size of arguments for extracted function");
2780 SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
2781 }
2782 // Emit the @__kmpc_omp_task_alloc runtime call
2783 // The runtime call returns a pointer to an area where the task captured
2784 // variables must be copied before the task is run (TaskData)
// NOTE(review): the line declaring TaskData and opening this call is not
// visible in this listing.
2786 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2787 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2788 /*task_func=*/&OutlinedFn});
2789
// Register affinity information only when both a count and an info pointer
// were supplied.
2790 if (Affinities.Count && Affinities.Info) {
2792 OMPRTL___kmpc_omp_reg_task_with_affinity);
2793
2794 createRuntimeFunctionCall(RegAffFn, {Ident, ThreadID, TaskData,
2795 Affinities.Count, Affinities.Info});
2796 }
2797
2798 // Emit detach clause initialization.
2799 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2800 // task_descriptor);
2801 if (EventHandle) {
2803 OMPRTL___kmpc_task_allow_completion_event);
2804 llvm::Value *EventVal =
2805 createRuntimeFunctionCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2806 llvm::Value *EventHandleAddr =
2807 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2808 Builder.getPtrTy(0));
// The returned event is stored into the handle as a 64-bit integer.
2809 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2810 Builder.CreateStore(EventVal, EventHandleAddr);
2811 }
2812 // Copy the arguments for outlined function
2813 if (HasShareds) {
2814 Value *Shareds = StaleCI->getArgOperand(1);
2815 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
// The destination of the copy is the pointer loaded from the start of
// TaskData, not TaskData itself.
2816 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2817 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2818 SharedsSize);
2819 }
2820
2821 if (Priority) {
2822 //
2823 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2824 // we populate the priority information into the "kmp_task_t" here
2825 //
2826 // The struct "kmp_task_t" definition is available in kmp.h
2827 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2828 // data2 is used for priority
2829 //
2830 Type *Int32Ty = Builder.getInt32Ty();
2831 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2832 // kmp_task_t* => { ptr }
2833 Type *TaskPtr = StructType::get(VoidPtr);
2834 Value *TaskGEP =
2835 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2836 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2837 Type *TaskStructType = StructType::get(
2838 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
// Field index 4 is the fifth member, i.e. data2 in the layout above.
2839 Value *PriorityData = Builder.CreateInBoundsGEP(
2840 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2841 // kmp_cmplrdata_t => { ptr, ptr }
2842 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2843 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2844 PriorityData, {Zero, Zero});
2845 Builder.CreateStore(Priority, CmplrData);
2846 }
2847
// A caller-provided dependency array takes precedence; otherwise one is
// built from the individual dependencies, if there are any.
2848 Value *DepArray = nullptr;
2849 Value *NumDeps = nullptr;
2850 if (Dependencies.DepArray) {
2851 DepArray = Dependencies.DepArray;
2852 NumDeps = Dependencies.NumDeps;
2853 } else if (!Dependencies.Deps.empty()) {
2854 DepArray = emitTaskDependencies(*this, Dependencies.Deps);
2855 NumDeps = Builder.getInt32(Dependencies.Deps.size());
2856 }
2857
2858 // In the presence of the `if` clause, the following IR is generated:
2859 // ...
2860 // %data = call @__kmpc_omp_task_alloc(...)
2861 // br i1 %if_condition, label %then, label %else
2862 // then:
2863 // call @__kmpc_omp_task(...)
2864 // br label %exit
2865 // else:
2866 // ;; Wait for resolution of dependencies, if any, before
2867 // ;; beginning the task
2868 // call @__kmpc_omp_wait_deps(...)
2869 // call @__kmpc_omp_task_begin_if0(...)
2870 // call @outlined_fn(...)
2871 // call @__kmpc_omp_task_complete_if0(...)
2872 // br label %exit
2873 // exit:
2874 // ...
2875 if (IfCondition && !UseMergedIf0Path) {
2876 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2877 // terminator.
2878 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2879 Instruction *IfTerminator =
2880 Builder.GetInsertPoint()->getParent()->getTerminator();
2881 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2882 Builder.SetInsertPoint(IfTerminator);
2883 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2884 &ElseTI);
2885 Builder.SetInsertPoint(ElseTI);
2886
2887 if (DepArray) {
2888 Function *TaskWaitFn =
2889 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
// NOTE(review): the opening and closing lines of this call are not visible
// in this listing.
2891 TaskWaitFn,
2892 {Ident, ThreadID, NumDeps, DepArray,
2893 ConstantInt::get(Builder.getInt32Ty(), 0),
2895 }
2896 Function *TaskBeginFn =
2897 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2898 Function *TaskCompleteFn =
2899 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2900 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2901 CallInst *CI = nullptr;
2902 if (HasShareds)
2903 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID, TaskData});
2904 else
2905 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID});
2906 CI->setDebugLoc(StaleCI->getDebugLoc());
2907 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
// Everything emitted from here on lands in the "then" branch.
2908 Builder.SetInsertPoint(ThenTI);
2909 }
2910
2911 if (DepArray) {
2912 Function *TaskFn =
2913 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
// NOTE(review): the opening and closing lines of this call are not visible
// in this listing.
2915 TaskFn,
2916 {Ident, ThreadID, TaskData, NumDeps, DepArray,
2917 ConstantInt::get(Builder.getInt32Ty(), 0),
2919
2920 } else {
2921 // Emit the @__kmpc_omp_task runtime call to spawn the task
2922 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2923 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
2924 }
2925
2926 StaleCI->eraseFromParent();
2927
// Inside the outlined function: load a pointer through argument 1 at the top
// of task.alloca and redirect every other use of argument 1 to the loaded
// value (the load itself is excluded by the predicate).
2928 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2929 if (HasShareds) {
2930 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2931 OutlinedFn.getArg(1)->replaceUsesWithIf(
2932 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2933 }
2934
// Placeholder instructions are erased in reverse order of their recording.
2935 for (Instruction *I : llvm::reverse(ToBeDeleted))
2936 I->eraseFromParent();
2937 };
2938
2939 addOutlineInfo(std::move(OI));
2940 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2941
2942 return Builder.saveIP();
2943}
2944
2946 const LocationDescription &Loc, InsertPointTy AllocaIP,
2947 ArrayRef<BasicBlock *> DeallocBlocks, BodyGenCallbackTy BodyGenCB) {
// NOTE(review): the line carrying this definition's return type and name is
// not visible in this listing.
//
// Visible behaviour: emits __kmpc_taskgroup(Ident, ThreadID), lets BodyGenCB
// generate the region body, then emits __kmpc_end_taskgroup(Ident, ThreadID)
// in the "taskgroup.exit" block.
//
// Returns an empty insert point if updateToLocation(Loc) fails, the Error
// produced by BodyGenCB if it fails, and otherwise the insert point after the
// __kmpc_end_taskgroup call.
2948 if (!updateToLocation(Loc))
2949 return InsertPointTy();
2950
2951 uint32_t SrcLocStrSize;
2952 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2953 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2954 Value *ThreadID = getOrCreateThreadID(Ident);
2955
2956 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2957 Function *TaskgroupFn =
2958 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2959 createRuntimeFunctionCall(TaskgroupFn, {Ident, ThreadID});
2960
// The body is generated at the builder's insert point as it stands after the
// split; the same Ident and ThreadID values are reused for the closing call.
2961 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2962 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP(), DeallocBlocks))
2963 return Err;
2964
2965 Builder.SetInsertPoint(TaskgroupExitBB);
2966 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2967 Function *EndTaskgroupFn =
2968 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2969 createRuntimeFunctionCall(EndTaskgroupFn, {Ident, ThreadID});
2970
2971 return Builder.saveIP();
2972}
2973
2975 const LocationDescription &Loc, InsertPointTy AllocaIP,
2977 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
// NOTE(review): the line carrying this definition's return type and name, and
// one parameter line (presumably the one declaring SectionCBs), are not
// visible in this listing.
//
// Visible behaviour: builds a canonical loop named "section_loop" over
// [0, SectionCBs.size()) whose body is a switch on the induction variable
// with one case per section callback, applies a static workshare loop to it,
// and merges the finalization callback into the block preceding the returned
// insert point.
//
// Returns Loc.IP if updateToLocation(Loc) fails, the first Error raised by a
// callee, and otherwise the insert point produced by
// applyStaticWorkshareLoop.
2978 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2979
2980 if (!updateToLocation(Loc))
2981 return Loc.IP;
2982
2983 FinalizationStack.push_back({FiniCB, OMPD_sections, IsCancellable});
2984
2985 // Each section is emitted as a switch case
2986 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2987 // -> OMP.createSection() which generates the IR for each section
2988 // Iterate through all sections and emit a switch construct:
2989 // switch (IV) {
2990 // case 0:
2991 // <SectionStmt[0]>;
2992 // break;
2993 // ...
2994 // case <NumSection> - 1:
2995 // <SectionStmt[<NumSection> - 1]>;
2996 // break;
2997 // }
2998 // ...
2999 // section_loop.after:
3000 // <FiniCB>;
3001 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
3002 Builder.restoreIP(CodeGenIP);
// NOTE(review): the declaration of Continue (the result of the split below)
// starts on a line that is not visible in this listing.
3004 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
3005 Function *CurFn = Continue->getParent();
// Continue doubles as the switch's default destination.
3006 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
3007
3008 unsigned CaseNumber = 0;
3009 for (auto SectionCB : SectionCBs) {
// NOTE(review): the declaration of CaseBB starts on a line that is not
// visible in this listing.
3011 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
3012 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
3013 Builder.SetInsertPoint(CaseBB);
// Each case block gets its branch to Continue first; the section callback is
// then handed an insert point located at that branch.
3014 UncondBrInst *CaseEndBr = Builder.CreateBr(Continue);
3015 if (Error Err =
3016 SectionCB(InsertPointTy(),
3017 {CaseEndBr->getParent(), CaseEndBr->getIterator()}, {}))
3018 return Err;
3019 CaseNumber++;
3020 }
// NOTE(review): no terminator is removed at this point in the visible code;
// the two-line comment below appears to be stale -- confirm.
3021 // remove the existing terminator from body BB since there can be no
3022 // terminators after switch/case
3023 return Error::success();
3024 };
3025 // Loop body ends here
3026 // LowerBound, UpperBound, and Stride for createCanonicalLoop
3027 Type *I32Ty = Type::getInt32Ty(M.getContext());
3028 Value *LB = ConstantInt::get(I32Ty, 0);
3029 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
3030 Value *ST = ConstantInt::get(I32Ty, 1);
// NOTE(review): the declaration of LoopInfo and the name of the callee are
// on a line that is not visible in this listing.
3032 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
3033 if (!LoopInfo)
3034 return LoopInfo.takeError();
3035
// The last argument is !IsNowait (presumably a barrier request -- confirm
// against applyStaticWorkshareLoop's declaration).
3036 InsertPointOrErrorTy WsloopIP =
3037 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
3038 WorksharingLoopType::ForStaticLoop, !IsNowait);
3039 if (!WsloopIP)
3040 return WsloopIP.takeError();
3041 InsertPointTy AfterIP = *WsloopIP;
3042
3043 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
3044 assert(LoopFini && "Bad structure of static workshare loop finalization");
3045
3046 // Apply the finalization callback in LoopAfterBB
3047 auto FiniInfo = FinalizationStack.pop_back_val();
3048 assert(FiniInfo.DK == OMPD_sections &&
3049 "Unexpected finalization stack state!");
3050 if (Error Err = FiniInfo.mergeFiniBB(Builder, LoopFini))
3051 return Err;
3052
3053 return AfterIP;
3054}
3055
3058 BodyGenCallbackTy BodyGenCB,
3059 FinalizeCallbackTy FiniCB) {
// NOTE(review): the lines carrying this definition's return type, name and
// first parameter(s) are not visible in this listing.
//
// Visible behaviour: wraps FiniCB so that it always receives an insert point
// located at an instruction, then emits an inlined OMPD_sections region via
// EmitOMPInlinedRegion with finalization and cancellation enabled.
// Returns Loc.IP if updateToLocation(Loc) fails.
3060 if (!updateToLocation(Loc))
3061 return Loc.IP;
3062
3063 auto FiniCBWrapper = [&](InsertPointTy IP) {
// Fast path: IP does not point at the end of its block, so it is forwarded
// unchanged.
3064 if (IP.getBlock()->end() != IP.getPoint())
3065 return FiniCB(IP);
3066 // This must be done otherwise any nested constructs using FinalizeOMPRegion
3067 // will fail because that function requires the Finalization Basic Block to
3068 // have a terminator, which is already removed by EmitOMPRegionBody.
3069 // IP is currently at cancelation block.
3070 // We need to backtrack to the condition block to fetch
3071 // the exit block and create a branch from cancelation
3072 // to exit block.
// NOTE(review): one line before the restoreIP call is not visible in this
// listing.
3074 Builder.restoreIP(IP);
3075 auto *CaseBB = Loc.IP.getBlock();
// NOTE(review): both getSinglePredecessor() results are dereferenced without
// a check; this relies on the case block and its predecessor each having
// exactly one predecessor -- confirm that callers guarantee this shape.
3076 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
3077 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
3078 Instruction *I = Builder.CreateBr(ExitBB);
3079 IP = InsertPointTy(I->getParent(), I->getIterator());
3080 return FiniCB(IP);
3081 };
3082
3083 Directive OMPD = Directive::OMPD_sections;
3084 // Since we are using Finalization Callback here, HasFinalize
3085 // and IsCancellable have to be true
3086 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
3087 /*Conditional*/ false, /*hasFinalize*/ true,
3088 /*IsCancellable*/ true);
3089}
3090
3096
/// Emits a call that references the
/// OMPRTL___kmpc_get_hardware_thread_id_in_block runtime entry with an empty
/// argument list and returns the resulting value.
/// NOTE(review): the lines that open the call expression are not visible in
/// this listing -- confirm the exact helper used.
3097Value *OpenMPIRBuilder::getGPUThreadID() {
3100 OMPRTL___kmpc_get_hardware_thread_id_in_block),
3101 {});
3102}
3103
/// Emits a call that references the OMPRTL___kmpc_get_warp_size runtime entry
/// with an empty argument list and returns the resulting value.
/// NOTE(review): the line that opens the call expression is not visible in
/// this listing -- confirm the exact helper used.
3104Value *OpenMPIRBuilder::getGPUWarpSize() {
3106 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
3107}
3108
3109Value *OpenMPIRBuilder::getNVPTXWarpID() {
3110 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
3111 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
3112}
3113
3114Value *OpenMPIRBuilder::getNVPTXLaneID() {
3115 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
3116 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
3117 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
3118 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
3119 "nvptx_lane_id");
3120}
3121
3122Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
3123 Type *ToType) {
3124 Type *FromType = From->getType();
3125 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
3126 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
3127 assert(FromSize > 0 && "From size must be greater than zero");
3128 assert(ToSize > 0 && "To size must be greater than zero");
3129 if (FromType == ToType)
3130 return From;
3131 if (FromSize == ToSize)
3132 return Builder.CreateBitCast(From, ToType);
3133 if (ToType->isIntegerTy() && FromType->isIntegerTy())
3134 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
3135 InsertPointTy SaveIP = Builder.saveIP();
3136 Builder.restoreIP(AllocaIP);
3137 Value *CastItem = Builder.CreateAlloca(ToType);
3138 Builder.restoreIP(SaveIP);
3139
3140 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
3141 CastItem, Builder.getPtrTy(0));
3142 Builder.CreateStore(From, ValCastItem);
3143 return Builder.CreateLoad(ToType, CastItem);
3144}
3145
/// Emits a call to one of the runtime shuffle entries for \p Element.
///
/// The element is first converted to i32 (store size <= 4 bytes) or i64
/// (otherwise) with castValueToType; the matching __kmpc_shuffle_int32 /
/// __kmpc_shuffle_int64 entry is then called with the converted element,
/// \p Offset and the warp size as a 16-bit integer.
///
/// \param AllocaIP    Forwarded to castValueToType for any temporary it needs.
/// \param Element     Value to shuffle.
/// \param ElementType Type whose store size selects the 32- or 64-bit entry;
///                    it must not exceed 8 bytes (asserted).
/// \param Offset      Passed through as the second call argument.
/// \returns The call result passed through castValueToType with the same
///          32-/64-bit integer type used for the argument.
3146Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
3147 Value *Element,
3148 Type *ElementType,
3149 Value *Offset) {
3150 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
3151 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
3152
3153 // Cast all types to 32- or 64-bit values before calling shuffle routines.
3154 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
3155 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
3156 Value *WarpSize =
3157 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
// NOTE(review): the line declaring ShuffleFunc and opening this call is not
// visible in this listing.
3159 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
3160 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
// NOTE(review): WarpSize was already cast to i16 above, so this second cast
// to the same type looks redundant -- confirm.
3161 Value *WarpSizeCast =
3162 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
3163 Value *ShuffleCall =
3164 createRuntimeFunctionCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
// The result is cast to CastTy, not back to ElementType; callers receive the
// widened integer.
3165 return castValueToType(AllocaIP, ShuffleCall, CastTy);
3166}
3167
3168void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
3169 Value *DstAddr, Type *ElemType,
3170 Value *Offset, Type *ReductionArrayTy,
3171 bool IsByRefElem) {
3172 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
3173 // Create the loop over the big sized data.
3174 // ptr = (void*)Elem;
3175 // ptrEnd = (void*) Elem + 1;
3176 // Step = 8;
3177 // while (ptr + Step < ptrEnd)
3178 // shuffle((int64_t)*ptr);
3179 // Step = 4;
3180 // while (ptr + Step < ptrEnd)
3181 // shuffle((int32_t)*ptr);
3182 // ...
3183 Type *IndexTy = Builder.getIndexTy(
3184 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3185 Value *ElemPtr = DstAddr;
3186 Value *Ptr = SrcAddr;
3187 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
3188 if (Size < IntSize)
3189 continue;
3190 Type *IntType = Builder.getIntNTy(IntSize * 8);
3191 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3192 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
3193 Value *SrcAddrGEP =
3194 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
3195 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3196 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
3197
3198 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3199 if ((Size / IntSize) > 1) {
3200 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
3201 SrcAddrGEP, Builder.getPtrTy());
3202 BasicBlock *PreCondBB =
3203 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
3204 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
3205 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
3206 BasicBlock *CurrentBB = Builder.GetInsertBlock();
3207 emitBlock(PreCondBB, CurFunc);
3208 PHINode *PhiSrc =
3209 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
3210 PhiSrc->addIncoming(Ptr, CurrentBB);
3211 PHINode *PhiDest =
3212 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
3213 PhiDest->addIncoming(ElemPtr, CurrentBB);
3214 Ptr = PhiSrc;
3215 ElemPtr = PhiDest;
3216 Value *PtrDiff = Builder.CreatePtrDiff(
3217 Builder.getInt8Ty(), PtrEnd,
3218 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
3219 Builder.CreateCondBr(
3220 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
3221 ExitBB);
3222 emitBlock(ThenBB, CurFunc);
3223 Value *Res = createRuntimeShuffleFunction(
3224 AllocaIP,
3225 Builder.CreateAlignedLoad(
3226 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
3227 IntType, Offset);
3228 Builder.CreateAlignedStore(Res, ElemPtr,
3229 M.getDataLayout().getPrefTypeAlign(ElemType));
3230 Value *LocalPtr =
3231 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
3232 Value *LocalElemPtr =
3233 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
3234 PhiSrc->addIncoming(LocalPtr, ThenBB);
3235 PhiDest->addIncoming(LocalElemPtr, ThenBB);
3236 emitBranch(PreCondBB);
3237 emitBlock(ExitBB, CurFunc);
3238 } else {
3239 Value *Res = createRuntimeShuffleFunction(
3240 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
3241 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
3242 Res->getType()->getScalarSizeInBits())
3243 Res = Builder.CreateTrunc(Res, ElemType);
3244 Builder.CreateStore(Res, ElemPtr);
3245 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
3246 ElemPtr =
3247 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
3248 }
3249 Size = Size % IntSize;
3250 }
3251}
3252
/// Emits IR that copies every element described by \p ReductionInfos from the
/// Reduce list at \p SrcBase into the Reduce list at \p DestBase.
///
/// Depending on \p Action, the destination element is either a fresh alloca
/// created at \p AllocaIP (the value is then shuffled in from a remote lane
/// and the destination list slot is updated to point at the alloca) or the
/// address already stored in the destination list (the value is then copied
/// according to the element's EvaluationKind).
///
/// \param AllocaIP         Insert point for temporaries.
/// \param Action           Selects the copy strategy (see above).
/// \param ReductionArrayTy Type used to index both Reduce lists.
/// \param ReductionInfos   One entry per reduction element.
/// \param SrcBase          Base address of the source Reduce list.
/// \param DestBase         Base address of the destination Reduce list.
/// \param IsByRef          Per-element by-reference flags; an empty array
///                         means no element is by-reference.
/// \param CopyOptions      Supplies RemoteLaneOffset for the shuffle.
/// \returns Error::success(), or the error of a failing DataPtrPtrGen /
///          generateReductionDescriptor call.
3253Error OpenMPIRBuilder::emitReductionListCopy(
3254 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
3255 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
3256 ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions) {
3257 Type *IndexTy = Builder.getIndexTy(
3258 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3259 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
3260
3261 // Iterates, element-by-element, through the source Reduce list and
3262 // make a copy.
3263 for (auto En : enumerate(ReductionInfos)) {
3264 const ReductionInfo &RI = En.value();
3265 Value *SrcElementAddr = nullptr;
3266 AllocaInst *DestAlloca = nullptr;
3267 Value *DestElementAddr = nullptr;
3268 Value *DestElementPtrAddr = nullptr;
3269 // Should we shuffle in an element from a remote lane?
3270 bool ShuffleInElement = false;
3271 // Set to true to update the pointer in the dest Reduce list to a
3272 // newly created element.
3273 bool UpdateDestListPtr = false;
3274
3275 // Step 1.1: Get the address for the src element in the Reduce list.
3276 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
3277 ReductionArrayTy, SrcBase,
3278 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3279 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
3280
3281 // Step 1.2: Create a temporary to store the element in the destination
3282 // Reduce list.
3283 DestElementPtrAddr = Builder.CreateInBoundsGEP(
3284 ReductionArrayTy, DestBase,
3285 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3286 bool IsByRefElem = (!IsByRef.empty() && IsByRef[En.index()]);
// NOTE(review): the two `case` labels of this switch are on lines that are
// not visible in this listing. The first visible arm creates a new
// destination alloca and requests a shuffle; the second reuses the address
// already stored in the destination list.
3287 switch (Action) {
3289 InsertPointTy CurIP = Builder.saveIP();
3290 Builder.restoreIP(AllocaIP);
3291
3292 Type *DestAllocaType =
3293 IsByRefElem ? RI.ByRefAllocatedType : RI.ElementType;
3294 DestAlloca = Builder.CreateAlloca(DestAllocaType, nullptr,
3295 ".omp.reduction.element");
3296 DestAlloca->setAlignment(
3297 M.getDataLayout().getPrefTypeAlign(DestAllocaType));
3298 DestElementAddr = DestAlloca;
3299 DestElementAddr =
3300 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
3301 DestElementAddr->getName() + ".ascast");
3302 Builder.restoreIP(CurIP);
3303 ShuffleInElement = true;
3304 UpdateDestListPtr = true;
3305 break;
3306 }
3308 DestElementAddr =
3309 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
3310 break;
3311 }
3312 }
3313
3314 // Now that all active lanes have read the element in the
3315 // Reduce list, shuffle over the value from the remote lane.
3316 if (ShuffleInElement) {
3317 Type *ShuffleType = RI.ElementType;
3318 Value *ShuffleSrcAddr = SrcElementAddr;
3319 Value *ShuffleDestAddr = DestElementAddr;
3320 AllocaInst *LocalStorage = nullptr;
3321
3322 if (IsByRefElem) {
3323 assert(RI.ByRefElementType && "Expected by-ref element type to be set");
3324 assert(RI.ByRefAllocatedType &&
3325 "Expected by-ref allocated type to be set");
3326 // For by-ref reductions, we need to copy from the remote lane the
3327 // actual value of the partial reduction computed by that remote lane;
3328 // rather than, for example, a pointer to that data or, even worse, a
3329 // pointer to the descriptor of the by-ref reduction element.
3330 ShuffleType = RI.ByRefElementType;
3331
3332 if (RI.DataPtrPtrGen) {
3333 // Descriptor-based by-ref: extract data pointer from descriptor.
// NOTE(review): ShuffleSrcAddr is passed twice; presumably the last
// parameter is an output that the callback overwrites -- confirm against
// the callback's declaration.
3334 InsertPointOrErrorTy GenResult = RI.DataPtrPtrGen(
3335 Builder.saveIP(), ShuffleSrcAddr, ShuffleSrcAddr);
3336
3337 if (!GenResult)
3338 return GenResult.takeError();
3339
3340 ShuffleSrcAddr =
3341 Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr);
3342
3343 {
3344 InsertPointTy OldIP = Builder.saveIP();
3345 Builder.restoreIP(AllocaIP);
3346
3347 LocalStorage = Builder.CreateAlloca(ShuffleType);
3348 Builder.restoreIP(OldIP);
3349 ShuffleDestAddr = LocalStorage;
3350 }
3351 } else {
3352 // Non-descriptor by-ref: the pointer already references data
3353 // directly. Shuffle into the destination alloca.
3354 ShuffleDestAddr = DestElementAddr;
3355 }
3356 }
3357
3358 shuffleAndStore(AllocaIP, ShuffleSrcAddr, ShuffleDestAddr, ShuffleType,
3359 RemoteLaneOffset, ReductionArrayTy, IsByRefElem);
3360
3361 if (IsByRefElem && RI.DataPtrPtrGen) {
3362 // Copy descriptor from source and update base_ptr to shuffled data
3363 Value *DestDescriptorAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3364 DestAlloca, Builder.getPtrTy(), ".ascast");
3365
3366 InsertPointOrErrorTy GenResult = generateReductionDescriptor(
3367 DestDescriptorAddr, LocalStorage, SrcElementAddr,
3368 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
3369
3370 if (!GenResult)
3371 return GenResult.takeError();
3372 }
3373 } else {
// No shuffle requested: copy within the current lane, in the way that fits
// the element's evaluation kind.
3374 switch (RI.EvaluationKind) {
3375 case EvalKind::Scalar: {
3376 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
3377 // Store the source element value to the dest element address.
3378 Builder.CreateStore(Elem, DestElementAddr);
3379 break;
3380 }
3381 case EvalKind::Complex: {
// Struct field 0 is copied as the real part and field 1 as the imaginary
// part.
3382 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3383 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
3384 Value *SrcReal = Builder.CreateLoad(
3385 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3386 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3387 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
3388 Value *SrcImg = Builder.CreateLoad(
3389 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3390
3391 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3392 RI.ElementType, DestElementAddr, 0, 0, ".realp");
3393 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3394 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
3395 Builder.CreateStore(SrcReal, DestRealPtr);
3396 Builder.CreateStore(SrcImg, DestImgPtr);
3397 break;
3398 }
3399 case EvalKind::Aggregate: {
// Aggregates are copied bytewise; the length is the element's store size.
3400 Value *SizeVal = Builder.getInt64(
3401 M.getDataLayout().getTypeStoreSize(RI.ElementType));
3402 Builder.CreateMemCpy(
3403 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3404 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3405 SizeVal, false);
3406 break;
3407 }
3408 };
3409 }
3410
3411 // Step 3.1: Modify reference in dest Reduce list as needed.
3412 // Modifying the reference in Reduce list to point to the newly
3413 // created element. The element is live in the current function
3414 // scope and that of functions it invokes (i.e., reduce_function).
3415 // RemoteReduceData[i] = (void*)&RemoteElem
3416 if (UpdateDestListPtr) {
3417 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3418 DestElementAddr, Builder.getPtrTy(),
3419 DestElementAddr->getName() + ".ascast");
3420 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
3421 }
3422 }
3423
3424 return Error::success();
3425}
3426
3427Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
3428 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
3429 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3430 InsertPointTy SavedIP = Builder.saveIP();
3431 LLVMContext &Ctx = M.getContext();
3432 FunctionType *FuncTy = FunctionType::get(
3433 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
3434 /* IsVarArg */ false);
3435 Function *WcFunc =
3437 "_omp_reduction_inter_warp_copy_func", &M);
3438 WcFunc->setCallingConv(Config.getRuntimeCC());
3439 WcFunc->setAttributes(FuncAttrs);
3440 WcFunc->addParamAttr(0, Attribute::NoUndef);
3441 WcFunc->addParamAttr(1, Attribute::NoUndef);
3442 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
3443 Builder.SetInsertPoint(EntryBB);
3444
3445 // ReduceList: thread local Reduce list.
3446 // At the stage of the computation when this function is called, partially
3447 // aggregated values reside in the first lane of every active warp.
3448 Argument *ReduceListArg = WcFunc->getArg(0);
3449 // NumWarps: number of warps active in the parallel region. This could
3450 // be smaller than 32 (max warps in a CTA) for partial block reduction.
3451 Argument *NumWarpsArg = WcFunc->getArg(1);
3452
3453 // This array is used as a medium to transfer, one reduce element at a time,
3454 // the data from the first lane of every warp to lanes in the first warp
3455 // in order to perform the final step of a reduction in a parallel region
3456 // (reduction across warps). The array is placed in NVPTX __shared__ memory
3457 // for reduced latency, as well as to have a distinct copy for concurrently
3458 // executing target regions. The array is declared with common linkage so
3459 // as to be shared across compilation units.
3460 StringRef TransferMediumName =
3461 "__openmp_nvptx_data_transfer_temporary_storage";
3462 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
3463 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
3464 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
3465 if (!TransferMedium) {
3466 TransferMedium = new GlobalVariable(
3467 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
3468 UndefValue::get(ArrayTy), TransferMediumName,
3469 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
3470 /*AddressSpace=*/3);
3471 }
3472
3473 // Get the CUDA thread id of the current OpenMP thread on the GPU.
3474 Value *GPUThreadID = getGPUThreadID();
3475 // nvptx_lane_id = nvptx_id % warpsize
3476 Value *LaneID = getNVPTXLaneID();
3477 // nvptx_warp_id = nvptx_id / warpsize
3478 Value *WarpID = getNVPTXWarpID();
3479
3480 InsertPointTy AllocaIP =
3481 InsertPointTy(Builder.GetInsertBlock(),
3482 Builder.GetInsertBlock()->getFirstInsertionPt());
3483 Type *Arg0Type = ReduceListArg->getType();
3484 Type *Arg1Type = NumWarpsArg->getType();
3485 Builder.restoreIP(AllocaIP);
3486 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
3487 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
3488 AllocaInst *NumWarpsAlloca =
3489 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
3490 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3491 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
3492 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3493 NumWarpsAlloca, Builder.getPtrTy(0),
3494 NumWarpsAlloca->getName() + ".ascast");
3495 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3496 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
3497 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
3498 InsertPointTy CodeGenIP =
3499 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
3500 Builder.restoreIP(CodeGenIP);
3501
3502 Value *ReduceList =
3503 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
3504
3505 for (auto En : enumerate(ReductionInfos)) {
3506 //
3507 // Warp master copies reduce element to transfer medium in __shared__
3508 // memory.
3509 //
3510 const ReductionInfo &RI = En.value();
3511 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
3512 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(
3513 IsByRefElem ? RI.ByRefElementType : RI.ElementType);
3514 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
3515 Type *CType = Builder.getIntNTy(TySize * 8);
3516
3517 unsigned NumIters = RealTySize / TySize;
3518 if (NumIters == 0)
3519 continue;
3520 Value *Cnt = nullptr;
3521 Value *CntAddr = nullptr;
3522 BasicBlock *PrecondBB = nullptr;
3523 BasicBlock *ExitBB = nullptr;
3524 if (NumIters > 1) {
3525 CodeGenIP = Builder.saveIP();
3526 Builder.restoreIP(AllocaIP);
3527 CntAddr =
3528 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
3529
3530 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
3531 CntAddr->getName() + ".ascast");
3532 Builder.restoreIP(CodeGenIP);
3533 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
3534 CntAddr,
3535 /*Volatile=*/false);
3536 PrecondBB = BasicBlock::Create(Ctx, "precond");
3537 ExitBB = BasicBlock::Create(Ctx, "exit");
3538 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
3539 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
3540 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
3541 /*Volatile=*/false);
3542 Value *Cmp = Builder.CreateICmpULT(
3543 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
3544 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
3545 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
3546 }
3547
3548 // kmpc_barrier.
3549 InsertPointOrErrorTy BarrierIP1 =
3550 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3551 omp::Directive::OMPD_unknown,
3552 /* ForceSimpleCall */ false,
3553 /* CheckCancelFlag */ true);
3554 if (!BarrierIP1)
3555 return BarrierIP1.takeError();
3556 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3557 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3558 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3559
3560 // if (lane_id == 0)
3561 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
3562 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3563 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3564
3565 // Reduce element = LocalReduceList[i]
3566 auto *RedListArrayTy =
3567 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3568 Type *IndexTy = Builder.getIndexTy(
3569 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3570 Value *ElemPtrPtr =
3571 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3572 {ConstantInt::get(IndexTy, 0),
3573 ConstantInt::get(IndexTy, En.index())});
3574 // elemptr = ((CopyType*)(elemptrptr)) + I
3575 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3576
3577 if (IsByRefElem && RI.DataPtrPtrGen) {
3578 InsertPointOrErrorTy GenRes =
3579 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3580
3581 if (!GenRes)
3582 return GenRes.takeError();
3583
3584 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3585 }
3586
3587 if (NumIters > 1)
3588 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
3589
3590 // Get pointer to location in transfer medium.
3591 // MediumPtr = &medium[warp_id]
3592 Value *MediumPtr = Builder.CreateInBoundsGEP(
3593 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
3594 // elem = *elemptr
3595 //*MediumPtr = elem
3596 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
3597 // Store the source element value to the dest element address.
3598 Builder.CreateStore(Elem, MediumPtr,
3599 /*IsVolatile*/ true);
3600 Builder.CreateBr(MergeBB);
3601
3602 // else
3603 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3604 Builder.CreateBr(MergeBB);
3605
3606 // endif
3607 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3608 InsertPointOrErrorTy BarrierIP2 =
3609 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3610 omp::Directive::OMPD_unknown,
3611 /* ForceSimpleCall */ false,
3612 /* CheckCancelFlag */ true);
3613 if (!BarrierIP2)
3614 return BarrierIP2.takeError();
3615
3616 // Warp 0 copies reduce element from transfer medium
3617 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
3618 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
3619 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
3620
3621 Value *NumWarpsVal =
3622 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
3623 // Up to 32 threads in warp 0 are active.
3624 Value *IsActiveThread =
3625 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
3626 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3627
3628 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
3629
3630 // SecMediumPtr = &medium[tid]
3631 // SrcMediumVal = *SrcMediumPtr
3632 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
3633 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
3634 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
3635 Value *TargetElemPtrPtr =
3636 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3637 {ConstantInt::get(IndexTy, 0),
3638 ConstantInt::get(IndexTy, En.index())});
3639 Value *TargetElemPtrVal =
3640 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
3641 Value *TargetElemPtr = TargetElemPtrVal;
3642
3643 if (IsByRefElem && RI.DataPtrPtrGen) {
3644 InsertPointOrErrorTy GenRes =
3645 RI.DataPtrPtrGen(Builder.saveIP(), TargetElemPtr, TargetElemPtr);
3646
3647 if (!GenRes)
3648 return GenRes.takeError();
3649
3650 TargetElemPtr = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtr);
3651 }
3652
3653 if (NumIters > 1)
3654 TargetElemPtr =
3655 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
3656
3657 // *TargetElemPtr = SrcMediumVal;
3658 Value *SrcMediumValue =
3659 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
3660 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
3661 Builder.CreateBr(W0MergeBB);
3662
3663 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
3664 Builder.CreateBr(W0MergeBB);
3665
3666 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
3667
3668 if (NumIters > 1) {
3669 Cnt = Builder.CreateNSWAdd(
3670 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
3671 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
3672
3673 auto *CurFn = Builder.GetInsertBlock()->getParent();
3674 emitBranch(PrecondBB);
3675 emitBlock(ExitBB, CurFn);
3676 }
3677 RealTySize %= TySize;
3678 }
3679 }
3680
3681 Builder.CreateRetVoid();
3682 Builder.restoreIP(SavedIP);
3683
3684 return WcFunc;
3685}
3686
3687Expected<Function *> OpenMPIRBuilder::emitShuffleAndReduceFunction(
3688 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3689 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3690 LLVMContext &Ctx = M.getContext();
3691 FunctionType *FuncTy =
3692 FunctionType::get(Builder.getVoidTy(),
3693 {Builder.getPtrTy(), Builder.getInt16Ty(),
3694 Builder.getInt16Ty(), Builder.getInt16Ty()},
3695 /* IsVarArg */ false);
3696 Function *SarFunc =
3698 "_omp_reduction_shuffle_and_reduce_func", &M);
3699 SarFunc->setCallingConv(Config.getRuntimeCC());
3700 SarFunc->setAttributes(FuncAttrs);
3701 SarFunc->addParamAttr(0, Attribute::NoUndef);
3702 SarFunc->addParamAttr(1, Attribute::NoUndef);
3703 SarFunc->addParamAttr(2, Attribute::NoUndef);
3704 SarFunc->addParamAttr(3, Attribute::NoUndef);
3705 SarFunc->addParamAttr(1, Attribute::SExt);
3706 SarFunc->addParamAttr(2, Attribute::SExt);
3707 SarFunc->addParamAttr(3, Attribute::SExt);
3708 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
3709 Builder.SetInsertPoint(EntryBB);
3710
3711 // Thread local Reduce list used to host the values of data to be reduced.
3712 Argument *ReduceListArg = SarFunc->getArg(0);
3713 // Current lane id; could be logical.
3714 Argument *LaneIDArg = SarFunc->getArg(1);
3715 // Offset of the remote source lane relative to the current lane.
3716 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
3717 // Algorithm version. This is expected to be known at compile time.
3718 Argument *AlgoVerArg = SarFunc->getArg(3);
3719
3720 Type *ReduceListArgType = ReduceListArg->getType();
3721 Type *LaneIDArgType = LaneIDArg->getType();
3722 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
3723 Value *ReduceListAlloca = Builder.CreateAlloca(
3724 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
3725 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3726 LaneIDArg->getName() + ".addr");
3727 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
3728 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
3729 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3730 AlgoVerArg->getName() + ".addr");
3731 ArrayType *RedListArrayTy =
3732 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3733
3734 // Create a local thread-private variable to host the Reduce list
3735 // from a remote lane.
3736 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
3737 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
3738
3739 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3740 ReduceListAlloca, ReduceListArgType,
3741 ReduceListAlloca->getName() + ".ascast");
3742 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3743 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
3744 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3745 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
3746 RemoteLaneOffsetAlloca->getName() + ".ascast");
3747 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3748 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
3749 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3750 RemoteReductionListAlloca, Builder.getPtrTy(),
3751 RemoteReductionListAlloca->getName() + ".ascast");
3752
3753 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3754 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
3755 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
3756 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
3757
3758 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
3759 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
3760 Value *RemoteLaneOffset =
3761 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
3762 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
3763
3764 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
3765
3766 // This loop iterates through the list of reduce elements and copies,
3767 // element by element, from a remote lane in the warp to RemoteReduceList,
3768 // hosted on the thread's stack.
3769 Error EmitRedLsCpRes = emitReductionListCopy(
3770 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
3771 ReduceList, RemoteListAddrCast, IsByRef,
3772 {RemoteLaneOffset, nullptr, nullptr});
3773
3774 if (EmitRedLsCpRes)
3775 return EmitRedLsCpRes;
3776
3777 // The actions to be performed on the Remote Reduce list is dependent
3778 // on the algorithm version.
3779 //
3780 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3781 // LaneId % 2 == 0 && Offset > 0):
3782 // do the reduction value aggregation
3783 //
3784 // The thread local variable Reduce list is mutated in place to host the
3785 // reduced data, which is the aggregated value produced from local and
3786 // remote lanes.
3787 //
3788 // Note that AlgoVer is expected to be a constant integer known at compile
3789 // time.
3790 // When AlgoVer==0, the first conjunction evaluates to true, making
3791 // the entire predicate true during compile time.
3792 // When AlgoVer==1, the second conjunction has only the second part to be
3793 // evaluated during runtime. Other conjunctions evaluates to false
3794 // during compile time.
3795 // When AlgoVer==2, the third conjunction has only the second part to be
3796 // evaluated during runtime. Other conjunctions evaluates to false
3797 // during compile time.
3798 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
3799 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3800 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
3801 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
3802 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
3803 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
3804 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
3805 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
3806 Value *RemoteOffsetComp =
3807 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
3808 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3809 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3810 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3811
3812 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3813 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3814 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3815
3816 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3817 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3818 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3819 ReduceList, Builder.getPtrTy());
3820 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3821 RemoteListAddrCast, Builder.getPtrTy());
3822 createRuntimeFunctionCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3823 ->addFnAttr(Attribute::NoUnwind);
3824 Builder.CreateBr(MergeBB);
3825
3826 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3827 Builder.CreateBr(MergeBB);
3828
3829 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3830
3831 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3832 // Reduce list.
3833 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3834 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3835 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3836
3837 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3838 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3839 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3840 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3841
3842 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3843
3844 EmitRedLsCpRes = emitReductionListCopy(
3845 AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, ReductionInfos,
3846 RemoteListAddrCast, ReduceList, IsByRef);
3847
3848 if (EmitRedLsCpRes)
3849 return EmitRedLsCpRes;
3850
3851 Builder.CreateBr(CpyMergeBB);
3852
3853 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3854 Builder.CreateBr(CpyMergeBB);
3855
3856 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3857
3858 Builder.CreateRetVoid();
3859
3860 return SarFunc;
3861}
3862
3864OpenMPIRBuilder::generateReductionDescriptor(
3865 Value *DescriptorAddr, Value *DataPtr, Value *SrcDescriptorAddr,
3866 Type *DescriptorType,
3867 function_ref<InsertPointOrErrorTy(InsertPointTy, Value *, Value *&)>
3868 DataPtrPtrGen) {
3869
3870 // Copy the source descriptor to preserve all metadata (rank, extents,
3871 // strides, etc.)
3872 Value *DescriptorSize =
3873 Builder.getInt64(M.getDataLayout().getTypeStoreSize(DescriptorType));
3874 Builder.CreateMemCpy(
3875 DescriptorAddr, M.getDataLayout().getPrefTypeAlign(DescriptorType),
3876 SrcDescriptorAddr, M.getDataLayout().getPrefTypeAlign(DescriptorType),
3877 DescriptorSize);
3878
3879 // Update the base pointer field to point to the local shuffled data
3880 Value *DataPtrField;
3881 InsertPointOrErrorTy GenResult =
3882 DataPtrPtrGen(Builder.saveIP(), DescriptorAddr, DataPtrField);
3883
3884 if (!GenResult)
3885 return GenResult.takeError();
3886
3887 Builder.CreateStore(Builder.CreatePointerBitCastOrAddrSpaceCast(
3888 DataPtr, Builder.getPtrTy(), ".ascast"),
3889 DataPtrField);
3890
3891 return Builder.saveIP();
3892}
3893
3894Expected<Function *> OpenMPIRBuilder::emitListToGlobalCopyFunction(
3895 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3896 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3897 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3898 LLVMContext &Ctx = M.getContext();
3899 FunctionType *FuncTy = FunctionType::get(
3900 Builder.getVoidTy(),
3901 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3902 /* IsVarArg */ false);
3903 Function *LtGCFunc =
3905 "_omp_reduction_list_to_global_copy_func", &M);
3906 LtGCFunc->setAttributes(FuncAttrs);
3907 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3908 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3909 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3910
3911 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3912 Builder.SetInsertPoint(EntryBlock);
3913
3914 // Buffer: global reduction buffer.
3915 Argument *BufferArg = LtGCFunc->getArg(0);
3916 // Idx: index of the buffer.
3917 Argument *IdxArg = LtGCFunc->getArg(1);
3918 // ReduceList: thread local Reduce list.
3919 Argument *ReduceListArg = LtGCFunc->getArg(2);
3920
3921 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3922 BufferArg->getName() + ".addr");
3923 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3924 IdxArg->getName() + ".addr");
3925 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3926 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3927 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3928 BufferArgAlloca, Builder.getPtrTy(),
3929 BufferArgAlloca->getName() + ".ascast");
3930 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3931 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3932 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3933 ReduceListArgAlloca, Builder.getPtrTy(),
3934 ReduceListArgAlloca->getName() + ".ascast");
3935
3936 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3937 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3938 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3939
3940 Value *LocalReduceList =
3941 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3942 Value *BufferArgVal =
3943 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3944 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3945 Type *IndexTy = Builder.getIndexTy(
3946 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3947 for (auto En : enumerate(ReductionInfos)) {
3948 const ReductionInfo &RI = En.value();
3949 auto *RedListArrayTy =
3950 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3951 // Reduce element = LocalReduceList[i]
3952 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3953 RedListArrayTy, LocalReduceList,
3954 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3955 // elemptr = ((CopyType*)(elemptrptr)) + I
3956 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3957
3958 // Global = Buffer.VD[Idx];
3959 Value *BufferVD =
3960 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3961 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3962 ReductionsBufferTy, BufferVD, 0, En.index());
3963
3964 switch (RI.EvaluationKind) {
3965 case EvalKind::Scalar: {
3966 Value *TargetElement;
3967
3968 if (IsByRef.empty() || !IsByRef[En.index()]) {
3969 TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3970 } else {
3971 if (RI.DataPtrPtrGen) {
3972 InsertPointOrErrorTy GenResult =
3973 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3974
3975 if (!GenResult)
3976 return GenResult.takeError();
3977
3978 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3979 }
3980 TargetElement = Builder.CreateLoad(RI.ByRefElementType, ElemPtr);
3981 }
3982
3983 Builder.CreateStore(TargetElement, GlobVal);
3984 break;
3985 }
3986 case EvalKind::Complex: {
3987 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3988 RI.ElementType, ElemPtr, 0, 0, ".realp");
3989 Value *SrcReal = Builder.CreateLoad(
3990 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3991 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3992 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3993 Value *SrcImg = Builder.CreateLoad(
3994 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3995
3996 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3997 RI.ElementType, GlobVal, 0, 0, ".realp");
3998 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3999 RI.ElementType, GlobVal, 0, 1, ".imagp");
4000 Builder.CreateStore(SrcReal, DestRealPtr);
4001 Builder.CreateStore(SrcImg, DestImgPtr);
4002 break;
4003 }
4004 case EvalKind::Aggregate: {
4005 Value *SizeVal =
4006 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
4007 Builder.CreateMemCpy(
4008 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
4009 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
4010 break;
4011 }
4012 }
4013 }
4014
4015 Builder.CreateRetVoid();
4016 Builder.restoreIP(OldIP);
4017 return LtGCFunc;
4018}
4019
4020Expected<Function *> OpenMPIRBuilder::emitListToGlobalReduceFunction(
4021 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
4022 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
4023 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
4024 LLVMContext &Ctx = M.getContext();
4025 FunctionType *FuncTy = FunctionType::get(
4026 Builder.getVoidTy(),
4027 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
4028 /* IsVarArg */ false);
4029 Function *LtGRFunc =
4031 "_omp_reduction_list_to_global_reduce_func", &M);
4032 LtGRFunc->setAttributes(FuncAttrs);
4033 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
4034 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
4035 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
4036
4037 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
4038 Builder.SetInsertPoint(EntryBlock);
4039
4040 // Buffer: global reduction buffer.
4041 Argument *BufferArg = LtGRFunc->getArg(0);
4042 // Idx: index of the buffer.
4043 Argument *IdxArg = LtGRFunc->getArg(1);
4044 // ReduceList: thread local Reduce list.
4045 Argument *ReduceListArg = LtGRFunc->getArg(2);
4046
4047 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
4048 BufferArg->getName() + ".addr");
4049 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
4050 IdxArg->getName() + ".addr");
4051 Value *ReduceListArgAlloca = Builder.CreateAlloca(
4052 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
4053 auto *RedListArrayTy =
4054 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4055
4056 // 1. Build a list of reduction variables.
4057 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4058 Value *LocalReduceList =
4059 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
4060
4061 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
4062
4063 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4064 BufferArgAlloca, Builder.getPtrTy(),
4065 BufferArgAlloca->getName() + ".ascast");
4066 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4067 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
4068 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4069 ReduceListArgAlloca, Builder.getPtrTy(),
4070 ReduceListArgAlloca->getName() + ".ascast");
4071 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4072 LocalReduceList, Builder.getPtrTy(),
4073 LocalReduceList->getName() + ".ascast");
4074
4075 Builder.CreateStore(BufferArg, BufferArgAddrCast);
4076 Builder.CreateStore(IdxArg, IdxArgAddrCast);
4077 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
4078
4079 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
4080 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
4081 Type *IndexTy = Builder.getIndexTy(
4082 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4083 for (auto En : enumerate(ReductionInfos)) {
4084 const ReductionInfo &RI = En.value();
4085
4086 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
4087 RedListArrayTy, LocalReduceListAddrCast,
4088 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4089 Value *BufferVD =
4090 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
4091 // Global = Buffer.VD[Idx];
4092 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
4093 ReductionsBufferTy, BufferVD, 0, En.index());
4094
4095 if (!IsByRef.empty() && IsByRef[En.index()] && RI.DataPtrPtrGen) {
4096 InsertPointTy OldIP = Builder.saveIP();
4097 Builder.restoreIP(AllocaIP);
4098
4099 Value *ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
4100 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
4101 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
4102
4103 Builder.restoreIP(OldIP);
4104
4105 // Get source descriptor from the reduce list argument
4106 Value *ReduceList =
4107 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4108 Value *SrcElementPtrPtr =
4109 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
4110 {ConstantInt::get(IndexTy, 0),
4111 ConstantInt::get(IndexTy, En.index())});
4112 Value *SrcDescriptorAddr =
4113 Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrPtr);
4114
4115 // Copy descriptor from source and update base_ptr to global buffer data
4116 InsertPointOrErrorTy GenResult =
4117 generateReductionDescriptor(ByRefAlloc, GlobValPtr, SrcDescriptorAddr,
4118 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
4119
4120 if (!GenResult)
4121 return GenResult.takeError();
4122
4123 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
4124 } else {
4125 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
4126 }
4127 }
4128
4129 // Call reduce_function(GlobalReduceList, ReduceList)
4130 Value *ReduceList =
4131 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4132 createRuntimeFunctionCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
4133 ->addFnAttr(Attribute::NoUnwind);
4134 Builder.CreateRetVoid();
4135 Builder.restoreIP(OldIP);
4136 return LtGRFunc;
4137}
4138
4139Expected<Function *> OpenMPIRBuilder::emitGlobalToListCopyFunction(
4140 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
4141 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
4142 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
4143 LLVMContext &Ctx = M.getContext();
4144 FunctionType *FuncTy = FunctionType::get(
4145 Builder.getVoidTy(),
4146 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
4147 /* IsVarArg */ false);
4148 Function *GtLCFunc =
4150 "_omp_reduction_global_to_list_copy_func", &M);
4151 GtLCFunc->setAttributes(FuncAttrs);
4152 GtLCFunc->addParamAttr(0, Attribute::NoUndef);
4153 GtLCFunc->addParamAttr(1, Attribute::NoUndef);
4154 GtLCFunc->addParamAttr(2, Attribute::NoUndef);
4155
4156 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLCFunc);
4157 Builder.SetInsertPoint(EntryBlock);
4158
4159 // Buffer: global reduction buffer.
4160 Argument *BufferArg = GtLCFunc->getArg(0);
4161 // Idx: index of the buffer.
4162 Argument *IdxArg = GtLCFunc->getArg(1);
4163 // ReduceList: thread local Reduce list.
4164 Argument *ReduceListArg = GtLCFunc->getArg(2);
4165
4166 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
4167 BufferArg->getName() + ".addr");
4168 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
4169 IdxArg->getName() + ".addr");
4170 Value *ReduceListArgAlloca = Builder.CreateAlloca(
4171 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
4172 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4173 BufferArgAlloca, Builder.getPtrTy(),
4174 BufferArgAlloca->getName() + ".ascast");
4175 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4176 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
4177 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4178 ReduceListArgAlloca, Builder.getPtrTy(),
4179 ReduceListArgAlloca->getName() + ".ascast");
4180 Builder.CreateStore(BufferArg, BufferArgAddrCast);
4181 Builder.CreateStore(IdxArg, IdxArgAddrCast);
4182 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
4183
4184 Value *LocalReduceList =
4185 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4186 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
4187 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
4188 Type *IndexTy = Builder.getIndexTy(
4189 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4190 for (auto En : enumerate(ReductionInfos)) {
4191 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
4192 auto *RedListArrayTy =
4193 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4194 // Reduce element = LocalReduceList[i]
4195 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
4196 RedListArrayTy, LocalReduceList,
4197 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4198 // elemptr = ((CopyType*)(elemptrptr)) + I
4199 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
4200 // Global = Buffer.VD[Idx];
4201 Value *BufferVD =
4202 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
4203 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
4204 ReductionsBufferTy, BufferVD, 0, En.index());
4205
4206 switch (RI.EvaluationKind) {
4207 case EvalKind::Scalar: {
4208 Type *ElemType = RI.ElementType;
4209
4210 if (!IsByRef.empty() && IsByRef[En.index()]) {
4211 ElemType = RI.ByRefElementType;
4212 if (RI.DataPtrPtrGen) {
4213 InsertPointOrErrorTy GenResult =
4214 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
4215
4216 if (!GenResult)
4217 return GenResult.takeError();
4218
4219 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
4220 }
4221 }
4222
4223 Value *TargetElement = Builder.CreateLoad(ElemType, GlobValPtr);
4224 Builder.CreateStore(TargetElement, ElemPtr);
4225 break;
4226 }
4227 case EvalKind::Complex: {
4228 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
4229 RI.ElementType, GlobValPtr, 0, 0, ".realp");
4230 Value *SrcReal = Builder.CreateLoad(
4231 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
4232 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
4233 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
4234 Value *SrcImg = Builder.CreateLoad(
4235 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
4236
4237 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
4238 RI.ElementType, ElemPtr, 0, 0, ".realp");
4239 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
4240 RI.ElementType, ElemPtr, 0, 1, ".imagp");
4241 Builder.CreateStore(SrcReal, DestRealPtr);
4242 Builder.CreateStore(SrcImg, DestImgPtr);
4243 break;
4244 }
4245 case EvalKind::Aggregate: {
4246 Value *SizeVal =
4247 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
4248 Builder.CreateMemCpy(
4249 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
4250 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
4251 SizeVal, false);
4252 break;
4253 }
4254 }
4255 }
4256
4257 Builder.CreateRetVoid();
4258 Builder.restoreIP(OldIP);
4259 return GtLCFunc;
4260}
4261
/// Emit the internal helper `_omp_reduction_global_to_list_reduce_func`, a
/// `void(ptr, i32, ptr)` function whose arguments are (Buffer, Idx,
/// ReduceList).
///
/// The generated function builds a local array holding one pointer per entry
/// of \p ReductionInfos. Each pointer addresses the matching field of element
/// `Idx` of the global buffer, which is indexed as \p ReductionsBufferTy. The
/// function then calls \p ReduceFn with (ReduceList, local array) and returns.
///
/// For an entry that is flagged in \p IsByRef and provides a DataPtrPtrGen
/// callback, a value of RI.ByRefAllocatedType is allocated at the top of the
/// entry block and filled in by generateReductionDescriptor; the local array
/// then stores that allocation instead of the raw buffer field pointer.
///
/// \param FuncAttrs attributes applied to the generated function.
/// \returns the generated function, or the error produced by
///          generateReductionDescriptor.
///
/// The builder's insertion point is saved on entry and restored before the
/// successful return. NOTE(review): the error return inside the loop does not
/// restore it.
4262Expected<Function *> OpenMPIRBuilder::emitGlobalToListReduceFunction(
4263 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
4264 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
4265 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
4266 LLVMContext &Ctx = M.getContext();
4267 auto *FuncTy = FunctionType::get(
4268 Builder.getVoidTy(),
4269 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
4270 /* IsVarArg */ false);
4271 Function *GtLRFunc =
 // NOTE(review): listing line 4272 (the start of this initializer) is missing
 // from this copy of the file.
4273 "_omp_reduction_global_to_list_reduce_func", &M);
4274 GtLRFunc->setAttributes(FuncAttrs);
4275 GtLRFunc->addParamAttr(0, Attribute::NoUndef);
4276 GtLRFunc->addParamAttr(1, Attribute::NoUndef);
4277 GtLRFunc->addParamAttr(2, Attribute::NoUndef);
4278
4279 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLRFunc);
4280 Builder.SetInsertPoint(EntryBlock);
4281
4282 // Buffer: global reduction buffer.
4283 Argument *BufferArg = GtLRFunc->getArg(0);
4284 // Idx: index of the buffer.
4285 Argument *IdxArg = GtLRFunc->getArg(1);
4286 // ReduceList: thread local Reduce list.
4287 Argument *ReduceListArg = GtLRFunc->getArg(2);
4288
 // Stack slots for the three incoming arguments.
4289 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
4290 BufferArg->getName() + ".addr");
4291 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
4292 IdxArg->getName() + ".addr");
4293 Value *ReduceListArgAlloca = Builder.CreateAlloca(
4294 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
4295 ArrayType *RedListArrayTy =
4296 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4297
4298 // 1. Build a list of reduction variables.
4299 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4300 Value *LocalReduceList =
4301 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
4302
 // Insertion point at the very start of the entry block; the by-ref
 // allocations created in the loop below are emitted here.
4303 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
4304
 // Cast every stack slot to the default pointer type before it is used.
4305 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4306 BufferArgAlloca, Builder.getPtrTy(),
4307 BufferArgAlloca->getName() + ".ascast");
4308 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4309 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
4310 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4311 ReduceListArgAlloca, Builder.getPtrTy(),
4312 ReduceListArgAlloca->getName() + ".ascast");
4313 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
4314 LocalReduceList, Builder.getPtrTy(),
4315 LocalReduceList->getName() + ".ascast");
4316
 // Spill the arguments into their slots.
4317 Builder.CreateStore(BufferArg, BufferArgAddrCast);
4318 Builder.CreateStore(IdxArg, IdxArgAddrCast);
4319 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
4320
4321 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
4322 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
4323 Type *IndexTy = Builder.getIndexTy(
4324 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4325 for (auto En : enumerate(ReductionInfos)) {
4326 const ReductionInfo &RI = En.value();
4327
 // Slot En.index() of the local pointer array.
4328 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
4329 RedListArrayTy, ReductionList,
4330 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4331 // Global = Buffer.VD[Idx];
4332 Value *BufferVD =
4333 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
4334 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
4335 ReductionsBufferTy, BufferVD, 0, En.index());
4336
4337 if (!IsByRef.empty() && IsByRef[En.index()] && RI.DataPtrPtrGen) {
 // This OldIP shadows the function-level OldIP; it only brackets the
 // temporary switch to AllocaIP.
4338 InsertPointTy OldIP = Builder.saveIP();
4339 Builder.restoreIP(AllocaIP);
4340
4341 Value *ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
4342 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
4343 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
4344
4345 Builder.restoreIP(OldIP);
4346
4347 // Get source descriptor from the reduce list
4348 Value *ReduceListVal =
4349 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4350 Value *SrcElementPtrPtr =
4351 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceListVal,
4352 {ConstantInt::get(IndexTy, 0),
4353 ConstantInt::get(IndexTy, En.index())});
4354 Value *SrcDescriptorAddr =
4355 Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrPtr);
4356
4357 // Copy descriptor from source and update base_ptr to global buffer data
4358 InsertPointOrErrorTy GenResult =
4359 generateReductionDescriptor(ByRefAlloc, GlobValPtr, SrcDescriptorAddr,
4360 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
4361 if (!GenResult)
4362 return GenResult.takeError();
4363
4364 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
4365 } else {
 // Otherwise the list entry points straight at the buffer field.
4366 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
4367 }
4368 }
4369
4370 // Call reduce_function(ReduceList, GlobalReduceList)
4371 Value *ReduceList =
4372 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4373 createRuntimeFunctionCall(ReduceFn, {ReduceList, ReductionList})
4374 ->addFnAttr(Attribute::NoUnwind);
4375 Builder.CreateRetVoid();
4376 Builder.restoreIP(OldIP);
4377 return GtLRFunc;
4378}
4379
/// Build the name of the reduction function associated with \p Name.
///
/// The result is \p Name immediately followed by the platform-specific name
/// that createPlatformSpecificName produces for the parts
/// {"omp", "reduction", "reduction_func"}.
4380std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
4381 std::string Suffix =
4382 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
4383 return (Name + Suffix).str();
4384}
4385
/// Emit the `void(ptr, ptr)` reduction function for a set of reductions.
///
/// The function is named getReductionFuncName(\p ReducerName), uses the
/// runtime calling convention from Config, carries \p FuncAttrs, and has both
/// parameters marked NoUndef. Its two arguments are treated as arrays of
/// pointers (LHS and RHS), one pointer per entry of \p ReductionInfos.
///
/// For each entry the LHS/RHS pointers are loaded from the arrays and cast to
/// the types of RI.Variable / RI.PrivateVariable. Depending on a branch whose
/// condition is not visible here, the pointers are either collected for the
/// ReductionGenClang fix-up loop further down, or fed (loaded first when the
/// entry is explicitly not by-ref) into RI.ReductionGen.
///
/// \returns the generated function, or the error reported by RI.ReductionGen.
///          If the builder has no insertion block after RI.ReductionGen, the
///          function is returned as-is, without a `ret` and without the
///          alloca hoisting step.
///
/// NOTE(review): listing lines 4388 (part of the parameter list), 4395 (the
/// initializer of ReductionFunc), 4447 and 4474 (branch conditions) are missing
/// from this copy. The body refers to `IsByRef`, and a call elsewhere in this
/// file passes (name, ReductionInfos, IsByRef, ReductionGenCBKind, FuncAttrs).
4386Expected<Function *> OpenMPIRBuilder::createReductionFunction(
4387 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
4389 AttributeList FuncAttrs) {
4390 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
4391 {Builder.getPtrTy(), Builder.getPtrTy()},
4392 /* IsVarArg */ false);
4393 std::string Name = getReductionFuncName(ReducerName);
4394 Function *ReductionFunc =
4396 ReductionFunc->setCallingConv(Config.getRuntimeCC());
4397 ReductionFunc->setAttributes(FuncAttrs);
4398 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
4399 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
4400 BasicBlock *EntryBB =
4401 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
4402 Builder.SetInsertPoint(EntryBB);
4403
4404 // Need to alloca memory here and deal with the pointers before getting
4405 // LHS/RHS pointers out
4406 Value *LHSArrayPtr = nullptr;
4407 Value *RHSArrayPtr = nullptr;
4408 Argument *Arg0 = ReductionFunc->getArg(0);
4409 Argument *Arg1 = ReductionFunc->getArg(1);
4410 Type *Arg0Type = Arg0->getType();
4411 Type *Arg1Type = Arg1->getType();
4412
 // Spill both arguments to stack slots and reload them.
4413 Value *LHSAlloca =
4414 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
4415 Value *RHSAlloca =
4416 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
4417 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4418 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
4419 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4420 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
4421 Builder.CreateStore(Arg0, LHSAddrCast);
4422 Builder.CreateStore(Arg1, RHSAddrCast);
4423 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
4424 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
4425
4426 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4427 Type *IndexTy = Builder.getIndexTy(
4428 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
 // Per-reduction LHS/RHS pointers, consumed by the fix-up loop below.
4429 SmallVector<Value *> LHSPtrs, RHSPtrs;
4430 for (auto En : enumerate(ReductionInfos)) {
4431 const ReductionInfo &RI = En.value();
 // RHS: element En.index() of the second argument's pointer array.
4432 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
4433 RedArrayTy, RHSArrayPtr,
4434 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4435 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
4436 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4437 RHSI8Ptr, RI.PrivateVariable->getType(),
4438 RHSI8Ptr->getName() + ".ascast");
4439
 // LHS: element En.index() of the first argument's pointer array.
4440 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
4441 RedArrayTy, LHSArrayPtr,
4442 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4443 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
4444 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4445 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
4446
 // NOTE(review): listing line 4447, which opens the branch closed by the
 // `} else {` below, is missing from this copy.
4448 LHSPtrs.emplace_back(LHSPtr);
4449 RHSPtrs.emplace_back(RHSPtr);
4450 } else {
4451 Value *LHS = LHSPtr;
4452 Value *RHS = RHSPtr;
4453
 // Values are loaded only when IsByRef is non-empty and marks this entry
 // as not by-ref; otherwise the pointers themselves are passed on.
4454 if (!IsByRef.empty() && !IsByRef[En.index()]) {
4455 LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
4456 RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
4457 }
4458
4459 Value *Reduced;
4460 InsertPointOrErrorTy AfterIP =
4461 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
4462 if (!AfterIP)
4463 return AfterIP.takeError();
 // This check runs before the returned insertion point is restored.
4464 if (!Builder.GetInsertBlock())
4465 return ReductionFunc;
4466
4467 Builder.restoreIP(*AfterIP);
4468
 // The result is written back under the same condition as the loads above.
4469 if (!IsByRef.empty() && !IsByRef[En.index()])
4470 Builder.CreateStore(Reduced, LHSPtr);
4471 }
4472 }
4473
 // NOTE(review): listing line 4474 is missing from this copy; it presumably
 // guards the loop below, which indexes LHSPtrs/RHSPtrs -- confirm.
4475 for (auto En : enumerate(ReductionInfos)) {
4476 unsigned Index = En.index();
4477 const ReductionInfo &RI = En.value();
4478 Value *LHSFixupPtr, *RHSFixupPtr;
4479 Builder.restoreIP(RI.ReductionGenClang(
4480 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
4481
4482 // Fix the callback code generated to use the correct Values for the LHS
4483 // and RHS
4484 LHSFixupPtr->replaceUsesWithIf(
4485 LHSPtrs[Index], [ReductionFunc](const Use &U) {
4486 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4487 ReductionFunc;
4488 });
4489 RHSFixupPtr->replaceUsesWithIf(
4490 RHSPtrs[Index], [ReductionFunc](const Use &U) {
4491 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4492 ReductionFunc;
4493 });
4494 }
4495
4496 Builder.CreateRetVoid();
4497 // Compiling with `-O0`, `alloca`s emitted in non-entry blocks are not hoisted
4498 // to the entry block (this is done for higher opt levels by later passes in
4499 // the pipeline). This has caused issues because non-entry `alloca`s force the
4500 // function to use dynamic stack allocations and we might run out of scratch
4501 // memory.
4502 hoistNonEntryAllocasToEntryBlock(ReductionFunc);
4503
4504 return ReductionFunc;
4505}
4506
/// Assertion-only sanity check over a list of reduction descriptors.
///
/// For every entry this asserts that Variable and PrivateVariable are
/// non-null, that at least one of ReductionGen / ReductionGenClang is set,
/// and that Variable has pointer type. When \p IsGPU is false it additionally
/// asserts that Variable and PrivateVariable have the same type.
///
/// NOTE(review): listing line 4508 (function name and first parameter) is
/// missing from this copy; callers in this file invoke it as
/// `checkReductionInfos(ReductionInfos, /*IsGPU*/ ...)`.
4507static void
4509 bool IsGPU) {
4510 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
 // Keeps RI referenced when the asserts below compile to nothing.
4511 (void)RI;
4512 assert(RI.Variable && "expected non-null variable");
4513 assert(RI.PrivateVariable && "expected non-null private variable");
4514 assert((RI.ReductionGen || RI.ReductionGenClang) &&
4515 "expected non-null reduction generator callback");
4516 if (!IsGPU) {
4517 assert(
4518 RI.Variable->getType() == RI.PrivateVariable->getType() &&
4519 "expected variables and their private equivalents to have the same "
4520 "type");
4521 }
4522 assert(RI.Variable->getType()->isPointerTy() &&
4523 "expected variables to be pointers");
4524 }
4525}
4526
/// Emit the GPU lowering of a set of reductions at \p CodeGenIP.
///
/// Outline of the emitted code, in order:
///  - create the reduction function (named after the enclosing function) and
///    the shuffle-and-reduce and inter-warp-copy helpers;
///  - allocate, at \p AllocaIP, an array of pointers and fill it with the
///    private variables (loaded through first for by-ref entries);
///  - call either the parallel or, when \p IsTeamsReduction is set, the teams
///    `*_reduce_nowait_v2` runtime entry point; the teams path additionally
///    emits the four list<->global copy/reduce helpers and a
///    `struct._globalized_locals_ty` buffer type;
///  - branch on `result == 1` into a block that performs the final
///    reduction into each RI.Variable.
///
/// Helper functions receive the function attributes of the current function
/// with OptimizeNone removed. Config is updated with the grid value
/// (\p GridValue if provided, otherwise getGridValue(T, ReductionFunc)) and
/// with setEmitLLVMUsed().
///
/// \returns the insertion point after the emitted code; a default-constructed
///          InsertPointTy if updateToLocation(\p Loc) fails; the current
///          insertion point if \p ReductionInfos is empty; or the first error
///          produced by a helper emitter or by RI.ReductionGen.
///
/// NOTE(review): listing lines 4527 (start of the signature), 4550 (the
/// condition opening the block that splits the insert block), 4668 (the
/// declaration of Pv2Ptr), 4735 (the declaration of ValueType) and 4740 (the
/// condition opening the ReductionGenClang branch) are missing from this copy.
/// IsNoWait is not referenced in the visible body.
4528 const LocationDescription &Loc, InsertPointTy AllocaIP,
4529 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
4530 ArrayRef<bool> IsByRef, bool IsNoWait, bool IsTeamsReduction,
4531 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
4532 unsigned ReductionBufNum, Value *SrcLocInfo) {
4533 if (!updateToLocation(Loc))
4534 return InsertPointTy();
4535 Builder.restoreIP(CodeGenIP);
4536 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
4537 LLVMContext &Ctx = M.getContext();
4538
4539 // Source location for the ident struct
4540 if (!SrcLocInfo) {
4541 uint32_t SrcLocStrSize;
4542 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4543 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4544 }
4545
4546 if (ReductionInfos.size() == 0)
4547 return Builder.saveIP();
4548
 // Stays null unless the block below splits the insert block.
4549 BasicBlock *ContinuationBlock = nullptr;
 // NOTE(review): listing line 4550, which opens the block closed at 4557, is
 // missing from this copy.
4551 // Copied code from createReductions
4552 BasicBlock *InsertBlock = Loc.IP.getBlock();
4553 ContinuationBlock =
4554 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4555 InsertBlock->getTerminator()->eraseFromParent();
4556 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
4557 }
4558
 // Helper functions get the current function's attributes minus OptimizeNone.
4559 Function *CurFunc = Builder.GetInsertBlock()->getParent();
4560 AttributeList FuncAttrs;
4561 AttrBuilder AttrBldr(Ctx);
4562 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
4563 AttrBldr.addAttribute(Attr);
4564 AttrBldr.removeAttribute(Attribute::OptimizeNone);
4565 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
4566
 // createReductionFunction moves the builder; remember where to come back to.
4567 CodeGenIP = Builder.saveIP();
4568 Expected<Function *> ReductionResult = createReductionFunction(
4569 Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, IsByRef,
4570 ReductionGenCBKind, FuncAttrs);
4571 if (!ReductionResult)
4572 return ReductionResult.takeError();
4573 Function *ReductionFunc = *ReductionResult;
4574 Builder.restoreIP(CodeGenIP);
4575
4576 // Set the grid value in the config needed for lowering later on
4577 if (GridValue.has_value())
4578 Config.setGridValue(GridValue.value());
4579 else
4580 Config.setGridValue(getGridValue(T, ReductionFunc));
4581
4582 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
4583 // RedList, shuffle_reduce_func, interwarp_copy_func);
4584 // or
4585 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
4586 Value *Res;
4587
4588 // 1. Build a list of reduction variables.
4589 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4590 auto Size = ReductionInfos.size();
4591 Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS());
4592 Type *FuncPtrTy =
4593 Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace());
4594 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
 // The list itself is allocated at AllocaIP; it is filled at CodeGenIP.
4595 CodeGenIP = Builder.saveIP();
4596 Builder.restoreIP(AllocaIP);
4597 Value *ReductionListAlloca =
4598 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
4599 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
4600 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
4601 Builder.restoreIP(CodeGenIP);
4602 Type *IndexTy = Builder.getIndexTy(
4603 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4604 for (auto En : enumerate(ReductionInfos)) {
4605 const ReductionInfo &RI = En.value();
4606 Value *ElemPtr = Builder.CreateInBoundsGEP(
4607 RedArrayTy, ReductionList,
4608 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4609
 // For by-ref entries the list stores the value loaded from the private
 // variable rather than the private variable itself.
4610 Value *PrivateVar = RI.PrivateVariable;
4611 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
4612 if (IsByRefElem)
4613 PrivateVar = Builder.CreateLoad(RI.ElementType, PrivateVar);
4614
4615 Value *CastElem =
4616 Builder.CreatePointerBitCastOrAddrSpaceCast(PrivateVar, PtrTy);
4617 Builder.CreateStore(CastElem, ElemPtr);
4618 }
 // The helper emitters below move the builder; restored at listing line 4631.
4619 CodeGenIP = Builder.saveIP();
4620 Expected<Function *> SarFunc = emitShuffleAndReduceFunction(
4621 ReductionInfos, ReductionFunc, FuncAttrs, IsByRef);
4622
4623 if (!SarFunc)
4624 return SarFunc.takeError();
4625
4626 Expected<Function *> CopyResult =
4627 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs, IsByRef);
4628 if (!CopyResult)
4629 return CopyResult.takeError();
4630 Function *WcFunc = *CopyResult;
4631 Builder.restoreIP(CodeGenIP);
4632
4633 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
4634
4635 // NOTE: ReductionDataSize is passed as the reduce_data_size
4636 // argument to __kmpc_nvptx_{parallel,teams}_reduce_nowait_v2, but
4637 // the runtime implementations do not currently use it. The teams
4638 // runtime reads ReductionDataSize from KernelEnvironmentTy instead
4639 // (set separately via TargetKernelDefaultAttrs). It is computed
4640 // here conservatively as max(element sizes) * N rather than the
4641 // exact sum, which over-calculates the size for mixed reduction
4642 // types but is harmless given the argument is unused.
4643 // TODO: Consider dropping this computation if the runtime API is
4644 // ever revised to remove the unused parameter.
4645 unsigned MaxDataSize = 0;
 // Element types in reduction order; also the fields of the teams buffer
 // struct created below.
4646 SmallVector<Type *> ReductionTypeArgs;
4647 for (auto En : enumerate(ReductionInfos)) {
4648 // Use ByRefElementType for by-ref reductions so that MaxDataSize matches
4649 // the actual data size stored in the global reduction buffer, consistent
4650 // with the ReductionsBufferTy struct used for GEP offsets below.
4651 Type *RedTypeArg = (!IsByRef.empty() && IsByRef[En.index()])
4652 ? En.value().ByRefElementType
4653 : En.value().ElementType;
4654 auto Size = M.getDataLayout().getTypeStoreSize(RedTypeArg);
4655 if (Size > MaxDataSize)
4656 MaxDataSize = Size;
4657 ReductionTypeArgs.emplace_back(RedTypeArg);
4658 }
4659 Value *ReductionDataSize =
4660 Builder.getInt64(MaxDataSize * ReductionInfos.size());
4661 if (!IsTeamsReduction) {
 // Parallel reduction: the two helpers are cast to FuncPtrTy before being
 // passed to the runtime.
4662 Value *SarFuncCast =
4663 Builder.CreatePointerBitCastOrAddrSpaceCast(*SarFunc, FuncPtrTy);
4664 Value *WcFuncCast =
4665 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy);
4666 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
4667 WcFuncCast};
 // NOTE(review): listing line 4668 (the declaration of Pv2Ptr) is missing
 // from this copy.
4669 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
4670 Res = createRuntimeFunctionCall(Pv2Ptr, Args);
4671 } else {
 // Teams reduction: emit the four list<->global helpers against a struct
 // type with one field per reduction.
4672 CodeGenIP = Builder.saveIP();
4673 StructType *ReductionsBufferTy = StructType::create(
4674 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
4675 Function *RedFixedBufferFn = getOrCreateRuntimeFunctionPtr(
4676 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
4677
4678 Expected<Function *> LtGCFunc = emitListToGlobalCopyFunction(
4679 ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
4680 if (!LtGCFunc)
4681 return LtGCFunc.takeError();
4682
4683 Expected<Function *> LtGRFunc = emitListToGlobalReduceFunction(
4684 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
4685 if (!LtGRFunc)
4686 return LtGRFunc.takeError();
4687
4688 Expected<Function *> GtLCFunc = emitGlobalToListCopyFunction(
4689 ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
4690 if (!GtLCFunc)
4691 return GtLCFunc.takeError();
4692
4693 Expected<Function *> GtLRFunc = emitGlobalToListReduceFunction(
4694 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
4695 if (!GtLRFunc)
4696 return GtLRFunc.takeError();
4697
4698 Builder.restoreIP(CodeGenIP);
4699
4700 Value *KernelTeamsReductionPtr = createRuntimeFunctionCall(
4701 RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
4702
4703 Value *Args3[] = {SrcLocInfo,
4704 KernelTeamsReductionPtr,
4705 Builder.getInt32(ReductionBufNum),
4706 ReductionDataSize,
4707 RL,
4708 *SarFunc,
4709 WcFunc,
4710 *LtGCFunc,
4711 *LtGRFunc,
4712 *GtLCFunc,
4713 *GtLRFunc};
4714
4715 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
4716 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
4717 Res = createRuntimeFunctionCall(TeamsReduceFn, Args3);
4718 }
4719
4720 // 5. Build if (res == 1)
4721 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
4722 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
4723 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
4724 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
4725
4726 // 6. Build then branch: where we have reduced values in the master
4727 // thread in each team.
4728 // __kmpc_end_reduce{_nowait}(<gtid>);
4729 // break;
4730 emitBlock(ThenBB, CurFunc);
4731
4732 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
4733 for (auto En : enumerate(ReductionInfos)) {
4734 const ReductionInfo &RI = En.value();
 // NOTE(review): listing line 4735 (the declaration of ValueType, used
 // below) is missing from this copy.
4736 Value *RedValue = RI.Variable;
4737 Value *RHS =
4738 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
4739
 // NOTE(review): listing line 4740, which opens the branch closed by the
 // `} else {` at 4763, is missing from this copy.
4741 Value *LHSPtr, *RHSPtr;
4742 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
4743 &LHSPtr, &RHSPtr, CurFunc));
4744
4745 // Fix the callback code generated to use the correct Values for the LHS
4746 // and RHS. Cast to match types before replacing (necessary to handle
4747 // different address spaces).
4748 if (LHSPtr->getType() != RedValue->getType())
4749 RedValue = Builder.CreatePointerBitCastOrAddrSpaceCast(
4750 RedValue, LHSPtr->getType());
4751 if (RHSPtr->getType() != RHS->getType())
4752 RHS =
4753 Builder.CreatePointerBitCastOrAddrSpaceCast(RHS, RHSPtr->getType());
4754
 // NOTE(review): both filters below only rewrite uses located in
 // ReductionFunc, while the callback above was handed CurFunc -- confirm
 // this is intended.
4755 LHSPtr->replaceUsesWithIf(RedValue, [ReductionFunc](const Use &U) {
4756 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4757 ReductionFunc;
4758 });
4759 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
4760 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4761 ReductionFunc;
4762 });
4763 } else {
 // Non-by-ref (or no by-ref information): reduce on the loaded value.
4764 if (IsByRef.empty() || !IsByRef[En.index()]) {
4765 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4766 "red.value." + Twine(En.index()));
4767 }
4768 Value *PrivateRedValue = Builder.CreateLoad(
4769 ValueType, RHS, "red.private.value" + Twine(En.index()));
4770 Value *Reduced;
4771 InsertPointOrErrorTy AfterIP =
4772 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4773 if (!AfterIP)
4774 return AfterIP.takeError();
4775 Builder.restoreIP(*AfterIP);
4776
 // NOTE(review): with an empty IsByRef the value is loaded above but the
 // result is not stored back here -- confirm this asymmetry is intended.
4777 if (!IsByRef.empty() && !IsByRef[En.index()])
4778 Builder.CreateStore(Reduced, RI.Variable);
4779 }
4780 }
4781 emitBlock(ExitBB, CurFunc);
4782 if (ContinuationBlock) {
4783 Builder.CreateBr(ContinuationBlock);
4784 Builder.SetInsertPoint(ContinuationBlock);
4785 }
4786 Config.setEmitLLVMUsed();
4787
4788 return Builder.saveIP();
4789}
4790
/// Helper that sets up a `void(ptr, ptr)` function type for a reduction
/// function named ".omp.reduction.func" in module `M`.
///
/// NOTE(review): listing lines 4791 (the signature) and 4796 (the start of the
/// final statement) are missing from this copy. A caller in this file uses it
/// as `Function *ReductionFunc = getFreshReductionFunc(*Module);`, so it
/// presumably returns the newly created function -- confirm against upstream.
4792 Type *VoidTy = Type::getVoidTy(M.getContext());
 // Despite the name, this is the unqualified pointer type of the context.
4793 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
4794 auto *FuncTy =
4795 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
4797 ".omp.reduction.func", &M);
4798}
4799
/// Fill in the body of \p ReductionFunc, whose two arguments are arrays of
/// pointers to the LHS and RHS reduction values.
///
/// A new unnamed basic block is appended to \p ReductionFunc and \p Builder is
/// pointed at it. When \p IsGPU is set the two arguments are first spilled to
/// stack slots and reloaded; otherwise they are used directly. For each
/// reduction the LHS and RHS values are loaded, combined through
/// RI.ReductionGen, and the result is stored back to the LHS pointer unless
/// the entry is by-ref. The function ends with `ret void`.
///
/// \p IsByRef is indexed unconditionally, so it must have one entry per
/// reduction.
///
/// \returns the error from RI.ReductionGen if it fails; success otherwise,
///          including the early exit taken when the builder has no insertion
///          block after a reduction was generated (no `ret` is emitted then).
///
/// NOTE(review): listing lines 4800 and 4802 (parts of the signature, including
/// the function name and the ReductionInfos parameter) and 4854 (the
/// declaration of AfterIP) are missing from this copy.
4801 Function *ReductionFunc,
4803 IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
4804 Module *Module = ReductionFunc->getParent();
4805 BasicBlock *ReductionFuncBlock =
4806 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
4807 Builder.SetInsertPoint(ReductionFuncBlock);
4808 Value *LHSArrayPtr = nullptr;
4809 Value *RHSArrayPtr = nullptr;
4810 if (IsGPU) {
4811 // Need to alloca memory here and deal with the pointers before getting
4812 // LHS/RHS pointers out
4813 //
4814 Argument *Arg0 = ReductionFunc->getArg(0);
4815 Argument *Arg1 = ReductionFunc->getArg(1);
4816 Type *Arg0Type = Arg0->getType();
4817 Type *Arg1Type = Arg1->getType();
4818
4819 Value *LHSAlloca =
4820 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
4821 Value *RHSAlloca =
4822 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
4823 Value *LHSAddrCast =
4824 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
4825 Value *RHSAddrCast =
4826 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
4827 Builder.CreateStore(Arg0, LHSAddrCast);
4828 Builder.CreateStore(Arg1, RHSAddrCast);
4829 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
4830 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
4831 } else {
4832 LHSArrayPtr = ReductionFunc->getArg(0);
4833 RHSArrayPtr = ReductionFunc->getArg(1);
4834 }
4835
4836 unsigned NumReductions = ReductionInfos.size();
4837 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4838
4839 for (auto En : enumerate(ReductionInfos)) {
4840 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
 // LHS: pointer En.index() of the first array, then the value behind it.
4841 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4842 RedArrayTy, LHSArrayPtr, 0, En.index());
4843 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
4844 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4845 LHSI8Ptr, RI.Variable->getType());
4846 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
 // RHS: same for the second array.
4847 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4848 RedArrayTy, RHSArrayPtr, 0, En.index());
4849 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
4850 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4851 RHSI8Ptr, RI.PrivateVariable->getType());
4852 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
4853 Value *Reduced;
 // NOTE(review): listing line 4854 (the declaration of AfterIP) is missing
 // from this copy.
4855 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
4856 if (!AfterIP)
4857 return AfterIP.takeError();
4858
4859 Builder.restoreIP(*AfterIP);
4860 // TODO: Consider flagging an error.
4861 if (!Builder.GetInsertBlock())
4862 return Error::success();
4863
4864 // store is inside of the reduction region when using by-ref
4865 if (!IsByRef[En.index()])
4866 Builder.CreateStore(Reduced, LHSPtr);
4867 }
4868 Builder.CreateRetVoid();
4869 return Error::success();
4870}
4871
/// Emit the reduction epilogue at \p Loc.
///
/// On GPU configurations this forwards to createReductionsGPU. Otherwise it:
///  - splits the insert block at \p Loc into the current block and a
///    "reduce.finalize" continuation block;
///  - allocates (before the terminator of the block of \p AllocaIP) and fills
///    an array of pointers to the private variables;
///  - calls the `__kmpc_reduce` / `__kmpc_reduce_nowait` runtime entry
///    (selected by \p IsNoWait) and switches on its result: 1 selects the
///    non-atomic block, 2 the atomic block, anything else the continuation;
///  - fills the non-atomic block via RI.ReductionGen followed by the matching
///    end-reduce runtime call;
///  - fills the atomic block via RI.AtomicReductionGen when every reduction
///    provides one and no entry is by-ref, otherwise marks it unreachable;
///  - populates the outlined reduction function that is passed to the runtime.
///
/// \p IsByRef must have exactly one entry per reduction (asserted).
///
/// \returns the insertion point at the continuation block; a
///          default-constructed InsertPointTy when updateToLocation fails or
///          a callback leaves the builder without an insertion block; the
///          current insertion point when \p ReductionInfos is empty; or the
///          first error reported by a callback.
///
/// NOTE(review): listing lines 4872 (start of the signature), 4933 (the
/// declaration of ReduceFunc), 4960 (the declaration of ValueType) and 4996
/// (the declaration of AfterIP in the atomic loop) are missing from this copy.
4873 const LocationDescription &Loc, InsertPointTy AllocaIP,
4874 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
4875 bool IsNoWait, bool IsTeamsReduction) {
4876 assert(ReductionInfos.size() == IsByRef.size());
 // The remaining createReductionsGPU parameters are not passed here;
 // presumably they have default arguments in the declaration -- confirm.
4877 if (Config.isGPU())
4878 return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
4879 IsByRef, IsNoWait, IsTeamsReduction);
4880
4881 checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
4882
4883 if (!updateToLocation(Loc))
4884 return InsertPointTy();
4885
4886 if (ReductionInfos.size() == 0)
4887 return Builder.saveIP();
4888
 // Everything after Loc moves to "reduce.finalize"; the branch that
 // splitBasicBlock left behind is dropped so new code can be appended.
4889 BasicBlock *InsertBlock = Loc.IP.getBlock();
4890 BasicBlock *ContinuationBlock =
4891 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4892 InsertBlock->getTerminator()->eraseFromParent();
4893
4894 // Create and populate array of type-erased pointers to private reduction
4895 // values.
4896 unsigned NumReductions = ReductionInfos.size();
4897 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4898 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
4899 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
4900
4901 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
4902
4903 for (auto En : enumerate(ReductionInfos)) {
4904 unsigned Index = En.index();
4905 const ReductionInfo &RI = En.value();
4906 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
4907 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
4908 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
4909 }
4910
4911 // Emit a call to the runtime function that orchestrates the reduction.
4912 // Declare the reduction function in the process.
4913 Type *IndexTy = Builder.getIndexTy(
4914 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4915 Function *Func = Builder.GetInsertBlock()->getParent();
4916 Module *Module = Func->getParent();
4917 uint32_t SrcLocStrSize;
4918 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 // Atomic mode is advertised only if every reduction has an atomic generator.
4919 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
4920 return RI.AtomicReductionGen;
4921 });
4922 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
4923 CanGenerateAtomic
4924 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
4925 : IdentFlag(0));
4926 Value *ThreadId = getOrCreateThreadID(Ident);
4927 Constant *NumVariables = Builder.getInt32(NumReductions);
4928 const DataLayout &DL = Module->getDataLayout();
4929 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
4930 Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
4931 Function *ReductionFunc = getFreshReductionFunc(*Module);
4932 Value *Lock = getOMPCriticalRegionLock(".reduction");
 // NOTE(review): listing line 4933 (the declaration of ReduceFunc) is missing
 // from this copy.
4934 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
4935 : RuntimeFunction::OMPRTL___kmpc_reduce);
4936 CallInst *ReduceCall =
4937 createRuntimeFunctionCall(ReduceFunc,
4938 {Ident, ThreadId, NumVariables, RedArraySize,
4939 RedArray, ReductionFunc, Lock},
4940 "reduce");
4941
4942 // Create final reduction entry blocks for the atomic and non-atomic case.
4943 // Emit IR that dispatches control flow to one of the blocks based on the
4944 // reduction supporting the atomic mode.
4945 BasicBlock *NonAtomicRedBlock =
4946 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
4947 BasicBlock *AtomicRedBlock =
4948 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
4949 SwitchInst *Switch =
4950 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
4951 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
4952 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
4953
4954 // Populate the non-atomic reduction using the elementwise reduction function.
4955 // This loads the elements from the global and private variables and reduces
4956 // them before storing back the result to the global variable.
4957 Builder.SetInsertPoint(NonAtomicRedBlock);
4958 for (auto En : enumerate(ReductionInfos)) {
4959 const ReductionInfo &RI = En.value();
 // NOTE(review): listing line 4960 (the declaration of ValueType, used
 // below) is missing from this copy.
4961 // We have one less load for by-ref case because that load is now inside of
4962 // the reduction region
4963 Value *RedValue = RI.Variable;
4964 if (!IsByRef[En.index()]) {
4965 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4966 "red.value." + Twine(En.index()));
4967 }
4968 Value *PrivateRedValue =
4969 Builder.CreateLoad(ValueType, RI.PrivateVariable,
4970 "red.private.value." + Twine(En.index()));
4971 Value *Reduced;
4972 InsertPointOrErrorTy AfterIP =
4973 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4974 if (!AfterIP)
4975 return AfterIP.takeError();
4976 Builder.restoreIP(*AfterIP);
4977
4978 if (!Builder.GetInsertBlock())
4979 return InsertPointTy();
4980 // for by-ref case, the load is inside of the reduction region
4981 if (!IsByRef[En.index()])
4982 Builder.CreateStore(Reduced, RI.Variable);
4983 }
4984 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
4985 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
4986 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
4987 createRuntimeFunctionCall(EndReduceFunc, {Ident, ThreadId, Lock});
4988 Builder.CreateBr(ContinuationBlock);
4989
4990 // Populate the atomic reduction using the atomic elementwise reduction
4991 // function. There are no loads/stores here because they will be happening
4992 // inside the atomic elementwise reduction.
4993 Builder.SetInsertPoint(AtomicRedBlock);
 // The atomic path is generated only when no entry is by-ref.
4994 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
4995 for (const ReductionInfo &RI : ReductionInfos) {
 // NOTE(review): listing line 4996 (the declaration of AfterIP) is missing
 // from this copy.
4997 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
4998 if (!AfterIP)
4999 return AfterIP.takeError();
5000 Builder.restoreIP(*AfterIP);
5001 if (!Builder.GetInsertBlock())
5002 return InsertPointTy();
5003 }
5004 Builder.CreateBr(ContinuationBlock);
5005 } else {
5006 Builder.CreateUnreachable();
5007 }
5008
5009 // Populate the outlined reduction function using the elementwise reduction
5010 // function. Partial values are extracted from the type-erased array of
5011 // pointers to private variables.
5012 Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
5013 IsByRef, /*IsGPU=*/false);
5014 if (Err)
5015 return Err;
5016
5017 if (!Builder.GetInsertBlock())
5018 return InsertPointTy();
5019
5020 Builder.SetInsertPoint(ContinuationBlock);
5021 return Builder.saveIP();
5022}
5023
5026 BodyGenCallbackTy BodyGenCB,
5027 FinalizeCallbackTy FiniCB) {
5028 if (!updateToLocation(Loc))
5029 return Loc.IP;
5030
5031 Directive OMPD = Directive::OMPD_master;
5032 uint32_t SrcLocStrSize;
5033 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5034 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5035 Value *ThreadId = getOrCreateThreadID(Ident);
5036 Value *Args[] = {Ident, ThreadId};
5037
5038 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
5039 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
5040
5041 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
5042 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
5043
5044 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5045 /*Conditional*/ true, /*hasFinalize*/ true);
5046}
5047
5050 BodyGenCallbackTy BodyGenCB,
5051 FinalizeCallbackTy FiniCB, Value *Filter) {
5052 if (!updateToLocation(Loc))
5053 return Loc.IP;
5054
5055 Directive OMPD = Directive::OMPD_masked;
5056 uint32_t SrcLocStrSize;
5057 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5058 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5059 Value *ThreadId = getOrCreateThreadID(Ident);
5060 Value *Args[] = {Ident, ThreadId, Filter};
5061 Value *ArgsEnd[] = {Ident, ThreadId};
5062
5063 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
5064 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
5065
5066 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
5067 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, ArgsEnd);
5068
5069 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5070 /*Conditional*/ true, /*hasFinalize*/ true);
5071}
5072
5074 llvm::FunctionCallee Callee,
5076 const llvm::Twine &Name) {
5077 llvm::CallInst *Call = Builder.CreateCall(
5078 Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
5079 Call->setDoesNotThrow();
5080 return Call;
5081}
5082
5083// Expects input basic block is dominated by BeforeScanBB.
5084// Once Scan directive is encountered, the code after scan directive should be
5085// dominated by AfterScanBB. Scan directive splits the code sequence to
5086// scan and input phase. Based on whether inclusive or exclusive
5087// clause is used in the scan directive and whether input loop or scan loop
5088// is lowered, it adds jumps to input and scan phase. First Scan loop is the
5089// input loop and second is the scan loop. The code generated handles only
5090// inclusive scans now.
5092 const LocationDescription &Loc, InsertPointTy AllocaIP,
5093 ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
5094 bool IsInclusive, ScanInfo *ScanRedInfo) {
5095 if (ScanRedInfo->OMPFirstScanLoop) {
5096 llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
5097 ScanVarsType, ScanRedInfo);
5098 if (Err)
5099 return Err;
5100 }
5101 if (!updateToLocation(Loc))
5102 return Loc.IP;
5103
5104 llvm::Value *IV = ScanRedInfo->IV;
5105
5106 if (ScanRedInfo->OMPFirstScanLoop) {
5107 // Emit buffer[i] = red; at the end of the input phase.
5108 for (size_t i = 0; i < ScanVars.size(); i++) {
5109 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
5110 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
5111 Type *DestTy = ScanVarsType[i];
5112 Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
5113 Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
5114
5115 Builder.CreateStore(Src, Val);
5116 }
5117 }
5118 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
5119 emitBlock(ScanRedInfo->OMPScanDispatch,
5120 Builder.GetInsertBlock()->getParent());
5121
5122 if (!ScanRedInfo->OMPFirstScanLoop) {
5123 IV = ScanRedInfo->IV;
5124 // Emit red = buffer[i]; at the entrance to the scan phase.
5125 // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
5126 for (size_t i = 0; i < ScanVars.size(); i++) {
5127 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
5128 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
5129 Type *DestTy = ScanVarsType[i];
5130 Value *SrcPtr =
5131 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
5132 Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
5133 Builder.CreateStore(Src, ScanVars[i]);
5134 }
5135 }
5136
5137 // TODO: Update it to CreateBr and remove dead blocks
5138 llvm::Value *CmpI = Builder.getInt1(true);
5139 if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
5140 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
5141 ScanRedInfo->OMPAfterScanBlock);
5142 } else {
5143 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
5144 ScanRedInfo->OMPBeforeScanBlock);
5145 }
5146 emitBlock(ScanRedInfo->OMPAfterScanBlock,
5147 Builder.GetInsertBlock()->getParent());
5148 Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
5149 return Builder.saveIP();
5150}
5151
5152Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
5153 InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
5154 ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
5155
5156 Builder.restoreIP(AllocaIP);
5157 // Create the shared pointer at alloca IP.
5158 for (size_t i = 0; i < ScanVars.size(); i++) {
5159 llvm::Value *BuffPtr =
5160 Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
5161 (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
5162 }
5163
5164 // Allocate temporary buffer by master thread
5165 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
5166 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
5167 Builder.restoreIP(CodeGenIP);
5168 Value *AllocSpan =
5169 Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
5170 for (size_t i = 0; i < ScanVars.size(); i++) {
5171 Type *IntPtrTy = Builder.getInt32Ty();
5172 Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
5173 Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
5174 Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
5175 AllocSpan, nullptr, "arr");
5176 Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
5177 }
5178 return Error::success();
5179 };
5180 // TODO: Perform finalization actions for variables. This has to be
5181 // called for variables which have destructors/finalizers.
5182 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
5183
5184 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
5185 llvm::Value *FilterVal = Builder.getInt32(0);
5187 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
5188
5189 if (!AfterIP)
5190 return AfterIP.takeError();
5191 Builder.restoreIP(*AfterIP);
5192 BasicBlock *InputBB = Builder.GetInsertBlock();
5193 if (InputBB->hasTerminator())
5194 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
5195 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
5196 if (!AfterIP)
5197 return AfterIP.takeError();
5198 Builder.restoreIP(*AfterIP);
5199
5200 return Error::success();
5201}
5202
5203Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
5204 ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
5205 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
5206 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
5207 Builder.restoreIP(CodeGenIP);
5208 for (ReductionInfo RedInfo : ReductionInfos) {
5209 Value *PrivateVar = RedInfo.PrivateVariable;
5210 Value *OrigVar = RedInfo.Variable;
5211 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
5212 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
5213
5214 Type *SrcTy = RedInfo.ElementType;
5215 Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
5216 "arrayOffset");
5217 Value *Src = Builder.CreateLoad(SrcTy, Val);
5218
5219 Builder.CreateStore(Src, OrigVar);
5220 Builder.CreateFree(Buff);
5221 }
5222 return Error::success();
5223 };
5224 // TODO: Perform finalization actions for variables. This has to be
5225 // called for variables which have destructors/finalizers.
5226 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
5227
5228 if (Instruction *TI = ScanRedInfo->OMPScanFinish->getTerminatorOrNull())
5229 Builder.SetInsertPoint(TI);
5230 else
5231 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
5232
5233 llvm::Value *FilterVal = Builder.getInt32(0);
5235 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
5236
5237 if (!AfterIP)
5238 return AfterIP.takeError();
5239 Builder.restoreIP(*AfterIP);
5240 BasicBlock *InputBB = Builder.GetInsertBlock();
5241 if (InputBB->hasTerminator())
5242 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
5243 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
5244 if (!AfterIP)
5245 return AfterIP.takeError();
5246 Builder.restoreIP(*AfterIP);
5247 return Error::success();
5248}
5249
5251 const LocationDescription &Loc,
5253 ScanInfo *ScanRedInfo) {
5254
5255 if (!updateToLocation(Loc))
5256 return Loc.IP;
5257 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
5258 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
5259 Builder.restoreIP(CodeGenIP);
5260 Function *CurFn = Builder.GetInsertBlock()->getParent();
5261 // for (int k = 0; k <= ceil(log2(n)); ++k)
5262 llvm::BasicBlock *LoopBB =
5263 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
5264 llvm::BasicBlock *ExitBB =
5265 splitBB(Builder, false, "omp.outer.log.scan.exit");
5267 Builder.GetInsertBlock()->getModule(),
5268 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
5269 llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
5270 llvm::Value *Arg =
5271 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
5272 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
5274 Builder.GetInsertBlock()->getModule(),
5275 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
5276 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
5277 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
5278 llvm::Value *NMin1 = Builder.CreateNUWSub(
5279 ScanRedInfo->Span,
5280 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
5281 Builder.SetInsertPoint(InputBB);
5282 Builder.CreateBr(LoopBB);
5283 emitBlock(LoopBB, CurFn);
5284 Builder.SetInsertPoint(LoopBB);
5285
5286 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5287 // size pow2k = 1;
5288 PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5289 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
5290 InputBB);
5291 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
5292 InputBB);
5293 // for (size i = n - 1; i >= 2 ^ k; --i)
5294 // tmp[i] op= tmp[i-pow2k];
5295 llvm::BasicBlock *InnerLoopBB =
5296 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
5297 llvm::BasicBlock *InnerExitBB =
5298 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
5299 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
5300 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
5301 emitBlock(InnerLoopBB, CurFn);
5302 Builder.SetInsertPoint(InnerLoopBB);
5303 PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5304 IVal->addIncoming(NMin1, LoopBB);
5305 for (ReductionInfo RedInfo : ReductionInfos) {
5306 Value *ReductionVal = RedInfo.PrivateVariable;
5307 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
5308 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
5309 Type *DestTy = RedInfo.ElementType;
5310 Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
5311 Value *LHSPtr =
5312 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
5313 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
5314 Value *RHSPtr =
5315 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
5316 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
5317 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
5318 llvm::Value *Result;
5319 InsertPointOrErrorTy AfterIP =
5320 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
5321 if (!AfterIP)
5322 return AfterIP.takeError();
5323 Builder.CreateStore(Result, LHSPtr);
5324 }
5325 llvm::Value *NextIVal = Builder.CreateNUWSub(
5326 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
5327 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
5328 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
5329 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
5330 emitBlock(InnerExitBB, CurFn);
5331 llvm::Value *Next = Builder.CreateNUWAdd(
5332 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
5333 Counter->addIncoming(Next, Builder.GetInsertBlock());
5334 // pow2k <<= 1;
5335 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
5336 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
5337 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
5338 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
5339 Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
5340 return Error::success();
5341 };
5342
5343 // TODO: Perform finalization actions for variables. This has to be
5344 // called for variables which have destructors/finalizers.
5345 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
5346
5347 llvm::Value *FilterVal = Builder.getInt32(0);
5349 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
5350
5351 if (!AfterIP)
5352 return AfterIP.takeError();
5353 Builder.restoreIP(*AfterIP);
5354 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
5355
5356 if (!AfterIP)
5357 return AfterIP.takeError();
5358 Builder.restoreIP(*AfterIP);
5359 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
5360 if (Err)
5361 return Err;
5362
5363 return AfterIP;
5364}
5365
5366Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
5367 llvm::function_ref<Error()> InputLoopGen,
5368 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
5369 ScanInfo *ScanRedInfo) {
5370
5371 {
5372 // Emit loop with input phase:
5373 // for (i: 0..<num_iters>) {
5374 // <input phase>;
5375 // buffer[i] = red;
5376 // }
5377 ScanRedInfo->OMPFirstScanLoop = true;
5378 Error Err = InputLoopGen();
5379 if (Err)
5380 return Err;
5381 }
5382 {
5383 // Emit loop with scan phase:
5384 // for (i: 0..<num_iters>) {
5385 // red = buffer[i];
5386 // <scan phase>;
5387 // }
5388 ScanRedInfo->OMPFirstScanLoop = false;
5389 Error Err = ScanLoopGen(Builder.saveIP());
5390 if (Err)
5391 return Err;
5392 }
5393 return Error::success();
5394}
5395
5396void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
5397 Function *Fun = Builder.GetInsertBlock()->getParent();
5398 ScanRedInfo->OMPScanDispatch =
5399 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
5400 ScanRedInfo->OMPAfterScanBlock =
5401 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
5402 ScanRedInfo->OMPBeforeScanBlock =
5403 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
5404 ScanRedInfo->OMPScanLoopExit =
5405 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
5406}
5408 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
5409 BasicBlock *PostInsertBefore, const Twine &Name) {
5410 Module *M = F->getParent();
5411 LLVMContext &Ctx = M->getContext();
5412 Type *IndVarTy = TripCount->getType();
5413
5414 // Create the basic block structure.
5415 BasicBlock *Preheader =
5416 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
5417 BasicBlock *Header =
5418 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
5419 BasicBlock *Cond =
5420 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
5421 BasicBlock *Body =
5422 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
5423 BasicBlock *Latch =
5424 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
5425 BasicBlock *Exit =
5426 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
5427 BasicBlock *After =
5428 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
5429
5430 // Use specified DebugLoc for new instructions.
5431 Builder.SetCurrentDebugLocation(DL);
5432
5433 Builder.SetInsertPoint(Preheader);
5434 Builder.CreateBr(Header);
5435
5436 Builder.SetInsertPoint(Header);
5437 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
5438 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
5439 Builder.CreateBr(Cond);
5440
5441 Builder.SetInsertPoint(Cond);
5442 Value *Cmp =
5443 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
5444 Builder.CreateCondBr(Cmp, Body, Exit);
5445
5446 Builder.SetInsertPoint(Body);
5447 Builder.CreateBr(Latch);
5448
5449 Builder.SetInsertPoint(Latch);
5450 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
5451 "omp_" + Name + ".next", /*HasNUW=*/true);
5452 Builder.CreateBr(Header);
5453 IndVarPHI->addIncoming(Next, Latch);
5454
5455 Builder.SetInsertPoint(Exit);
5456 Builder.CreateBr(After);
5457
5458 // Remember and return the canonical control flow.
5459 LoopInfos.emplace_front();
5460 CanonicalLoopInfo *CL = &LoopInfos.front();
5461
5462 CL->Header = Header;
5463 CL->Cond = Cond;
5464 CL->Latch = Latch;
5465 CL->Exit = Exit;
5466
5467#ifndef NDEBUG
5468 CL->assertOK();
5469#endif
5470 return CL;
5471}
5472
5475 LoopBodyGenCallbackTy BodyGenCB,
5476 Value *TripCount, const Twine &Name) {
5477 BasicBlock *BB = Loc.IP.getBlock();
5478 BasicBlock *NextBB = BB->getNextNode();
5479
5480 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
5481 NextBB, NextBB, Name);
5482 BasicBlock *After = CL->getAfter();
5483
5484 // If location is not set, don't connect the loop.
5485 if (updateToLocation(Loc)) {
5486 // Split the loop at the insertion point: Branch to the preheader and move
5487 // every following instruction to after the loop (the After BB). Also, the
5488 // new successor is the loop's after block.
5489 spliceBB(Builder, After, /*CreateBranch=*/false);
5490 Builder.CreateBr(CL->getPreheader());
5491 }
5492
5493 // Emit the body content. We do it after connecting the loop to the CFG to
5494 // avoid that the callback encounters degenerate BBs.
5495 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
5496 return Err;
5497
5498#ifndef NDEBUG
5499 CL->assertOK();
5500#endif
5501 return CL;
5502}
5503
5505 ScanInfos.emplace_front();
5506 ScanInfo *Result = &ScanInfos.front();
5507 return Result;
5508}
5509
5513 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
5514 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
5515 LocationDescription ComputeLoc =
5516 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5517 updateToLocation(ComputeLoc);
5518
5520
5522 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
5523 ScanRedInfo->Span = TripCount;
5524 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
5525 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
5526
5527 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
5528 Builder.restoreIP(CodeGenIP);
5529 ScanRedInfo->IV = IV;
5530 createScanBBs(ScanRedInfo);
5531 BasicBlock *InputBlock = Builder.GetInsertBlock();
5532 Instruction *Terminator = InputBlock->getTerminator();
5533 assert(Terminator->getNumSuccessors() == 1);
5534 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
5535 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
5536 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
5537 Builder.GetInsertBlock()->getParent());
5538 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
5539 emitBlock(ScanRedInfo->OMPScanLoopExit,
5540 Builder.GetInsertBlock()->getParent());
5541 Builder.CreateBr(ContinueBlock);
5542 Builder.SetInsertPoint(
5543 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
5544 return BodyGenCB(Builder.saveIP(), IV);
5545 };
5546
5547 const auto &&InputLoopGen = [&]() -> Error {
5549 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
5550 ComputeIP, Name, true, ScanRedInfo);
5551 if (!LoopInfo)
5552 return LoopInfo.takeError();
5553 Result.push_back(*LoopInfo);
5554 Builder.restoreIP((*LoopInfo)->getAfterIP());
5555 return Error::success();
5556 };
5557 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
5559 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
5560 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
5561 if (!LoopInfo)
5562 return LoopInfo.takeError();
5563 Result.push_back(*LoopInfo);
5564 Builder.restoreIP((*LoopInfo)->getAfterIP());
5565 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
5566 return Error::success();
5567 };
5568 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
5569 if (Err)
5570 return Err;
5571 return Result;
5572}
5573
5575 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
5576 bool IsSigned, bool InclusiveStop, const Twine &Name) {
5577
5578 // Consider the following difficulties (assuming 8-bit signed integers):
5579 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
5580 // DO I = 1, 100, 50
5581 /// * A \p Step of INT_MIN cannot not be normalized to a positive direction:
5582 // DO I = 100, 0, -128
5583
5584 // Start, Stop and Step must be of the same integer type.
5585 auto *IndVarTy = cast<IntegerType>(Start->getType());
5586 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
5587 assert(IndVarTy == Step->getType() && "Step type mismatch");
5588
5590
5591 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
5592 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
5593
5594 // Like Step, but always positive.
5595 Value *Incr = Step;
5596
5597 // Distance between Start and Stop; always positive.
5598 Value *Span;
5599
5600 // Condition whether there are no iterations are executed at all, e.g. because
5601 // UB < LB.
5602 Value *ZeroCmp;
5603
5604 if (IsSigned) {
5605 // Ensure that increment is positive. If not, negate and invert LB and UB.
5606 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
5607 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
5608 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
5609 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
5610 Span = Builder.CreateSub(UB, LB, "", false, true);
5611 ZeroCmp = Builder.CreateICmp(
5612 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
5613 } else {
5614 Span = Builder.CreateSub(Stop, Start, "", true);
5615 ZeroCmp = Builder.CreateICmp(
5616 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
5617 }
5618
5619 Value *CountIfLooping;
5620 if (InclusiveStop) {
5621 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
5622 } else {
5623 // Avoid incrementing past stop since it could overflow.
5624 Value *CountIfTwo = Builder.CreateAdd(
5625 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
5626 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
5627 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
5628 }
5629
5630 return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
5631 "omp_" + Name + ".tripcount");
5632}
5633
5636 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
5637 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
5638 ScanInfo *ScanRedInfo) {
5639 LocationDescription ComputeLoc =
5640 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5641
5643 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
5644
5645 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
5646 Builder.restoreIP(CodeGenIP);
5647 Value *Span = Builder.CreateMul(IV, Step);
5648 Value *IndVar = Builder.CreateAdd(Span, Start);
5649 if (InScan)
5650 ScanRedInfo->IV = IndVar;
5651 return BodyGenCB(Builder.saveIP(), IndVar);
5652 };
5653 LocationDescription LoopLoc =
5654 ComputeIP.isSet()
5655 ? Loc
5656 : LocationDescription(Builder.saveIP(),
5657 Builder.getCurrentDebugLocation());
5658 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
5659}
5660
5661// Returns an LLVM function to call for initializing loop bounds using OpenMP
5662// static scheduling for composite `distribute parallel for` depending on
5663// `type`. Only i32 and i64 are supported by the runtime. Always interpret
5664// integers as unsigned similarly to CanonicalLoopInfo.
5665static FunctionCallee
5667 OpenMPIRBuilder &OMPBuilder) {
5668 unsigned Bitwidth = Ty->getIntegerBitWidth();
5669 if (Bitwidth == 32)
5670 return OMPBuilder.getOrCreateRuntimeFunction(
5671 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
5672 if (Bitwidth == 64)
5673 return OMPBuilder.getOrCreateRuntimeFunction(
5674 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
5675 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5676}
5677
5678// Returns an LLVM function to call for initializing loop bounds using OpenMP
5679// static scheduling depending on `type`. Only i32 and i64 are supported by the
5680// runtime. Always interpret integers as unsigned similarly to
5681// CanonicalLoopInfo.
5683 OpenMPIRBuilder &OMPBuilder) {
5684 unsigned Bitwidth = Ty->getIntegerBitWidth();
5685 if (Bitwidth == 32)
5686 return OMPBuilder.getOrCreateRuntimeFunction(
5687 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
5688 if (Bitwidth == 64)
5689 return OMPBuilder.getOrCreateRuntimeFunction(
5690 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
5691 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5692}
5693
5694OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
5695 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5696 WorksharingLoopType LoopType, bool NeedsBarrier, bool HasDistSchedule,
5697 OMPScheduleType DistScheduleSchedType) {
5698 assert(CLI->isValid() && "Requires a valid canonical loop");
5699 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5700 "Require dedicated allocate IP");
5701
5702 // Set up the source location value for OpenMP runtime.
5703 Builder.restoreIP(CLI->getPreheaderIP());
5704 Builder.SetCurrentDebugLocation(DL);
5705
5706 uint32_t SrcLocStrSize;
5707 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5709 switch (LoopType) {
5710 case WorksharingLoopType::ForStaticLoop:
5711 Flag = OMP_IDENT_FLAG_WORK_LOOP;
5712 break;
5713 case WorksharingLoopType::DistributeStaticLoop:
5714 Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE;
5715 break;
5716 case WorksharingLoopType::DistributeForStaticLoop:
5717 Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE | OMP_IDENT_FLAG_WORK_LOOP;
5718 break;
5719 }
5720 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize, Flag);
5721
5722 // Declare useful OpenMP runtime functions.
5723 Value *IV = CLI->getIndVar();
5724 Type *IVTy = IV->getType();
5725 FunctionCallee StaticInit =
5726 LoopType == WorksharingLoopType::DistributeForStaticLoop
5727 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
5728 : getKmpcForStaticInitForType(IVTy, M, *this);
5729 FunctionCallee StaticFini =
5730 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5731
5732 // Allocate space for computed loop bounds as expected by the "init" function.
5733 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5734
5735 Type *I32Type = Type::getInt32Ty(M.getContext());
5736 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5737 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5738 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5739 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5740 CLI->setLastIter(PLastIter);
5741
5742 // At the end of the preheader, prepare for calling the "init" function by
5743 // storing the current loop bounds into the allocated space. A canonical loop
5744 // always iterates from 0 to trip-count with step 1. Note that "init" expects
5745 // and produces an inclusive upper bound.
5746 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5747 Constant *Zero = ConstantInt::get(IVTy, 0);
5748 Constant *One = ConstantInt::get(IVTy, 1);
5749 Builder.CreateStore(Zero, PLowerBound);
5750 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
5751 Builder.CreateStore(UpperBound, PUpperBound);
5752 Builder.CreateStore(One, PStride);
5753
5754 Value *ThreadNum =
5755 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
5756
5757 OMPScheduleType SchedType =
5758 (LoopType == WorksharingLoopType::DistributeStaticLoop)
5759 ? OMPScheduleType::OrderedDistribute
5761 Constant *SchedulingType =
5762 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5763
5764 // Call the "init" function and update the trip count of the loop with the
5765 // value it produced.
5766 auto BuildInitCall = [LoopType, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5767 PUpperBound, IVTy, PStride, One, Zero, StaticInit,
5768 this](Value *SchedulingType, auto &Builder) {
5769 SmallVector<Value *, 10> Args({SrcLoc, ThreadNum, SchedulingType, PLastIter,
5770 PLowerBound, PUpperBound});
5771 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5772 Value *PDistUpperBound =
5773 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
5774 Args.push_back(PDistUpperBound);
5775 }
5776 Args.append({PStride, One, Zero});
5777 createRuntimeFunctionCall(StaticInit, Args);
5778 };
5779 BuildInitCall(SchedulingType, Builder);
5780 if (HasDistSchedule &&
5781 LoopType != WorksharingLoopType::DistributeStaticLoop) {
5782 Constant *DistScheduleSchedType = ConstantInt::get(
5783 I32Type, static_cast<int>(omp::OMPScheduleType::OrderedDistribute));
5784 // We want to emit a second init function call for the dist_schedule clause
5785 // to the Distribute construct. This should only be done however if a
5786 // Workshare Loop is nested within a Distribute Construct
5787 BuildInitCall(DistScheduleSchedType, Builder);
5788 }
5789 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
5790 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
5791 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
5792 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
5793 CLI->setTripCount(TripCount);
5794
5795 // Update all uses of the induction variable except the one in the condition
5796 // block that compares it with the actual upper bound, and the increment in
5797 // the latch block.
5798
5799 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
5800 Builder.SetInsertPoint(CLI->getBody(),
5801 CLI->getBody()->getFirstInsertionPt());
5802 Builder.SetCurrentDebugLocation(DL);
5803 return Builder.CreateAdd(OldIV, LowerBound);
5804 });
5805
5806 // In the "exit" block, call the "fini" function.
5807 Builder.SetInsertPoint(CLI->getExit(),
5808 CLI->getExit()->getTerminator()->getIterator());
5809 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5810
5811 // Add the barrier if requested.
5812 if (NeedsBarrier) {
5813 InsertPointOrErrorTy BarrierIP =
5815 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5816 /* CheckCancelFlag */ false);
5817 if (!BarrierIP)
5818 return BarrierIP.takeError();
5819 }
5820
5821 InsertPointTy AfterIP = CLI->getAfterIP();
5822 CLI->invalidate();
5823
5824 return AfterIP;
5825}
5826
// Forward declarations of two metadata helpers that are called by the
// functions below before their definitions appear.
// NOTE(review): the definitions are not visible in this part of the file.
5827static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup,
5828 LoopInfo &LI);
5829static void addLoopMetadata(CanonicalLoopInfo *Loop,
5830 ArrayRef<Metadata *> Properties);
5831
5833 LLVMContext &Ctx, Loop *Loop,
5835 SmallVector<Metadata *> &LoopMDList) {
5836 SmallSet<BasicBlock *, 8> Reachable;
5837
5838 // Get the basic blocks from the loop in which memref instructions
5839 // can be found.
5840 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
5841 // preferably without running any passes.
5842 for (BasicBlock *Block : Loop->getBlocks()) {
5843 if (Block == CLI->getCond() || Block == CLI->getHeader())
5844 continue;
5845 Reachable.insert(Block);
5846 }
5847
5848 // Add access group metadata to memory-access instructions.
5849 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5850 for (BasicBlock *BB : Reachable)
5851 addAccessGroupMetadata(BB, AccessGroup, LoopInfo);
5852 // TODO: If the loop has existing parallel access metadata, have
5853 // to combine two lists.
5854 LoopMDList.push_back(MDNode::get(
5855 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5856}
5857
5859OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
5860 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5861 bool NeedsBarrier, Value *ChunkSize, OMPScheduleType SchedType,
5862 Value *DistScheduleChunkSize, OMPScheduleType DistScheduleSchedType) {
5863 assert(CLI->isValid() && "Requires a valid canonical loop");
5864 assert((ChunkSize || DistScheduleChunkSize) && "Chunk size is required");
5865
5866 LLVMContext &Ctx = CLI->getFunction()->getContext();
5867 Value *IV = CLI->getIndVar();
5868 Value *OrigTripCount = CLI->getTripCount();
5869 Type *IVTy = IV->getType();
5870 assert(IVTy->getIntegerBitWidth() <= 64 &&
5871 "Max supported tripcount bitwidth is 64 bits");
5872 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
5873 : Type::getInt64Ty(Ctx);
5874 Type *I32Type = Type::getInt32Ty(M.getContext());
5875 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
5876 Constant *One = ConstantInt::get(InternalIVTy, 1);
5877
5878 Function *F = CLI->getFunction();
5879 // Blocks must have terminators.
5880 // FIXME: Don't run analyses on incomplete/invalid IR.
5882 for (BasicBlock &BB : *F)
5883 if (!BB.hasTerminator())
5884 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
5886 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5887 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5888 LoopAnalysis LIA;
5889 LoopInfo &&LI = LIA.run(*F, FAM);
5890 for (Instruction *I : UIs)
5891 I->eraseFromParent();
5892 Loop *L = LI.getLoopFor(CLI->getHeader());
5893 SmallVector<Metadata *> LoopMDList;
5894 if (ChunkSize || DistScheduleChunkSize)
5895 applyParallelAccessesMetadata(CLI, Ctx, L, LI, LoopMDList);
5896 addLoopMetadata(CLI, LoopMDList);
5897
5898 // Declare useful OpenMP runtime functions.
5899 FunctionCallee StaticInit =
5900 getKmpcForStaticInitForType(InternalIVTy, M, *this);
5901 FunctionCallee StaticFini =
5902 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5903
5904 // Allocate space for computed loop bounds as expected by the "init" function.
5905 Builder.restoreIP(AllocaIP);
5906 Builder.SetCurrentDebugLocation(DL);
5907 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5908 Value *PLowerBound =
5909 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
5910 Value *PUpperBound =
5911 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
5912 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
5913 CLI->setLastIter(PLastIter);
5914
5915 // Set up the source location value for the OpenMP runtime.
5916 Builder.restoreIP(CLI->getPreheaderIP());
5917 Builder.SetCurrentDebugLocation(DL);
5918
5919 // TODO: Detect overflow in ubsan or max-out with current tripcount.
5920 Value *CastedChunkSize = Builder.CreateZExtOrTrunc(
5921 ChunkSize ? ChunkSize : Zero, InternalIVTy, "chunksize");
5922 Value *CastedDistScheduleChunkSize = Builder.CreateZExtOrTrunc(
5923 DistScheduleChunkSize ? DistScheduleChunkSize : Zero, InternalIVTy,
5924 "distschedulechunksize");
5925 Value *CastedTripCount =
5926 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
5927
5928 Constant *SchedulingType =
5929 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5930 Constant *DistSchedulingType =
5931 ConstantInt::get(I32Type, static_cast<int>(DistScheduleSchedType));
5932 Builder.CreateStore(Zero, PLowerBound);
5933 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
5934 Value *IsTripCountZero = Builder.CreateICmpEQ(CastedTripCount, Zero);
5935 Value *UpperBound =
5936 Builder.CreateSelect(IsTripCountZero, Zero, OrigUpperBound);
5937 Builder.CreateStore(UpperBound, PUpperBound);
5938 Builder.CreateStore(One, PStride);
5939
5940 // Call the "init" function and update the trip count of the loop with the
5941 // value it produced.
5942 uint32_t SrcLocStrSize;
5943 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5944 IdentFlag Flag = OMP_IDENT_FLAG_WORK_LOOP;
5945 if (DistScheduleSchedType != OMPScheduleType::None) {
5946 Flag |= OMP_IDENT_FLAG_WORK_DISTRIBUTE;
5947 }
5948 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize, Flag);
5949 Value *ThreadNum =
5950 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
5951 auto BuildInitCall = [StaticInit, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5952 PUpperBound, PStride, One,
5953 this](Value *SchedulingType, Value *ChunkSize,
5954 auto &Builder) {
5956 StaticInit, {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
5957 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
5958 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
5959 /*pstride=*/PStride, /*incr=*/One,
5960 /*chunk=*/ChunkSize});
5961 };
5962 BuildInitCall(SchedulingType, CastedChunkSize, Builder);
5963 if (DistScheduleSchedType != OMPScheduleType::None &&
5964 SchedType != OMPScheduleType::OrderedDistributeChunked &&
5965 SchedType != OMPScheduleType::OrderedDistribute) {
5966 // We want to emit a second init function call for the dist_schedule clause
5967 // to the Distribute construct. This should only be done however if a
5968 // Workshare Loop is nested within a Distribute Construct
5969 BuildInitCall(DistSchedulingType, CastedDistScheduleChunkSize, Builder);
5970 }
5971
5972 // Load values written by the "init" function.
5973 Value *FirstChunkStart =
5974 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
5975 Value *FirstChunkStop =
5976 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
5977 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
5978 Value *ChunkRange =
5979 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
5980 Value *NextChunkStride =
5981 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
5982
5983 // Create outer "dispatch" loop for enumerating the chunks.
5984 BasicBlock *DispatchEnter = splitBB(Builder, true);
5985 Value *DispatchCounter;
5986
5987 // It is safe to assume this didn't return an error because the callback
5988 // passed into createCanonicalLoop is the only possible error source, and it
5989 // always returns success.
5990 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
5991 {Builder.saveIP(), DL},
5992 [&](InsertPointTy BodyIP, Value *Counter) {
5993 DispatchCounter = Counter;
5994 return Error::success();
5995 },
5996 FirstChunkStart, CastedTripCount, NextChunkStride,
5997 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
5998 "dispatch"));
5999
6000 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
6001 // not have to preserve the canonical invariant.
6002 BasicBlock *DispatchBody = DispatchCLI->getBody();
6003 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
6004 BasicBlock *DispatchExit = DispatchCLI->getExit();
6005 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
6006 DispatchCLI->invalidate();
6007
6008 // Rewire the original loop to become the chunk loop inside the dispatch loop.
6009 redirectTo(DispatchAfter, CLI->getAfter(), DL);
6010 redirectTo(CLI->getExit(), DispatchLatch, DL);
6011 redirectTo(DispatchBody, DispatchEnter, DL);
6012
6013 // Prepare the prolog of the chunk loop.
6014 Builder.restoreIP(CLI->getPreheaderIP());
6015 Builder.SetCurrentDebugLocation(DL);
6016
6017 // Compute the number of iterations of the chunk loop.
6018 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
6019 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
6020 Value *IsLastChunk =
6021 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
6022 Value *CountUntilOrigTripCount =
6023 Builder.CreateSub(CastedTripCount, DispatchCounter);
6024 Value *ChunkTripCount = Builder.CreateSelect(
6025 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
6026 Value *BackcastedChunkTC =
6027 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
6028 CLI->setTripCount(BackcastedChunkTC);
6029
6030 // Update all uses of the induction variable except the one in the condition
6031 // block that compares it with the actual upper bound, and the increment in
6032 // the latch block.
6033 Value *BackcastedDispatchCounter =
6034 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
6035 CLI->mapIndVar([&](Instruction *) -> Value * {
6036 Builder.restoreIP(CLI->getBodyIP());
6037 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
6038 });
6039
6040 // In the "exit" block, call the "fini" function.
6041 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
6042 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
6043
6044 // Add the barrier if requested.
6045 if (NeedsBarrier) {
6046 InsertPointOrErrorTy AfterIP =
6047 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
6048 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
6049 if (!AfterIP)
6050 return AfterIP.takeError();
6051 }
6052
6053#ifndef NDEBUG
6054 // Even though we currently do not support applying additional methods to it,
6055 // the chunk loop should remain a canonical loop.
6056 CLI->assertOK();
6057#endif
6058
6059 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
6060}
6061
6062// Returns an LLVM function to call for executing an OpenMP static worksharing
6063// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
6064// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
6065static FunctionCallee
6067 WorksharingLoopType LoopType) {
6068 unsigned Bitwidth = Ty->getIntegerBitWidth();
6069 Module &M = OMPBuilder->M;
6070 switch (LoopType) {
6071 case WorksharingLoopType::ForStaticLoop:
6072 if (Bitwidth == 32)
6073 return OMPBuilder->getOrCreateRuntimeFunction(
6074 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
6075 if (Bitwidth == 64)
6076 return OMPBuilder->getOrCreateRuntimeFunction(
6077 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
6078 break;
6079 case WorksharingLoopType::DistributeStaticLoop:
6080 if (Bitwidth == 32)
6081 return OMPBuilder->getOrCreateRuntimeFunction(
6082 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
6083 if (Bitwidth == 64)
6084 return OMPBuilder->getOrCreateRuntimeFunction(
6085 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
6086 break;
6087 case WorksharingLoopType::DistributeForStaticLoop:
6088 if (Bitwidth == 32)
6089 return OMPBuilder->getOrCreateRuntimeFunction(
6090 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
6091 if (Bitwidth == 64)
6092 return OMPBuilder->getOrCreateRuntimeFunction(
6093 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
6094 break;
6095 }
6096 if (Bitwidth != 32 && Bitwidth != 64) {
6097 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
6098 }
6099 llvm_unreachable("Unknown type of OpenMP worksharing loop");
6100}
6101
6102// Inserts a call to proper OpenMP Device RTL function which handles
6103// loop worksharing.
6105 WorksharingLoopType LoopType,
6106 BasicBlock *InsertBlock, Value *Ident,
6107 Value *LoopBodyArg, Value *TripCount,
6108 Function &LoopBodyFn, bool NoLoop) {
6109 Type *TripCountTy = TripCount->getType();
6110 Module &M = OMPBuilder->M;
6111 IRBuilder<> &Builder = OMPBuilder->Builder;
6112 FunctionCallee RTLFn =
6113 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
6114 SmallVector<Value *, 8> RealArgs;
6115 RealArgs.push_back(Ident);
6116 RealArgs.push_back(&LoopBodyFn);
6117 RealArgs.push_back(LoopBodyArg);
6118 RealArgs.push_back(TripCount);
6119 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
6120 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
6121 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
6122 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
6123 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
6124 return;
6125 }
6126 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
6127 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
6128 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
6129 Value *NumThreads = OMPBuilder->createRuntimeFunctionCall(RTLNumThreads, {});
6130
6131 RealArgs.push_back(
6132 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
6133 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
6134 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
6135 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
6136 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop));
6137 } else {
6138 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
6139 }
6140
6141 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
6142}
6143
6145 OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
6146 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
6147 WorksharingLoopType LoopType, bool NoLoop) {
6148 IRBuilder<> &Builder = OMPIRBuilder->Builder;
6149 BasicBlock *Preheader = CLI->getPreheader();
6150 Value *TripCount = CLI->getTripCount();
6151
6152 // After loop body outling, the loop body contains only set up
6153 // of loop body argument structure and the call to the outlined
6154 // loop body function. Firstly, we need to move setup of loop body args
6155 // into loop preheader.
6156 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
6157 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
6158
6159 // The next step is to remove the whole loop. We do not it need anymore.
6160 // That's why make an unconditional branch from loop preheader to loop
6161 // exit block
6162 Builder.restoreIP({Preheader, Preheader->end()});
6163 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
6164 Preheader->getTerminator()->eraseFromParent();
6165 Builder.CreateBr(CLI->getExit());
6166
6167 // Delete dead loop blocks
6168 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
6169 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
6170 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
6171 CleanUpInfo.EntryBB = CLI->getHeader();
6172 CleanUpInfo.ExitBB = CLI->getExit();
6173 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
6174 DeleteDeadBlocks(BlocksToBeRemoved);
6175
6176 // Find the instruction which corresponds to loop body argument structure
6177 // and remove the call to loop body function instruction.
6178 Value *LoopBodyArg;
6179 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
6180 assert(OutlinedFnUser &&
6181 "Expected unique undroppable user of outlined function");
6182 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
6183 assert(OutlinedFnCallInstruction && "Expected outlined function call");
6184 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
6185 "Expected outlined function call to be located in loop preheader");
6186 // Check in case no argument structure has been passed.
6187 if (OutlinedFnCallInstruction->arg_size() > 1)
6188 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
6189 else
6190 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
6191 OutlinedFnCallInstruction->eraseFromParent();
6192
6193 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
6194 LoopBodyArg, TripCount, OutlinedFn, NoLoop);
6195
6196 for (auto &ToBeDeletedItem : ToBeDeleted)
6197 ToBeDeletedItem->eraseFromParent();
6198 CLI->invalidate();
6199}
6200
6201OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
6202 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
6203 WorksharingLoopType LoopType, bool NoLoop) {
6204 uint32_t SrcLocStrSize;
6205 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
6207 switch (LoopType) {
6208 case WorksharingLoopType::ForStaticLoop:
6209 Flag = OMP_IDENT_FLAG_WORK_LOOP;
6210 break;
6211 case WorksharingLoopType::DistributeStaticLoop:
6212 Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE;
6213 break;
6214 case WorksharingLoopType::DistributeForStaticLoop:
6215 Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE | OMP_IDENT_FLAG_WORK_LOOP;
6216 break;
6217 }
6218 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize, Flag);
6219
6220 auto OI = std::make_unique<OutlineInfo>();
6221 OI->OuterAllocBB = CLI->getPreheader();
6222 Function *OuterFn = CLI->getPreheader()->getParent();
6223
6224 // Instructions which need to be deleted at the end of code generation
6225 SmallVector<Instruction *, 4> ToBeDeleted;
6226
6227 OI->OuterAllocBB = AllocaIP.getBlock();
6228
6229 // Mark the body loop as region which needs to be extracted
6230 OI->EntryBB = CLI->getBody();
6231 OI->ExitBB = CLI->getLatch()->splitBasicBlockBefore(CLI->getLatch()->begin(),
6232 "omp.prelatch");
6233
6234 // Prepare loop body for extraction
6235 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
6236
6237 // Insert new loop counter variable which will be used only in loop
6238 // body.
6239 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
6240 Instruction *NewLoopCntLoad =
6241 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
6242 // New loop counter instructions are redundant in the loop preheader when
6243 // code generation for workshare loop is finshed. That's why mark them as
6244 // ready for deletion.
6245 ToBeDeleted.push_back(NewLoopCntLoad);
6246 ToBeDeleted.push_back(NewLoopCnt);
6247
6248 // Analyse loop body region. Find all input variables which are used inside
6249 // loop body region.
6250 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
6252 OI->collectBlocks(ParallelRegionBlockSet, Blocks);
6253
6254 CodeExtractorAnalysisCache CEAC(*OuterFn);
6255 CodeExtractor Extractor(Blocks,
6256 /* DominatorTree */ nullptr,
6257 /* AggregateArgs */ true,
6258 /* BlockFrequencyInfo */ nullptr,
6259 /* BranchProbabilityInfo */ nullptr,
6260 /* AssumptionCache */ nullptr,
6261 /* AllowVarArgs */ true,
6262 /* AllowAlloca */ true,
6263 /* AllocationBlock */ CLI->getPreheader(),
6264 /* DeallocationBlocks */ {},
6265 /* Suffix */ ".omp_wsloop",
6266 /* AggrArgsIn0AddrSpace */ true);
6267
6268 BasicBlock *CommonExit = nullptr;
6269 SetVector<Value *> SinkingCands, HoistingCands;
6270
6271 // Find allocas outside the loop body region which are used inside loop
6272 // body
6273 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
6274
6275 // We need to model loop body region as the function f(cnt, loop_arg).
6276 // That's why we replace loop induction variable by the new counter
6277 // which will be one of loop body function argument
6279 CLI->getIndVar()->user_end());
6280 for (auto Use : Users) {
6281 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
6282 if (ParallelRegionBlockSet.count(Inst->getParent())) {
6283 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
6284 }
6285 }
6286 }
6287 // Make sure that loop counter variable is not merged into loop body
6288 // function argument structure and it is passed as separate variable
6289 OI->ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
6290
6291 // PostOutline CB is invoked when loop body function is outlined and
6292 // loop body is replaced by call to outlined function. We need to add
6293 // call to OpenMP device rtl inside loop preheader. OpenMP device rtl
6294 // function will handle loop control logic.
6295 //
6296 OI->PostOutlineCB = [=, ToBeDeletedVec =
6297 std::move(ToBeDeleted)](Function &OutlinedFn) {
6298 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
6299 LoopType, NoLoop);
6300 };
6301 addOutlineInfo(std::move(OI));
6302 return CLI->getAfterIP();
6303}
6304
// Entry point that picks the lowering for a worksharing loop: target-device
// builds are forwarded to applyWorkshareLoopTarget; otherwise the effective
// schedule type selects the static, static-chunked or dynamic lowering.
// NOTE(review): the listing dropped the lines that carry the return type, the
// function name and the leading parameters. The body uses DL, CLI and
// AllocaIP, which are presumably declared there - restore them from the
// original file before compiling.
6307 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
6308 bool HasSimdModifier, bool HasMonotonicModifier,
6309 bool HasNonmonotonicModifier, bool HasOrderedClause,
6310 WorksharingLoopType LoopType, bool NoLoop, bool HasDistSchedule,
6311 Value *DistScheduleChunkSize) {
// Device code generation takes a completely separate path.
6312 if (Config.isTargetDevice())
6313 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
6314 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
6315 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
6316 HasNonmonotonicModifier, HasOrderedClause, DistScheduleChunkSize);
6317
6318 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
6319 OMPScheduleType::ModifierOrdered;
// The dist_schedule schedule type stays None unless HasDistSchedule is set;
// with a chunk size it is the chunked variant.
6320 OMPScheduleType DistScheduleSchedType = OMPScheduleType::None;
6321 if (HasDistSchedule) {
6322 DistScheduleSchedType = DistScheduleChunkSize
6323 ? OMPScheduleType::OrderedDistributeChunked
6324 : OMPScheduleType::OrderedDistribute;
6325 }
// Dispatch on the base schedule type with all modifier bits masked out.
6326 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
6327 case OMPScheduleType::BaseStatic:
6328 case OMPScheduleType::BaseDistribute:
// The assert accepts at most one of the two chunk sizes being set.
6329 assert((!ChunkSize || !DistScheduleChunkSize) &&
6330 "No chunk size with static-chunked schedule");
// An ordered loop without dist_schedule goes through the dynamic lowering.
6331 if (IsOrdered && !HasDistSchedule)
6332 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6333 NeedsBarrier, ChunkSize);
6334 // FIXME: Monotonicity ignored?
// A dist_schedule chunk size forces the chunked lowering even though the
// base schedule is unchunked.
6335 if (DistScheduleChunkSize)
6336 return applyStaticChunkedWorkshareLoop(
6337 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
6338 DistScheduleChunkSize, DistScheduleSchedType);
6339 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier,
6340 HasDistSchedule);
6341
6342 case OMPScheduleType::BaseStaticChunked:
6343 case OMPScheduleType::BaseDistributeChunked:
6344 if (IsOrdered && !HasDistSchedule)
6345 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6346 NeedsBarrier, ChunkSize);
6347 // FIXME: Monotonicity ignored?
6348 return applyStaticChunkedWorkshareLoop(
6349 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
6350 DistScheduleChunkSize, DistScheduleSchedType);
6351
// The first group of schedule types must not carry a chunk size; it then
// shares the dynamic lowering with the second group.
6352 case OMPScheduleType::BaseRuntime:
6353 case OMPScheduleType::BaseAuto:
6354 case OMPScheduleType::BaseGreedy:
6355 case OMPScheduleType::BaseBalanced:
6356 case OMPScheduleType::BaseSteal:
6357 case OMPScheduleType::BaseRuntimeSimd:
6358 assert(!ChunkSize &&
6359 "schedule type does not support user-defined chunk sizes");
6360 [[fallthrough]];
6361 case OMPScheduleType::BaseGuidedSimd:
6362 case OMPScheduleType::BaseDynamicChunked:
6363 case OMPScheduleType::BaseGuidedChunked:
6364 case OMPScheduleType::BaseGuidedIterativeChunked:
6365 case OMPScheduleType::BaseGuidedAnalyticalChunked:
6366 case OMPScheduleType::BaseStaticBalancedChunked:
6367 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6368 NeedsBarrier, ChunkSize);
6369
6370 default:
6371 llvm_unreachable("Unknown/unimplemented schedule kind");
6372 }
6373}
6374
6375/// Returns an LLVM function to call for initializing loop bounds using OpenMP
6376/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
6377/// the runtime. Always interpret integers as unsigned similarly to
6378/// CanonicalLoopInfo.
6379static FunctionCallee
6381 unsigned Bitwidth = Ty->getIntegerBitWidth();
6382 if (Bitwidth == 32)
6383 return OMPBuilder.getOrCreateRuntimeFunction(
6384 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
6385 if (Bitwidth == 64)
6386 return OMPBuilder.getOrCreateRuntimeFunction(
6387 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
6388 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6389}
6390
6391/// Returns an LLVM function to call for updating the next loop using OpenMP
6392/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
6393/// the runtime. Always interpret integers as unsigned similarly to
6394/// CanonicalLoopInfo.
6395static FunctionCallee
6397 unsigned Bitwidth = Ty->getIntegerBitWidth();
6398 if (Bitwidth == 32)
6399 return OMPBuilder.getOrCreateRuntimeFunction(
6400 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
6401 if (Bitwidth == 64)
6402 return OMPBuilder.getOrCreateRuntimeFunction(
6403 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
6404 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6405}
6406
6407/// Returns an LLVM function to call for finalizing the dynamic loop using
6408/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
6409/// interpret integers as unsigned similarly to CanonicalLoopInfo.
6410static FunctionCallee
6412 unsigned Bitwidth = Ty->getIntegerBitWidth();
6413 if (Bitwidth == 32)
6414 return OMPBuilder.getOrCreateRuntimeFunction(
6415 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
6416 if (Bitwidth == 64)
6417 return OMPBuilder.getOrCreateRuntimeFunction(
6418 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
6419 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6420}
6421
// Lowers the canonical loop CLI to a dynamically scheduled worksharing loop:
// an "init" runtime call is emitted in the preheader and a new outer
// condition block repeatedly asks the runtime ("next") for more work, running
// the original loop once per returned range. Returns the insertion point
// after the loop, or the error produced while creating the barrier.
// NOTE(review): the listing dropped the line with the return type in front of
// the qualified function name; the body returns both an error and AfterIP.
6423OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
6424 InsertPointTy AllocaIP,
6425 OMPScheduleType SchedType,
6426 bool NeedsBarrier, Value *Chunk) {
6427 assert(CLI->isValid() && "Requires a valid canonical loop");
6428 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
6429 "Require dedicated allocate IP");
// NOTE(review): the listing dropped the opening line of the next assert (the
// condition that validates SchedType); only its message survived.
6431 "Require valid schedule type");
6432
6433 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
6434 OMPScheduleType::ModifierOrdered;
6435
6436 // Set up the source location value for OpenMP runtime.
6437 Builder.SetCurrentDebugLocation(DL);
6438
6439 uint32_t SrcLocStrSize;
6440 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
6441 Value *SrcLoc =
6442 getOrCreateIdent(SrcLocStr, SrcLocStrSize, OMP_IDENT_FLAG_WORK_LOOP);
6443
6444 // Declare useful OpenMP runtime functions.
6445 Value *IV = CLI->getIndVar();
6446 Type *IVTy = IV->getType();
6447 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
6448 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
6449
6450 // Allocate space for computed loop bounds as expected by the "init" function.
6451 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
6452 Type *I32Type = Type::getInt32Ty(M.getContext());
6453 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
6454 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
6455 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
6456 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
6457 CLI->setLastIter(PLastIter);
6458
6459 // At the end of the preheader, prepare for calling the "init" function by
6460 // storing the current loop bounds into the allocated space. A canonical loop
6461 // always iterates from 0 to trip-count with step 1. Note that "init" expects
6462 // and produces an inclusive upper bound.
// The bounds handed to the runtime are shifted by one: lower bound 1 and
// inclusive upper bound equal to the trip count. The shift is undone when
// the lower bound is loaded in the outer condition block below.
6463 BasicBlock *PreHeader = CLI->getPreheader();
6464 Builder.SetInsertPoint(PreHeader->getTerminator());
6465 Constant *One = ConstantInt::get(IVTy, 1);
6466 Builder.CreateStore(One, PLowerBound);
6467 Value *UpperBound = CLI->getTripCount();
6468 Builder.CreateStore(UpperBound, PUpperBound);
6469 Builder.CreateStore(One, PStride);
6470
// Capture the loop's blocks and the after-IP now, before the loop structure
// is modified.
6471 BasicBlock *Header = CLI->getHeader();
6472 BasicBlock *Exit = CLI->getExit();
6473 BasicBlock *Cond = CLI->getCond();
6474 BasicBlock *Latch = CLI->getLatch();
6475 InsertPointTy AfterIP = CLI->getAfterIP();
6476
6477 // The CLI will be "broken" in the code below, as the loop is no longer
6478 // a valid canonical loop.
6479
// Without a user-provided chunk, a chunk size of one is passed to "init".
6480 if (!Chunk)
6481 Chunk = One;
6482
6483 Value *ThreadNum =
6484 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
6485
6486 Constant *SchedulingType =
6487 ConstantInt::get(I32Type, static_cast<int>(SchedType));
6488
6489 // Call the "init" function.
6490 createRuntimeFunctionCall(DynamicInit, {SrcLoc, ThreadNum, SchedulingType,
6491 /* LowerBound */ One, UpperBound,
6492 /* step */ One, Chunk});
6493
6494 // An outer loop around the existing one.
6495 BasicBlock *OuterCond = BasicBlock::Create(
6496 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
6497 PreHeader->getParent());
6498 // This needs to be 32-bit always, so can't use the IVTy Zero above.
6499 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
// NOTE(review): the listing dropped the line that opens the "next" runtime
// call and binds its result; the result is used as Res below.
6501 DynamicNext,
6502 {SrcLoc, ThreadNum, PLastIter, PLowerBound, PUpperBound, PStride});
6503 Constant *Zero32 = ConstantInt::get(I32Type, 0);
// A non-zero result of the "next" call means another range is available.
6504 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
// Subtract one to convert the lower bound written by the runtime back to the
// zero-based induction variable.
6505 Value *LowerBound =
6506 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
6507 Builder.CreateCondBr(MoreWork, Header, Exit);
6508
6509 // Change PHI-node in loop header to use outer cond rather than preheader,
6510 // and set IV to the LowerBound.
6511 Instruction *Phi = &Header->front();
6512 auto *PI = cast<PHINode>(Phi);
6513 PI->setIncomingBlock(0, OuterCond);
6514 PI->setIncomingValue(0, LowerBound);
6515
6516 // Then set the pre-header to jump to the OuterCond
6517 Instruction *Term = PreHeader->getTerminator();
6518 auto *Br = cast<UncondBrInst>(Term);
6519 Br->setSuccessor(OuterCond);
6520
6521 // Modify the inner condition:
6522 // * Use the UpperBound returned from the DynamicNext call.
6523 // * jump to the outer loop when done with one of the inner loops.
6524 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
6525 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
// The load was inserted in front of the instruction at the insertion point,
// which is expected to be the loop's compare instruction.
6526 Instruction *Comp = &*Builder.GetInsertPoint();
6527 auto *CI = cast<CmpInst>(Comp);
6528 CI->setOperand(1, UpperBound);
6529 // Redirect the inner exit to branch to outer condition.
6530 Instruction *Branch = &Cond->back();
6531 auto *BI = cast<CondBrInst>(Branch);
6532 assert(BI->getSuccessor(1) == Exit);
6533 BI->setSuccessor(1, OuterCond);
6534
6535 // Call the "fini" function if "ordered" is present in wsloop directive.
6536 if (Ordered) {
// The call is inserted before the last instruction of the latch block.
6537 Builder.SetInsertPoint(&Latch->back());
6538 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
6539 createRuntimeFunctionCall(DynamicFini, {SrcLoc, ThreadNum});
6540 }
6541
6542 // Add the barrier if requested.
6543 if (NeedsBarrier) {
6544 Builder.SetInsertPoint(&Exit->back());
6545 InsertPointOrErrorTy BarrierIP =
// NOTE(review): the listing dropped the line that opens the barrier-creating
// call and passes its first argument.
6547 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
6548 /* CheckCancelFlag */ false);
6549 if (!BarrierIP)
6550 return BarrierIP.takeError();
6551 }
6552
6553 CLI->invalidate();
6554 return AfterIP;
6555}
6556
6557/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
6558/// after this \p OldTarget will be orphaned.
///
/// \p DL is passed through unchanged to every redirectTo call.
6560 BasicBlock *NewTarget, DebugLoc DL) {
 // make_early_inc_range advances the iterator before the loop body runs.
 // Presumably redirectTo rewrites Pred's terminator, which would otherwise
 // disturb the walk over OldTarget's predecessors.
6561 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
6562 redirectTo(Pred, NewTarget, DL);
6563}
6564
// Deletes those blocks of BBs that are not referenced by an instruction
// outside of BBs, neither directly nor through another block of BBs that has
// to be kept.
6566 SmallPtrSet<BasicBlock *, 8> InternalBBs(from_range, BBs);
6567 // We add a block to BBsToKeep iff we have proven it has an external use.
6569
 // Iterate to a fixed point: keeping one block can force keeping the blocks it
 // references, so the scan is repeated until a full pass adds nothing.
6570 while (true) {
6571 bool Changed = false;
6572
6573 for (BasicBlock *BB : BBs) {
 // Already proven to be needed.
6574 if (BBsToKeep.contains(BB))
6575 continue;
6576
6577 for (Use &U : BB->uses()) {
 // Only users that are instructions are considered; any other kind of user
 // is skipped.
6578 auto *UseInst = dyn_cast<Instruction>(U.getUser());
6579 if (!UseInst)
6580 continue;
6581 BasicBlock *UseBB = UseInst->getParent();
 // A use from a block outside of BBs, or from a block of BBs that is itself
 // kept, means that BB has to be kept as well.
6582 if (!InternalBBs.contains(UseBB) || BBsToKeep.contains(UseBB)) {
6583 BBsToKeep.insert(BB);
6584 Changed = true;
6585 break;
6586 }
6587 }
6588 }
6589
6590 if (!Changed)
6591 break;
6592 }
6593
 // Everything that was not proven to be needed is deleted.
6595 BBs, [&BBsToKeep](BasicBlock *BB) { return !BBsToKeep.contains(BB); });
6596 DeleteDeadBlocks(BBsToDelete);
6597}
6598
// Collapses the loop nest \p Loops (outermost first) into a single canonical
// loop and returns it. With a single input loop, that loop is returned
// unchanged. Otherwise the new loop's trip count is the product of the input
// trip counts, the input induction variables are recomputed from the new
// induction variable, and all input loops are invalidated.
6599CanonicalLoopInfo *
6601 InsertPointTy ComputeIP) {
6602 assert(Loops.size() >= 1 && "At least one loop required");
6603 size_t NumLoops = Loops.size();
6604
6605 // Nothing to do if there is already just one loop.
6606 if (NumLoops == 1)
6607 return Loops.front();
6608
6609 CanonicalLoopInfo *Outermost = Loops.front();
6610 CanonicalLoopInfo *Innermost = Loops.back();
6611 BasicBlock *OrigPreheader = Outermost->getPreheader();
6612 BasicBlock *OrigAfter = Outermost->getAfter();
6613 Function *F = OrigPreheader->getParent();
6614
6615 // Loop control blocks that may become orphaned later.
6616 SmallVector<BasicBlock *, 12> OldControlBBs;
6617 OldControlBBs.reserve(6 * Loops.size());
6619 Loop->collectControlBlocks(OldControlBBs);
6620
6621 // Setup the IRBuilder for inserting the trip count computation.
 // The computation goes to ComputeIP if the caller provided one, otherwise
 // into the outermost loop's preheader.
6622 Builder.SetCurrentDebugLocation(DL);
6623 if (ComputeIP.isSet())
6624 Builder.restoreIP(ComputeIP);
6625 else
6626 Builder.restoreIP(Outermost->getPreheaderIP());
6627
6628 // Derive the collapsed loop's trip count.
6629 // TODO: Find common/largest indvar type.
6630 Value *CollapsedTripCount = nullptr;
6631 for (CanonicalLoopInfo *L : Loops) {
6632 assert(L->isValid() &&
6633 "All loops to collapse must be valid canonical loops");
6634 Value *OrigTripCount = L->getTripCount();
 // The first loop seeds the product.
6635 if (!CollapsedTripCount) {
6636 CollapsedTripCount = OrigTripCount;
6637 continue;
6638 }
6639
6640 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
 // The multiplication is emitted with the no-unsigned-wrap flag.
6641 CollapsedTripCount =
6642 Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
6643 }
6644
6645 // Create the collapsed loop control flow.
6646 CanonicalLoopInfo *Result =
6647 createLoopSkeleton(DL, CollapsedTripCount, F,
6648 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
6649
6650 // Build the collapsed loop body code.
6651 // Start with deriving the input loop induction variables from the collapsed
6652 // one, using a divmod scheme. To preserve the original loops' order, the
6653 // innermost loop uses the least significant bits.
6654 Builder.restoreIP(Result->getBodyIP());
6655
6656 Value *Leftover = Result->getIndVar();
6657 SmallVector<Value *> NewIndVars;
6658 NewIndVars.resize(NumLoops);
 // Walk from the innermost loop outwards: the remainder is this loop's
 // induction variable, the quotient is carried to the next outer loop.
6659 for (int i = NumLoops - 1; i >= 1; --i) {
6660 Value *OrigTripCount = Loops[i]->getTripCount();
6661
6662 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
6663 NewIndVars[i] = NewIndVar;
6664
6665 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
6666 }
6667 // Outermost loop gets all the remaining bits.
6668 NewIndVars[0] = Leftover;
6669
6670 // Construct the loop body control flow.
6671 // We progressively construct the branch structure following in direction of
6672 // the control flow, from the leading in-between code, the loop nest body, the
6673 // trailing in-between code, and rejoining the collapsed loop's latch.
6674 // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
6675 // the ContinueBlock is set, continue with that block. If ContinuePred, use
6676 // its predecessors as sources.
6677 BasicBlock *ContinueBlock = Result->getBody();
6678 BasicBlock *ContinuePred = nullptr;
6679 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
6680 BasicBlock *NextSrc) {
6681 if (ContinueBlock)
6682 redirectTo(ContinueBlock, Dest, DL);
6683 else
6684 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
6685
 // ContinueBlock is only used for the very first edge; every later edge
 // starts from the predecessors of NextSrc.
6686 ContinueBlock = nullptr;
6687 ContinuePred = NextSrc;
6688 };
6689
6690 // The code before the nested loop of each level.
6691 // Because we are sinking it into the nest, it will be executed more often
6692 // than the original loop. More sophisticated schemes could keep track of what
6693 // the in-between code is and instantiate it only once per thread.
6694 for (size_t i = 0; i < NumLoops - 1; ++i)
6695 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
6696
6697 // Connect the loop nest body.
6698 ContinueWith(Innermost->getBody(), Innermost->getLatch());
6699
6700 // The code after the nested loop at each level.
6701 for (size_t i = NumLoops - 1; i > 0; --i)
6702 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
6703
6704 // Connect the finished loop to the collapsed loop latch.
6705 ContinueWith(Result->getLatch(), nullptr);
6706
6707 // Replace the input loops with the new collapsed loop.
6708 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
6709 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
6710
6711 // Replace the input loop indvars with the derived ones.
6712 for (size_t i = 0; i < NumLoops; ++i)
6713 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
6714
6715 // Remove unused parts of the input loops.
6716 removeUnusedBlocksFromParent(OldControlBBs);
6717
6718 for (CanonicalLoopInfo *L : Loops)
6719 L->invalidate();
6720
6721#ifndef NDEBUG
6722 Result->assertOK();
6723#endif
6724 return Result;
6725}
6726
// Tiles the loop nest \p Loops (outermost first) using one tile size per loop.
// The returned vector holds twice as many loops as were passed in: first the
// floor loops, then the tile loops, each group ordered from outermost to
// innermost. All input loops are invalidated.
6727std::vector<CanonicalLoopInfo *>
6729 ArrayRef<Value *> TileSizes) {
6730 assert(TileSizes.size() == Loops.size() &&
6731 "Must pass as many tile sizes as there are loops");
6732 int NumLoops = Loops.size();
6733 assert(NumLoops >= 1 && "At least one loop to tile required");
6734
6735 CanonicalLoopInfo *OutermostLoop = Loops.front();
6736 CanonicalLoopInfo *InnermostLoop = Loops.back();
6737 Function *F = OutermostLoop->getBody()->getParent();
6738 BasicBlock *InnerEnter = InnermostLoop->getBody();
6739 BasicBlock *InnerLatch = InnermostLoop->getLatch();
6740
6741 // Loop control blocks that may become orphaned later.
6742 SmallVector<BasicBlock *, 12> OldControlBBs;
6743 OldControlBBs.reserve(6 * Loops.size());
6745 Loop->collectControlBlocks(OldControlBBs);
6746
6747 // Collect original trip counts and induction variable to be accessible by
6748 // index. Also, the structure of the original loops is not preserved during
6749 // the construction of the tiled loops, so do it before we scavenge the BBs of
6750 // any original CanonicalLoopInfo.
6751 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
6752 for (CanonicalLoopInfo *L : Loops) {
6753 assert(L->isValid() && "All input loops must be valid canonical loops");
6754 OrigTripCounts.push_back(L->getTripCount());
6755 OrigIndVars.push_back(L->getIndVar());
6756 }
6757
6758 // Collect the code between loop headers. These may contain SSA definitions
6759 // that are used in the loop nest body. To be usable within the innermost
6760 // body, these BasicBlocks will be sunk into the loop nest body. That is,
6761 // these instructions may be executed more often than before the tiling.
6762 // TODO: It would be sufficient to only sink them into body of the
6763 // corresponding tile loop.
6765 for (int i = 0; i < NumLoops - 1; ++i) {
6766 CanonicalLoopInfo *Surrounding = Loops[i];
6767 CanonicalLoopInfo *Nested = Loops[i + 1];
6768
 // Each entry is the pair (body of the surrounding loop, header of the
 // nested loop).
6769 BasicBlock *EnterBB = Surrounding->getBody();
6770 BasicBlock *ExitBB = Nested->getHeader();
6771 InbetweenCode.emplace_back(EnterBB, ExitBB);
6772 }
6773
6774 // Compute the trip counts of the floor loops.
6775 Builder.SetCurrentDebugLocation(DL);
6776 Builder.restoreIP(OutermostLoop->getPreheaderIP());
6777 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
6778 for (int i = 0; i < NumLoops; ++i) {
6779 Value *TileSize = TileSizes[i];
6780 Value *OrigTripCount = OrigTripCounts[i];
6781 Type *IVType = OrigTripCount->getType();
6782
6783 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
6784 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
6785
6786 // 0 if the tilesize divides the tripcount, 1 otherwise.
6787 // 1 means we need an additional iteration for a partial tile.
6788 //
6789 // Unfortunately we cannot just use the roundup-formula
6790 // (tripcount + tilesize - 1)/tilesize
6791 // because the summation might overflow. We do not want to introduce
6792 // undefined behavior when the untiled loop nest did not.
6793 Value *FloorTripOverflow =
6794 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
6795
6796 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
6797 Value *FloorTripCount =
6798 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
6799 "omp_floor" + Twine(i) + ".tripcount", true);
6800
6801 // Remember some values for later use.
6802 FloorCompleteCount.push_back(FloorCompleteTripCount);
6803 FloorCount.push_back(FloorTripCount);
6804 FloorRems.push_back(FloorTripRem);
6805 }
6806
6807 // Generate the new loop nest, from the outermost to the innermost.
6808 std::vector<CanonicalLoopInfo *> Result;
6809 Result.reserve(NumLoops * 2);
6810
6811 // The basic block of the surrounding loop that enters the generated loop
6812 // nest.
6813 BasicBlock *Enter = OutermostLoop->getPreheader();
6814
6815 // The basic block of the surrounding loop where the inner code should
6816 // continue.
6817 BasicBlock *Continue = OutermostLoop->getAfter();
6818
6819 // Where the next loop basic block should be inserted.
6820 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
6821
 // Creates one new loop with the given trip count and nests it between the
 // current Enter and Continue blocks. Enter, Continue and OutroInsertBefore
 // are captured by reference and updated for the next, more deeply nested
 // loop.
6822 auto EmbeddNewLoop =
6823 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
6824 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
6825 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
6826 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
6827 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
6828 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
6829
6830 // Setup the position where the next embedded loop connects to this loop.
6831 Enter = EmbeddedLoop->getBody();
6832 Continue = EmbeddedLoop->getLatch();
6833 OutroInsertBefore = EmbeddedLoop->getLatch();
6834 return EmbeddedLoop;
6835 };
6836
 // Embeds one loop per trip count, in order, naming each one NameBase followed
 // by its index, and appends them to Result.
6837 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
6838 const Twine &NameBase) {
6839 for (auto P : enumerate(TripCounts)) {
6840 CanonicalLoopInfo *EmbeddedLoop =
6841 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
6842 Result.push_back(EmbeddedLoop);
6843 }
6844 };
6845
6846 EmbeddNewLoops(FloorCount, "floor");
6847
6848 // Within the innermost floor loop, emit the code that computes the tile
6849 // sizes.
6850 Builder.SetInsertPoint(Enter->getTerminator());
6851 SmallVector<Value *, 4> TileCounts;
6852 for (int i = 0; i < NumLoops; ++i) {
6853 CanonicalLoopInfo *FloorLoop = Result[i];
6854 Value *TileSize = TileSizes[i];
6855
 // When the floor induction variable equals the number of complete tiles,
 // this iteration handles the partial tile and runs only the remainder.
6856 Value *FloorIsEpilogue =
6857 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
6858 Value *TileTripCount =
6859 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
6860
6861 TileCounts.push_back(TileTripCount);
6862 }
6863
6864 // Create the tile loops.
6865 EmbeddNewLoops(TileCounts, "tile");
6866
6867 // Insert the inbetween code into the body.
 // BodyEnter is only set for the first edge; afterwards the predecessors of
 // BodyEntered are the edge sources.
6868 BasicBlock *BodyEnter = Enter;
6869 BasicBlock *BodyEntered = nullptr;
6870 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
6871 BasicBlock *EnterBB = P.first;
6872 BasicBlock *ExitBB = P.second;
6873
6874 if (BodyEnter)
6875 redirectTo(BodyEnter, EnterBB, DL);
6876 else
6877 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
6878
6879 BodyEnter = nullptr;
6880 BodyEntered = ExitBB;
6881 }
6882
6883 // Append the original loop nest body into the generated loop nest body.
6884 if (BodyEnter)
6885 redirectTo(BodyEnter, InnerEnter, DL);
6886 else
6887 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
6889
6890 // Replace the original induction variable with an induction variable computed
6891 // from the tile and floor induction variables.
6892 Builder.restoreIP(Result.back()->getBodyIP());
6893 for (int i = 0; i < NumLoops; ++i) {
6894 CanonicalLoopInfo *FloorLoop = Result[i];
6895 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
6896 Value *OrigIndVar = OrigIndVars[i];
6897 Value *Size = TileSizes[i];
6898
 // Original IV = tile size * floor IV + tile IV, both operations flagged as
 // not wrapping unsigned.
6899 Value *Scale =
6900 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
6901 Value *Shift =
6902 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
6903 OrigIndVar->replaceAllUsesWith(Shift);
6904 }
6905
6906 // Remove unused parts of the original loops.
6907 removeUnusedBlocksFromParent(OldControlBBs);
6908
6909 for (CanonicalLoopInfo *L : Loops)
6910 L->invalidate();
6911
6912#ifndef NDEBUG
6913 for (CanonicalLoopInfo *GenL : Result)
6914 GenL->assertOK();
6915#endif
6916 return Result;
6917}
6918
6919/// Attach metadata \p Properties to the basic block described by \p BB. If the
6920/// basic block already has metadata, the basic block properties are appended.
///
/// The metadata is stored as the MD_loop attachment of the terminator of
/// \p BB, so \p BB must already have a terminator.
6922 ArrayRef<Metadata *> Properties) {
6923 // Nothing to do if no property to attach.
6924 if (Properties.empty())
6925 return;
6926
6927 LLVMContext &Ctx = BB->getContext();
6928 SmallVector<Metadata *> NewProperties;
 // Placeholder for operand 0, which is replaced by a self-reference below.
6929 NewProperties.push_back(nullptr);
6930
6931 // If the basic block already has metadata, prepend it to the new metadata.
 // Operand 0 of the existing node is dropped; the new node gets its own
 // operand 0.
6932 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
6933 if (Existing)
6934 append_range(NewProperties, drop_begin(Existing->operands(), 1));
6935
6936 append_range(NewProperties, Properties);
 // Create a distinct node and make its first operand refer to itself.
6937 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
6938 BasicBlockID->replaceOperandWith(0, BasicBlockID);
6939
6940 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
6941}
6942
6943/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
6944/// loop already has metadata, the loop properties are appended.
///
/// \p Loop must be a valid CanonicalLoopInfo; the work is delegated to
/// addBasicBlockMetadata on the loop's latch block.
6946 ArrayRef<Metadata *> Properties) {
6947 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
6948
6949 // Attach metadata to the loop's latch
6950 BasicBlock *Latch = Loop->getLatch();
6951 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
6952 addBasicBlockMetadata(Latch, Properties);
6953}
6954
6955/// Attach llvm.access.group metadata to the memref instructions of \p Block
///
/// Every instruction of \p Block that may read or write memory gets
/// \p AccessGroup as its MD_access_group attachment, replacing any access
/// group it had before.
/// NOTE(review): \p LI is not used in the visible body.
6957 LoopInfo &LI) {
6958 for (Instruction &I : *Block) {
6959 if (I.mayReadOrWriteMemory()) {
6960 // TODO: This instruction may already have access group from
6961 // other pragmas e.g. #pragma clang loop vectorize. Append
6962 // so that the existing metadata is not overwritten.
6963 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
6964 }
6965 }
6966}
6967
// Fuses the loops in \p Loops into one canonical loop and returns it. The
// fused loop iterates up to the largest of the input trip counts; the body of
// each input loop is guarded by a comparison of the fused induction variable
// against that loop's own trip count. All input loops are invalidated.
6968CanonicalLoopInfo *
 // NOTE(review): front()/back() here and the "Loops.size() - 1" bounds below
 // require a non-empty Loops; no check for that is visible in this function.
6970 CanonicalLoopInfo *firstLoop = Loops.front();
6971 CanonicalLoopInfo *lastLoop = Loops.back();
6972 Function *F = firstLoop->getPreheader()->getParent();
6973
6974 // Loop control blocks that will become orphaned later
6975 SmallVector<BasicBlock *> oldControlBBs;
6977 Loop->collectControlBlocks(oldControlBBs);
6978
6979 // Collect original trip counts
6980 SmallVector<Value *> origTripCounts;
6981 for (CanonicalLoopInfo *L : Loops) {
6982 assert(L->isValid() && "All input loops must be valid canonical loops");
6983 origTripCounts.push_back(L->getTripCount());
6984 }
6985
6986 Builder.SetCurrentDebugLocation(DL);
6987
6988 // Compute max trip count.
6989 // The fused loop will be from 0 to max(origTripCounts)
6990 BasicBlock *TCBlock = BasicBlock::Create(F->getContext(), "omp.fuse.comp.tc",
6991 F, firstLoop->getHeader());
6992 Builder.SetInsertPoint(TCBlock);
6993 Value *fusedTripCount = nullptr;
6994 for (CanonicalLoopInfo *L : Loops) {
6995 assert(L->isValid() && "All loops to fuse must be valid canonical loops");
6996 Value *origTripCount = L->getTripCount();
 // The first loop seeds the running maximum.
6997 if (!fusedTripCount) {
6998 fusedTripCount = origTripCount;
6999 continue;
7000 }
 // NOTE(review): this is a signed comparison; confirm that trip counts can
 // never exceed the signed range of their type.
7001 Value *condTP = Builder.CreateICmpSGT(fusedTripCount, origTripCount);
7002 fusedTripCount = Builder.CreateSelect(condTP, fusedTripCount, origTripCount,
7003 ".omp.fuse.tc");
7004 }
7005
7006 // Generate new loop
7007 CanonicalLoopInfo *fused =
7008 createLoopSkeleton(DL, fusedTripCount, F, firstLoop->getBody(),
7009 lastLoop->getLatch(), "fused");
7010
7011 // Replace original loops with the fused loop
7012 // Preheader and After are not considered inside the CLI.
7013 // These are used to compute the individual TCs of the loops
7014 // so they have to be put before the resulting fused loop.
7015 // Moving them up for readability.
7016 for (size_t i = 0; i < Loops.size() - 1; ++i) {
7017 Loops[i]->getPreheader()->moveBefore(TCBlock);
7018 Loops[i]->getAfter()->moveBefore(TCBlock);
7019 }
7020 lastLoop->getPreheader()->moveBefore(TCBlock);
7021
 // Chain the blocks: preheader(i) -> after(i) -> preheader(i + 1) -> ... ->
 // preheader(last) -> TCBlock -> fused loop -> after(last).
7022 for (size_t i = 0; i < Loops.size() - 1; ++i) {
7023 redirectTo(Loops[i]->getPreheader(), Loops[i]->getAfter(), DL);
7024 redirectTo(Loops[i]->getAfter(), Loops[i + 1]->getPreheader(), DL);
7025 }
7026 redirectTo(lastLoop->getPreheader(), TCBlock, DL);
7027 redirectTo(TCBlock, fused->getPreheader(), DL);
7028 redirectTo(fused->getAfter(), lastLoop->getAfter(), DL);
7029
7030 // Build the fused body
7031 // Create new Blocks with conditions that jump to the original loop bodies
7033 SmallVector<Value *> condValues;
7034 for (size_t i = 0; i < Loops.size(); ++i) {
7035 BasicBlock *condBlock = BasicBlock::Create(
7036 F->getContext(), "omp.fused.inner.cond", F, Loops[i]->getBody());
7037 Builder.SetInsertPoint(condBlock);
 // Loop i only executes while the fused induction variable is below its
 // original trip count. NOTE(review): signed comparison, see above.
7038 Value *condValue =
7039 Builder.CreateICmpSLT(fused->getIndVar(), origTripCounts[i]);
7040 condBBs.push_back(condBlock);
7041 condValues.push_back(condValue);
7042 }
7043 // Join the condition blocks with the bodies of the original loops
7044 redirectTo(fused->getBody(), condBBs[0], DL);
7045 for (size_t i = 0; i < Loops.size() - 1; ++i) {
7046 Builder.SetInsertPoint(condBBs[i]);
 // Either run body i or skip straight to the next condition block; the end
 // of body i (whatever branched to its latch) also continues there.
7047 Builder.CreateCondBr(condValues[i], Loops[i]->getBody(), condBBs[i + 1]);
7048 redirectAllPredecessorsTo(Loops[i]->getLatch(), condBBs[i + 1], DL);
7049 // Replace the IV with the fused IV
7050 Loops[i]->getIndVar()->replaceAllUsesWith(fused->getIndVar());
7051 }
7052 // Last body jumps to the created end body block
7053 Builder.SetInsertPoint(condBBs.back());
7054 Builder.CreateCondBr(condValues.back(), lastLoop->getBody(),
7055 fused->getLatch());
7056 redirectAllPredecessorsTo(lastLoop->getLatch(), fused->getLatch(), DL);
7057 // Replace the IV with the fused IV
7058 lastLoop->getIndVar()->replaceAllUsesWith(fused->getIndVar());
7059
7060 // The loop latch must have only one predecessor. Currently it is branched to
7061 // from both the last condition block and the last loop body
7062 fused->getLatch()->splitBasicBlockBefore(fused->getLatch()->begin(),
7063 "omp.fused.pre_latch");
7064
7065 // Remove unused parts
7066 removeUnusedBlocksFromParent(oldControlBBs);
7067
7068 // Invalidate old CLIs
7069 for (CanonicalLoopInfo *L : Loops)
7070 L->invalidate();
7071
7072#ifndef NDEBUG
7073 fused->assertOK();
7074#endif
7075 return fused;
7076}
7077
// Builds the two property nodes "llvm.loop.unroll.enable" and
// "llvm.loop.unroll.full" for \p Loop; presumably they are attached as loop
// metadata (the callee is not visible here).
7079 LLVMContext &Ctx = Builder.getContext();
7081 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
7082 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
7083}
7084
// Builds the single property node "llvm.loop.unroll.enable" for \p Loop,
// without requesting a specific unroll factor; presumably it is attached as
// loop metadata (the callee is not visible here).
7086 LLVMContext &Ctx = Builder.getContext();
7088 Loop, {
7089 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
7090 });
7091}
7092
// Versions the body of \p CanonicalLoop on \p IfCond: inside the loop, a
// conditional branch selects between the existing body (the "then" side) and
// a clone of it (the "else" side). \p VMap receives the mapping from original
// to cloned blocks, \p L is the llvm::Loop of \p CanonicalLoop, and
// \p NamePrefix is used for the names of the new blocks.
7093void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
7094 Value *IfCond, ValueToValueMapTy &VMap,
7095 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
7096 const Twine &NamePrefix) {
7097 Function *F = CanonicalLoop->getFunction();
7098
7099 // We can't do
7100 // if (cond) {
7101 // simd_loop;
7102 // } else {
7103 // non_simd_loop;
7104 // }
7105 // because then the CanonicalLoopInfo would only point to one of the loops:
7106 // leading to other constructs operating on the same loop to malfunction.
7107 // Instead generate
7108 // while (...) {
7109 // if (cond) {
7110 // simd_body;
7111 // } else {
7112 // not_simd_body;
7113 // }
7114 // }
7115 // At least for simple loops, LLVM seems able to hoist the if out of the loop
7116 // body at -O3
7117
7118 // Define where if branch should be inserted
7119 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
7120
7121 // Create additional blocks for the if statement
7122 BasicBlock *Cond = SplitBeforeIt->getParent();
7123 llvm::LLVMContext &C = Cond->getContext();
7125 C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
7127 C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
7128
7129 // Create if condition branch.
7130 Builder.SetInsertPoint(SplitBeforeIt);
7131 Instruction *BrInstr =
7132 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
 // Everything that followed the new branch in Cond is moved into ThenBlock.
7133 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
7134 // Then block contains branch to omp loop body which needs to be vectorized
7135 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
 // PHIs in the successors must now name ThenBlock as the incoming block.
7136 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
7137
7138 Builder.SetInsertPoint(ElseBlock);
7139
7140 // Clone loop for the else branch
7142
7143 SmallVector<BasicBlock *, 8> ExistingBlocks;
7144 ExistingBlocks.reserve(L->getNumBlocks() + 1);
7145 ExistingBlocks.push_back(ThenBlock);
7146 ExistingBlocks.append(L->block_begin(), L->block_end());
7147 // Cond is the block that has the if clause condition
7148 // LoopCond is omp_loop.cond
7149 // LoopHeader is omp_loop.header
 // NOTE(review): LoopCond is dereferenced on the next-but-one line, before
 // the assert below checks it for null.
7150 BasicBlock *LoopCond = Cond->getUniquePredecessor();
7151 BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
7152 assert(LoopCond && LoopHeader && "Invalid loop structure");
7153 for (BasicBlock *Block : ExistingBlocks) {
 // The loop control blocks and the block holding the if branch stay shared
 // between both versions and are therefore not cloned.
7154 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
7155 Block == LoopHeader || Block == LoopCond || Block == Cond) {
7156 continue;
7157 }
7158 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
7159
7160 // fix name not to be omp.if.then
7161 if (Block == ThenBlock)
7162 NewBB->setName(NamePrefix + ".if.else");
7163
7164 NewBB->moveBefore(CanonicalLoop->getExit());
7165 VMap[Block] = NewBB;
7166 NewBlocks.push_back(NewBB);
7167 }
 // Rewrite the clones to refer to cloned values and blocks, then enter the
 // clone of ThenBlock (the first cloned block) from ElseBlock.
7168 remapInstructionsInBlocks(NewBlocks, VMap);
7169 Builder.CreateBr(NewBlocks.front());
7170
7171 // The loop latch must have only one predecessor. Currently it is branched to
7172 // from both the 'then' and 'else' branches.
7173 L->getLoopLatch()->splitBasicBlockBefore(L->getLoopLatch()->begin(),
7174 NamePrefix + ".pre_latch");
7175
7176 // Ensure that the then block is added to the loop so we add the attributes in
7177 // the next step
7178 L->addBasicBlockToLoop(ThenBlock, LI);
7179}
7180
// Returns a default value that depends on the target: for x86 it is 512 with
// the "avx512f" feature, 256 with "avx" and 128 otherwise; PPC and Wasm get
// 128; every other target gets 0.
7181unsigned
7183 const StringMap<bool> &Features) {
7184 if (TargetTriple.isX86()) {
 // Check the wider vector extension first.
7185 if (Features.lookup("avx512f"))
7186 return 512;
7187 else if (Features.lookup("avx"))
7188 return 256;
7189 return 128;
7190 }
7191 if (TargetTriple.isPPC())
7192 return 128;
7193 if (TargetTriple.isWasm())
7194 return 128;
 // No default is known for any other target.
7195 return 0;
7196}
7197
// Annotates \p CanonicalLoop for vectorization: emits alignment assumptions
// for \p AlignedVars, optionally versions the loop body on \p IfCond, and
// attaches loop metadata derived from \p Order, \p Simdlen and \p Safelen.
7199 MapVector<Value *, Value *> AlignedVars,
7200 Value *IfCond, OrderKind Order,
7201 ConstantInt *Simdlen, ConstantInt *Safelen) {
7202 LLVMContext &Ctx = Builder.getContext();
7203
7204 Function *F = CanonicalLoop->getFunction();
7205
7206 // Blocks must have terminators.
7207 // FIXME: Don't run analyses on incomplete/invalid IR.
 // Temporary unreachable terminators are added here and erased again once
 // the loop analysis has run.
7209 for (BasicBlock &BB : *F)
7210 if (!BB.hasTerminator())
7211 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
7212
7213 // TODO: We should not rely on pass manager. Currently we use pass manager
7214 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
7215 // object. We should have a method which returns all blocks between
7216 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
7218 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
7219 FAM.registerPass([]() { return LoopAnalysis(); });
7220 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
7221
7222 LoopAnalysis LIA;
7223 LoopInfo &&LI = LIA.run(*F, FAM);
7224
7225 for (Instruction *I : UIs)
7226 I->eraseFromParent();
7227
7228 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
7229 if (AlignedVars.size()) {
 // The builder's insertion point is saved and restored around the emission
 // of the alignment assumptions.
7230 InsertPointTy IP = Builder.saveIP();
7231 for (auto &AlignedItem : AlignedVars) {
7232 Value *AlignedPtr = AlignedItem.first;
7233 Value *Alignment = AlignedItem.second;
 // NOTE(review): the dyn_cast result is dereferenced without a null check;
 // this crashes if AlignedPtr is not an Instruction (e.g. an argument).
 // Confirm that callers only pass instructions, or use cast<>.
7234 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
 // The assumption is placed directly after the pointer's definition.
7235 Builder.SetInsertPoint(loadInst->getNextNode());
7236 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
7237 Alignment);
7238 }
7239 Builder.restoreIP(IP);
7240 }
7241
7242 if (IfCond) {
7243 ValueToValueMapTy VMap;
7244 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
7245 }
7246
7248
7249 // Get the basic blocks from the loop in which memref instructions
7250 // can be found.
7251 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
7252 // preferably without running any passes.
7253 for (BasicBlock *Block : L->getBlocks()) {
 // The cond and header blocks of the canonical loop are left out.
7254 if (Block == CanonicalLoop->getCond() ||
7255 Block == CanonicalLoop->getHeader())
7256 continue;
7257 Reachable.insert(Block);
7258 }
7259
7260 SmallVector<Metadata *> LoopMDList;
7261
7262 // In presence of finite 'safelen', it may be unsafe to mark all
7263 // the memory instructions parallel, because loop-carried
7264 // dependences of 'safelen' iterations are possible.
7265 // If clause order(concurrent) is specified then the memory instructions
7266 // are marked parallel even if 'safelen' is finite.
7267 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent))
7268 applyParallelAccessesMetadata(CanonicalLoop, Ctx, L, LI, LoopMDList);
7269
7270 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
7271 // versions so we can't add the loop attributes in that case.
7272 if (IfCond) {
7273 // we can still add llvm.loop.parallel_access
7274 addLoopMetadata(CanonicalLoop, LoopMDList);
7275 return;
7276 }
7277
7278 // Use the above access group metadata to create loop level
7279 // metadata, which should be distinct for each loop.
7280 ConstantAsMetadata *BoolConst =
7282 LoopMDList.push_back(MDNode::get(
7283 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
7284
7285 if (Simdlen || Safelen) {
7286 // If both simdlen and safelen clauses are specified, the value of the
7287 // simdlen parameter must be less than or equal to the value of the safelen
7288 // parameter. Therefore, use safelen only in the absence of simdlen.
7289 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
7290 LoopMDList.push_back(
7291 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
7292 ConstantAsMetadata::get(VectorizeWidth)}));
7293 }
7294
7295 addLoopMetadata(CanonicalLoop, LoopMDList);
7296}
7297
7298/// Create the TargetMachine object to query the backend for optimization
7299/// preferences.
7300///
7301/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
7302/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
7303/// needed for the LLVM pass pipeline. We use some default options to avoid
7304/// having to pass too many settings from the frontend that probably do not
7305/// matter.
7306///
7307/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
7308/// method. If we are going to use TargetMachine for more purposes, especially
7309/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
7310/// might become worth requiring front-ends to pass on their TargetMachine,
7311/// or at least cache it between methods. Note that while frontends such as
7312/// Clang have just a single main TargetMachine per translation unit,
7313/// "target-cpu" and "target-features" that determine the TargetMachine are
7314/// per-function and can be overridden using __attribute__((target("OPTIONS"))).
///
/// Returns an empty pointer if no target is available.
7315static std::unique_ptr<TargetMachine>
7317 Module *M = F->getParent();
7318
 // CPU and feature string come from the function's attributes, the triple
 // from its module.
7319 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
7320 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
7321 const llvm::Triple &Triple = M->getTargetTriple();
7322
7323 std::string Error;
 // Without a target there is nothing to create.
7325 if (!TheTarget)
7326 return {};
7327
 // Relocation model and code model are left unspecified.
7329 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
7330 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
7331 /*CodeModel=*/std::nullopt, OptLevel));
7332}
7333
7334/// Heuristically determine the best-performant unroll factor for \p CLI. This
7335/// depends on the target processor. We are re-using the same heuristics as the
7336/// LoopUnrollPass.
7338 Function *F = CLI->getFunction();
7339
7340 // Assume the user requests the most aggressive unrolling, even if the rest of
7341 // the code is optimized using a lower setting.
7343 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
7344
7345 // Blocks must have terminators.
7346 // FIXME: Don't run analyses on incomplete/invalid IR.
7348 for (BasicBlock &BB : *F)
7349 if (!BB.hasTerminator())
7350 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
7351
7353 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
7354 FAM.registerPass([]() { return AssumptionAnalysis(); });
7355 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
7356 FAM.registerPass([]() { return LoopAnalysis(); });
7357 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
7358 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
7359 TargetIRAnalysis TIRA;
7360 if (TM)
7361 TIRA = TargetIRAnalysis(
7362 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
7363 FAM.registerPass([&]() { return TIRA; });
7364
7365 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
7367 ScalarEvolution &&SE = SEA.run(*F, FAM);
7369 DominatorTree &&DT = DTA.run(*F, FAM);
7370 LoopAnalysis LIA;
7371 LoopInfo &&LI = LIA.run(*F, FAM);
7373 AssumptionCache &&AC = ACT.run(*F, FAM);
7375
7376 for (Instruction *I : UIs)
7377 I->eraseFromParent();
7378
7379 Loop *L = LI.getLoopFor(CLI->getHeader());
7380 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
7381
7383 L, SE, TTI,
7384 /*BlockFrequencyInfo=*/nullptr,
7385 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
7386 /*UserThreshold=*/std::nullopt,
7387 /*UserCount=*/std::nullopt,
7388 /*UserAllowPartial=*/true,
7389 /*UserAllowRuntime=*/true,
7390 /*UserUpperBound=*/std::nullopt,
7391 /*UserFullUnrollMaxCount=*/std::nullopt);
7392
7393 UP.Force = true;
7394
7395 // Account for additional optimizations taking place before the LoopUnrollPass
7396 // would unroll the loop.
7399
7400 // Use normal unroll factors even if the rest of the code is optimized for
7401 // size.
7404
7405 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
7406 << " Threshold=" << UP.Threshold << "\n"
7407 << " PartialThreshold=" << UP.PartialThreshold << "\n"
7408 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
7409 << " PartialOptSizeThreshold="
7410 << UP.PartialOptSizeThreshold << "\n");
7411
7412 // Disable peeling.
7415 /*UserAllowPeeling=*/false,
7416 /*UserAllowProfileBasedPeeling=*/false,
7417 /*UnrollingSpecficValues=*/false);
7418
7420 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
7421
7422 // Assume that reads and writes to stack variables can be eliminated by
7423 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
7424 // size.
7425 for (BasicBlock *BB : L->blocks()) {
7426 for (Instruction &I : *BB) {
7427 Value *Ptr;
7428 if (auto *Load = dyn_cast<LoadInst>(&I)) {
7429 Ptr = Load->getPointerOperand();
7430 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
7431 Ptr = Store->getPointerOperand();
7432 } else
7433 continue;
7434
7435 Ptr = Ptr->stripPointerCasts();
7436
7437 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
7438 if (Alloca->getParent() == &F->getEntryBlock())
7439 EphValues.insert(&I);
7440 }
7441 }
7442 }
7443
7444 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
7445
7446 // Loop is not unrollable if the loop contains certain instructions.
7447 if (!UCE.canUnroll()) {
7448 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
7449 return 1;
7450 }
7451
7452 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
7453 << "\n");
7454
7455 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
7456 // be able to use it.
7457 int TripCount = 0;
7458 int MaxTripCount = 0;
7459 bool MaxOrZero = false;
7460 unsigned TripMultiple = 0;
7461
7462 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
7463 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP);
7464 unsigned Factor = UP.Count;
7465 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
7466
7467 // This function returns 1 to signal to not unroll a loop.
7468 if (Factor == 0)
7469 return 1;
7470 return Factor;
7471}
7472
7474 int32_t Factor,
7475 CanonicalLoopInfo **UnrolledCLI) {
// Requests partial unrolling of `Loop` by `Factor` (0 = choose the factor
// heuristically). Two modes, selected by `UnrolledCLI`:
//  - null: only loop metadata is attached and the function returns early.
//  - non-null: the loop is tiled by the factor; the outer tile loop is
//    reported through `*UnrolledCLI`, and the inner tile loop is the one
//    handed the unroll metadata built at the end of this function.
7476 assert(Factor >= 0 && "Unroll factor must not be negative");
7477
7478 Function *F = Loop->getFunction();
7479 LLVMContext &Ctx = F->getContext();
7480
7481 // If the unrolled loop is not used for another loop-associated directive, it
7482 // is sufficient to add metadata for the LoopUnrollPass.
7483 if (!UnrolledCLI) {
7484 SmallVector<Metadata *, 2> LoopMetadata;
7485 LoopMetadata.push_back(
7486 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
7487
// An explicit count is only recorded for Factor >= 1; with Factor == 0 just
// "llvm.loop.unroll.enable" is attached.
7488 if (Factor >= 1) {
7490 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
7491 LoopMetadata.push_back(MDNode::get(
7492 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
7493 }
7494
7495 addLoopMetadata(Loop, LoopMetadata);
7496 return;
7497 }
7498
7499 // Heuristically determine the unroll factor.
7500 if (Factor == 0)
7502
7503 // No change required with unroll factor 1.
7504 if (Factor == 1) {
7505 *UnrolledCLI = Loop;
7506 return;
7507 }
7508
7509 assert(Factor >= 2 &&
7510 "unrolling only makes sense with a factor of 2 or larger");
7511
7512 Type *IndVarTy = Loop->getIndVarType();
7513
7514 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
7515 // unroll the inner loop.
// The tile size constant is created in the loop's induction variable type.
7516 Value *FactorVal =
7517 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
7518 /*isSigned=*/false));
7519 std::vector<CanonicalLoopInfo *> LoopNest =
7520 tileLoops(DL, {Loop}, {FactorVal});
7521 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
// LoopNest[0] is returned to the caller; LoopNest[1] is the loop to unroll.
7522 *UnrolledCLI = LoopNest[0];
7523 CanonicalLoopInfo *InnerLoop = LoopNest[1];
7524
7525 // LoopUnrollPass can only fully unroll loops with constant trip count.
7526 // Unroll by the unroll factor with a fallback epilog for the remainder
7527 // iterations if necessary.
7529 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
7531 InnerLoop,
7532 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
7534 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
7535
// Consistency check of the returned loop, compiled in only for builds with
// assertions enabled.
7536#ifndef NDEBUG
7537 (*UnrolledCLI)->assertOK();
7538#endif
7539}
7540
7543 llvm::Value *BufSize, llvm::Value *CpyBuf,
7544 llvm::Value *CpyFn, llvm::Value *DidIt) {
// Emits a call to the __kmpc_copyprivate runtime entry with the arguments
// (ident, thread id, BufSize, CpyBuf, CpyFn, <i32 loaded from DidIt>).
// Returns the insertion point after the call, or Loc.IP unchanged when
// updateToLocation(Loc) fails.
7545 if (!updateToLocation(Loc))
7546 return Loc.IP;
7547
7548 uint32_t SrcLocStrSize;
7549 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7550 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7551 Value *ThreadId = getOrCreateThreadID(Ident);
7552
// DidIt is an address; the runtime call receives the loaded i32 value.
7553 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
7554
7555 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
7556
7557 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
7558 createRuntimeFunctionCall(Fn, Args);
7559
7560 return Builder.saveIP();
7561}
7562
7564 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7565 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
7567
// Emits a `single` region: a conditional inlined region entered through
// __kmpc_single and left through __kmpc_end_single. If CPVars is non-empty a
// DidIt flag is allocated and one copyprivate emission per CPVars entry
// follows the region; otherwise a barrier follows unless IsNowait is set.
// Returns Loc.IP unchanged when updateToLocation(Loc) fails, and propagates
// errors from region or barrier emission.
7568 if (!updateToLocation(Loc))
7569 return Loc.IP;
7570
7571 // If needed allocate and initialize `DidIt` with 0.
7572 // DidIt: flag variable: 1=single thread; 0=not single thread.
7573 llvm::Value *DidIt = nullptr;
7574 if (!CPVars.empty()) {
7575 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
7576 Builder.CreateStore(Builder.getInt32(0), DidIt);
7577 }
7578
7579 Directive OMPD = Directive::OMPD_single;
7580 uint32_t SrcLocStrSize;
7581 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7582 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7583 Value *ThreadId = getOrCreateThreadID(Ident);
7584 Value *Args[] = {Ident, ThreadId};
7585
// Both runtime calls are created here, at the current insertion point, and
// share the same (ident, thread id) arguments; they are handed to
// EmitOMPInlinedRegion below.
7586 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
7587 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7588
7589 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
7590 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7591
// Wraps the user's finalization callback so that the store of 1 to DidIt is
// emitted right after the user finalization code.
7592 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
7593 if (Error Err = FiniCB(IP))
7594 return Err;
7595
7596 // The thread that executes the single region must set `DidIt` to 1.
7597 // This is used by __kmpc_copyprivate, to know if the caller is the
7598 // single thread or not.
7599 if (DidIt)
7600 Builder.CreateStore(Builder.getInt32(1), DidIt);
7601
7602 return Error::success();
7603 };
7604
7605 // generates the following:
7606 // if (__kmpc_single()) {
7607 // .... single region ...
7608 // __kmpc_end_single
7609 // }
7610 // __kmpc_copyprivate
7611 // __kmpc_barrier
7612
7613 InsertPointOrErrorTy AfterIP =
7614 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
7615 /*Conditional*/ true,
7616 /*hasFinalize*/ true);
7617 if (!AfterIP)
7618 return AfterIP.takeError();
7619
7620 if (DidIt) {
// CPVars and CPFuncs are indexed in lockstep.
// NOTE(review): assumes CPFuncs has at least CPVars.size() entries; no check
// is visible here -- confirm against callers.
7621 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
7622 // NOTE BufSize is currently unused, so just pass 0.
7624 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
7625 CPFuncs[I], DidIt);
7626 // NOTE __kmpc_copyprivate already inserts a barrier
7627 } else if (!IsNowait) {
// This AfterIP intentionally or not shadows the outer AfterIP declared above.
7628 InsertPointOrErrorTy AfterIP =
7630 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
7631 /* CheckCancelFlag */ false);
7632 if (!AfterIP)
7633 return AfterIP.takeError();
7634 }
7635 return Builder.saveIP();
7636}
7637
7640 BodyGenCallbackTy BodyGenCB,
7641 FinalizeCallbackTy FiniCB, bool IsNowait) {
7642
// Emits a `scope` region as an unconditional, non-cancellable inlined region
// with finalization and no runtime entry/exit calls. A barrier is appended
// after the region unless IsNowait is set. Returns Loc.IP unchanged when
// updateToLocation(Loc) fails, and propagates region/barrier errors.
7643 if (!updateToLocation(Loc))
7644 return Loc.IP;
7645
7646 // All threads execute the scope body — no conditional entry.
7647 InsertPointOrErrorTy AfterIP = EmitOMPInlinedRegion(
7648 Directive::OMPD_scope, /*EntryCall=*/nullptr, /*ExitCall=*/nullptr,
7649 BodyGenCB, FiniCB, /*Conditional=*/false, /*HasFinalize=*/true,
7650 /*IsCancellable=*/false);
7651 if (!AfterIP)
7652 return AfterIP.takeError();
7653
// Continue emitting at the point returned for the end of the region.
7654 Builder.restoreIP(*AfterIP);
7655 if (!IsNowait) {
7656 AfterIP = createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
7657 omp::Directive::OMPD_unknown,
7658 /*ForceSimpleCall=*/false,
7659 /*CheckCancelFlag=*/false);
7660 if (!AfterIP)
7661 return AfterIP.takeError();
7662 }
7663 return Builder.saveIP();
7664}
7665
7667 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7668 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
7669
// Emits a `critical` region as an unconditional inlined region bracketed by
// __kmpc_critical (or __kmpc_critical_with_hint when HintInst is non-null)
// and __kmpc_end_critical. The lock is looked up by CriticalName. Returns
// Loc.IP unchanged when updateToLocation(Loc) fails.
7670 if (!updateToLocation(Loc))
7671 return Loc.IP;
7672
7673 Directive OMPD = Directive::OMPD_critical;
7674 uint32_t SrcLocStrSize;
7675 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7676 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7677 Value *ThreadId = getOrCreateThreadID(Ident);
7678 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
7679 Value *Args[] = {Ident, ThreadId, LockVar};
7680
// The entry call takes the same three arguments plus, optionally, the hint;
// the exit call always takes exactly (ident, thread id, lock).
7681 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
7682 Function *RTFn = nullptr;
7683 if (HintInst) {
7684 // Add Hint to entry Args and create call
7685 EnterArgs.push_back(HintInst);
7686 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
7687 } else {
7688 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
7689 }
7690 Instruction *EntryCall = createRuntimeFunctionCall(RTFn, EnterArgs);
7691
7692 Function *ExitRTLFn =
7693 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
7694 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7695
7696 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7697 /*Conditional*/ false, /*hasFinalize*/ true);
7698}
7699
7702 InsertPointTy AllocaIP, unsigned NumLoops,
7703 ArrayRef<llvm::Value *> StoreValues,
7704 const Twine &Name, bool IsDependSource) {
// Builds a [NumLoops x i64] stack array (alloca placed at AllocaIP), stores
// StoreValues[0..NumLoops) into it, and passes the address of its first
// element to __kmpc_doacross_post (IsDependSource) or __kmpc_doacross_wait.
// Returns Loc.IP unchanged when updateToLocation(Loc) fails.
// NOTE(review): StoreValues is indexed up to NumLoops without a size check
// here -- presumably callers guarantee StoreValues.size() >= NumLoops.
7705 assert(
7706 llvm::all_of(StoreValues,
7707 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
7708 "OpenMP runtime requires depend vec with i64 type");
7709
7710 if (!updateToLocation(Loc))
7711 return Loc.IP;
7712
7713 // Allocate space for vector and generate alloc instruction.
7714 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
7715 Builder.restoreIP(AllocaIP);
7716 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
7717 ArgsBase->setAlignment(Align(8));
7719
7720 // Store the index value with offset in depend vector.
7721 for (unsigned I = 0; I < NumLoops; ++I) {
7722 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
7723 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
7724 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
7725 STInst->setAlignment(Align(8));
7726 }
7727
// Address of element 0, i.e. the base of the dependence vector.
7728 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
7729 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
7730
7731 uint32_t SrcLocStrSize;
7732 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7733 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7734 Value *ThreadId = getOrCreateThreadID(Ident);
7735 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
7736
7737 Function *RTLFn = nullptr;
7738 if (IsDependSource)
7739 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
7740 else
7741 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
7742 createRuntimeFunctionCall(RTLFn, Args);
7743
7744 return Builder.saveIP();
7745}
7746
7748 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7749 FinalizeCallbackTy FiniCB, bool IsThreads) {
// Emits an `ordered` region as an unconditional inlined region. Only when
// IsThreads is set is the region bracketed by __kmpc_ordered and
// __kmpc_end_ordered; otherwise both calls stay null and the region is
// emitted without runtime calls. Returns Loc.IP unchanged when
// updateToLocation(Loc) fails.
7750 if (!updateToLocation(Loc))
7751 return Loc.IP;
7752
7753 Directive OMPD = Directive::OMPD_ordered;
7754 Instruction *EntryCall = nullptr;
7755 Instruction *ExitCall = nullptr;
7756
7757 if (IsThreads) {
7758 uint32_t SrcLocStrSize;
7759 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7760 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7761 Value *ThreadId = getOrCreateThreadID(Ident);
7762 Value *Args[] = {Ident, ThreadId};
7763
7764 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
7765 EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7766
7767 Function *ExitRTLFn =
7768 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
7769 ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7770 }
7771
7772 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7773 /*Conditional*/ false, /*hasFinalize*/ true);
7774}
7775
/// Common emitter for directives whose body is generated inline.
///
/// Splits the current block into an entry part, an "omp_region.finalize"
/// block and an "omp_region.end" block, lets emitCommonDirectiveEntry add the
/// optional conditional entry, runs \p BodyGenCB, and then lets
/// emitCommonDirectiveExit place finalization code and \p ExitCall in the
/// finalize block.
///
/// \param OMPD          Directive being emitted.
/// \param EntryCall     Runtime entry call, may be null.
/// \param ExitCall      Runtime exit call, may be null.
/// \param BodyGenCB     Callback that generates the region body.
/// \param FiniCB        Finalization callback; pushed on FinalizationStack
///                      only when \p HasFinalize is true.
/// \param Conditional   Forwarded to emitCommonDirectiveEntry.
/// \param HasFinalize   Whether finalization is registered and emitted.
/// \param IsCancellable Recorded in the pushed finalization entry.
/// \returns the builder's insertion point after the region, or the error
///          produced by the body callback or by the exit emission.
7776OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
7777 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
7778 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
7779 bool HasFinalize, bool IsCancellable) {
7780
7781 if (HasFinalize)
7782 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
7783
7784 // Create inlined region's entry and body blocks, in preparation
7785 // for conditional creation
7786 BasicBlock *EntryBB = Builder.GetInsertBlock();
7787 Instruction *SplitPos = EntryBB->getTerminatorOrNull();
7789 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
// After the two splits the layout is EntryBB -> FiniBB -> ExitBB, with
// SplitPos living in ExitBB.
7790 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
7791 BasicBlock *FiniBB =
7792 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
7793
7794 Builder.SetInsertPoint(EntryBB->getTerminator());
7795 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
7796
7797 // generate body
// NOTE(review): on this early return the FinalizationStack entry pushed
// above is not popped in this function (the pop happens in
// emitCommonDirectiveExit, which is only reached afterwards).
7798 if (Error Err =
7799 BodyGenCB(/* AllocaIP */ InsertPointTy(),
7800 /* CodeGenIP */ Builder.saveIP(), /* DeallocBlocks */ {}))
7801 return Err;
7802
7803 // emit exit call and do any needed finalization.
7804 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
7805 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
7806 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
7807 "Unexpected control flow graph state!!");
7808 InsertPointOrErrorTy AfterIP =
7809 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
7810 if (!AfterIP)
7811 return AfterIP.takeError();
7812
7813 // If we are skipping the region of a non conditional, remove the exit
7814 // block, and clear the builder's insertion point.
7815 assert(SplitPos->getParent() == ExitBB &&
7816 "Unexpected Insertion point location!");
// If the merge succeeds, SplitPos has moved into ExitBB's predecessor, so
// its parent is re-read after the merge to pick the block to continue in.
7817 auto merged = MergeBlockIntoPredecessor(ExitBB);
7818 BasicBlock *ExitPredBB = SplitPos->getParent();
7819 auto InsertBB = merged ? ExitPredBB : ExitBB;
7821 SplitPos->eraseFromParent();
7822 Builder.SetInsertPoint(InsertBB);
7823
7824 return Builder.saveIP();
7825}
7826
7827OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
7828 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
7829 // if nothing to do, Return current insertion point.
7830 if (!Conditional || !EntryCall)
7831 return Builder.saveIP();
7832
7833 BasicBlock *EntryBB = Builder.GetInsertBlock();
7834 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
7835 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
7836 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
7837
7838 // Emit thenBB and set the Builder's insertion point there for
7839 // body generation next. Place the block after the current block.
7840 Function *CurFn = EntryBB->getParent();
7841 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
7842
7843 // Move Entry branch to end of ThenBB, and replace with conditional
7844 // branch (If-stmt)
7845 Instruction *EntryBBTI = EntryBB->getTerminator();
7846 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
7847 EntryBBTI->removeFromParent();
7848 Builder.SetInsertPoint(UI);
7849 Builder.Insert(EntryBBTI);
7850 UI->eraseFromParent();
7851 Builder.SetInsertPoint(ThenBB->getTerminator());
7852
7853 // return an insertion point to ExitBB.
7854 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
7855}
7856
7857OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
7858 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
7859 bool HasFinalize) {
7860
7861 Builder.restoreIP(FinIP);
7862
7863 // If there is finalization to do, emit it before the exit call
7864 if (HasFinalize) {
7865 assert(!FinalizationStack.empty() &&
7866 "Unexpected finalization stack state!");
7867
7868 FinalizationInfo Fi = FinalizationStack.pop_back_val();
7869 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
7870
7871 if (Error Err = Fi.mergeFiniBB(Builder, FinIP.getBlock()))
7872 return std::move(Err);
7873
7874 // Exit condition: insertion point is before the terminator of the new Fini
7875 // block
7876 Builder.SetInsertPoint(FinIP.getBlock()->getTerminator());
7877 }
7878
7879 if (!ExitCall)
7880 return Builder.saveIP();
7881
7882 // place the Exitcall as last instruction before Finalization block terminator
7883 ExitCall->removeFromParent();
7884 Builder.Insert(ExitCall);
7885
7886 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
7887 ExitCall->getIterator());
7888}
7889
7891 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
7892 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
// Builds the control flow for a copyin: MasterAddr and PrivateAddr are
// compared as IntPtrTy integers and the "copyin.not.master" block is entered
// only when they differ. Returns IP unchanged if it is not set; otherwise
// returns an insertion point inside the "copyin.not.master" block.
7893 if (!IP.isSet())
7894 return IP;
7895
7897
7898 // creates the following CFG structure
7899 // OMP_Entry : (MasterAddr != PrivateAddr)?
7900 // F T
7901 // | \
7902 // | copyin.not.master
7903 // | /
7904 // v /
7905 // copyin.not.master.end
7906 // |
7907 // v
7908 // OMP.Entry.Next
7909
7910 BasicBlock *OMP_Entry = IP.getBlock();
7911 Function *CurFn = OMP_Entry->getParent();
7912 BasicBlock *CopyBegin =
7913 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
7914 BasicBlock *CopyEnd = nullptr;
7915
7916 // If entry block is terminated, split to preserve the branch to following
7917 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
7919 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
7920 "copyin.not.master.end");
7921 OMP_Entry->getTerminator()->eraseFromParent();
7922 } else {
7923 CopyEnd =
7924 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
7925 }
7926
// OMP_Entry has no terminator at this point, so the comparison and the
// conditional branch are appended at its end.
7927 Builder.SetInsertPoint(OMP_Entry);
7928 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
7929 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
7930 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
7931 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
7932
// With BranchtoEnd the copy block is terminated by a branch to CopyEnd and
// the insertion point is placed in front of that branch; without it the
// block is left unterminated and the insertion point is at its end.
7933 Builder.SetInsertPoint(CopyBegin);
7934 if (BranchtoEnd)
7935 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
7936
7937 return Builder.saveIP();
7938}
7939
7941 Value *Size, Value *Allocator,
7942 std::string Name) {
// Emits a call to __kmpc_alloc(thread id, Size, Allocator) and returns it;
// the call instruction is named Name. Returns nullptr when
// updateToLocation(Loc) fails.
7944 if (!updateToLocation(Loc))
7945 return nullptr;
7946
7947 uint32_t SrcLocStrSize;
7948 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
// Ident is only needed to obtain the thread id; it is not a call argument.
7949 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7950 Value *ThreadId = getOrCreateThreadID(Ident);
7951 Value *Args[] = {ThreadId, Size, Allocator};
7952
7953 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
7954
7955 return createRuntimeFunctionCall(Fn, Args, Name);
7956}
7957
7959 Value *Align, Value *Size,
7960 Value *Allocator,
7961 std::string Name) {
// Emits a call to __kmpc_aligned_alloc(thread id, Align, Size, Allocator)
// and returns it; the call instruction is named Name. Returns nullptr when
// updateToLocation(Loc) fails.
7963 if (!updateToLocation(Loc))
7964 return nullptr;
7965
7966 uint32_t SrcLocStrSize;
7967 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
// Ident is only needed to obtain the thread id; it is not a call argument.
7968 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7969 Value *ThreadId = getOrCreateThreadID(Ident);
7970 Value *Args[] = {ThreadId, Align, Size, Allocator};
7971
7972 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_aligned_alloc);
7973
// NOTE(review): the neighbouring __kmpc_alloc/__kmpc_free emitters go through
// createRuntimeFunctionCall, while this one calls Builder.CreateCall
// directly -- confirm whether the difference is intended.
7974 return Builder.CreateCall(Fn, Args, Name);
7975}
7976
7978 Value *Addr, Value *Allocator,
7979 std::string Name) {
// Emits a call to __kmpc_free(thread id, Addr, Allocator) and returns it; the
// call instruction is named Name. Returns nullptr when updateToLocation(Loc)
// fails.
7981 if (!updateToLocation(Loc))
7982 return nullptr;
7983
7984 uint32_t SrcLocStrSize;
7985 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
// Ident is only needed to obtain the thread id; it is not a call argument.
7986 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7987 Value *ThreadId = getOrCreateThreadID(Ident);
7988 Value *Args[] = {ThreadId, Addr, Allocator};
7989 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
7990 return createRuntimeFunctionCall(Fn, Args, Name);
7991}
7992
7994 Value *Size,
7995 const Twine &Name) {
7998
// Emits a call to __kmpc_alloc_shared(Size), named Name, and returns it.
// The statement ending below refers to the preferred alignment of the
// builder's Int64 type taken from the module's DataLayout.
7999 Value *Args[] = {Size};
8000 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc_shared);
8001 CallInst *Call = Builder.CreateCall(Fn, Args, Name);
8003 M.getContext(), M.getDataLayout().getPrefTypeAlign(Int64)));
8004 return Call;
8005}
8006
8008 Type *VarType,
8009 const Twine &Name) {
// Convenience overload: the size is the DataLayout allocation size of
// VarType, materialized as an i64 constant, and the work is delegated to the
// Value-sized createOMPAllocShared.
8010 return createOMPAllocShared(
8011 Loc, Builder.getInt64(M.getDataLayout().getTypeAllocSize(VarType)), Name);
8012}
8013
8015 Value *Addr, Value *Size,
8016 const Twine &Name) {
8019
// Emits a call to __kmpc_free_shared(Addr, Size), named Name, and returns it.
8020 Value *Args[] = {Addr, Size};
8021 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free_shared);
8022 return Builder.CreateCall(Fn, Args, Name);
8023}
8024
8026 Value *Addr, Type *VarType,
8027 const Twine &Name) {
// Convenience overload: the size is the DataLayout allocation size of
// VarType, materialized as an i64 constant, and the work is delegated to the
// Value-sized createOMPFreeShared.
8028 return createOMPFreeShared(
8029 Loc, Addr, Builder.getInt64(M.getDataLayout().getTypeAllocSize(VarType)),
8030 Name);
8031}
8032
8034 const LocationDescription &Loc, Value *InteropVar,
8035 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
8036 Value *DependenceAddress, bool HaveNowaitClause) {
8039
// Emits a call to __tgt_interop_init with (ident, thread id, InteropVar,
// interop type, device, number of dependences, dependence address, nowait
// flag) and returns it. Defaults applied below:
//  - null Device          -> all-ones i32 constant
//  - null NumDependences  -> i32 0, and DependenceAddress is replaced by a
//    null pointer regardless of what the caller passed for it.
8040 uint32_t SrcLocStrSize;
8041 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8042 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8043 Value *ThreadId = getOrCreateThreadID(Ident);
8044 if (Device == nullptr)
8045 Device = Constant::getAllOnesValue(Int32);
8046 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
8047 if (NumDependences == nullptr) {
8048 NumDependences = ConstantInt::get(Int32, 0);
8049 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
8050 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
8051 }
// The bool is passed to the runtime as an i32 (0 or 1).
8052 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
8053 Value *Args[] = {
8054 Ident, ThreadId, InteropVar, InteropTypeVal,
8055 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
8056
8057 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
8058
8059 return createRuntimeFunctionCall(Fn, Args);
8060}
8061
8063 const LocationDescription &Loc, Value *InteropVar, Value *Device,
8064 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
8067
// Emits a call to __tgt_interop_destroy with (ident, thread id, InteropVar,
// device, number of dependences, dependence address, nowait flag) and
// returns it. Defaults applied below:
//  - null Device          -> all-ones i32 constant
//  - null NumDependences  -> i32 0, and DependenceAddress is replaced by a
//    null pointer regardless of what the caller passed for it.
8068 uint32_t SrcLocStrSize;
8069 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8070 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8071 Value *ThreadId = getOrCreateThreadID(Ident);
8072 if (Device == nullptr)
8073 Device = Constant::getAllOnesValue(Int32);
8074 if (NumDependences == nullptr) {
8075 NumDependences = ConstantInt::get(Int32, 0);
8076 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
8077 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
8078 }
// The bool is passed to the runtime as an i32 (0 or 1).
8079 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
8080 Value *Args[] = {
8081 Ident, ThreadId, InteropVar, Device,
8082 NumDependences, DependenceAddress, HaveNowaitClauseVal};
8083
8084 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
8085
8086 return createRuntimeFunctionCall(Fn, Args);
8087}
8088
8090 Value *InteropVar, Value *Device,
8091 Value *NumDependences,
8092 Value *DependenceAddress,
8093 bool HaveNowaitClause) {
// Emits a call to __tgt_interop_use with (ident, thread id, InteropVar,
// device, number of dependences, dependence address, nowait flag) and
// returns it. Defaults applied below:
//  - null Device          -> all-ones i32 constant
//  - null NumDependences  -> i32 0, and DependenceAddress is replaced by a
//    null pointer regardless of what the caller passed for it.
8096 uint32_t SrcLocStrSize;
8097 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8098 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8099 Value *ThreadId = getOrCreateThreadID(Ident);
8100 if (Device == nullptr)
8101 Device = Constant::getAllOnesValue(Int32);
8102 if (NumDependences == nullptr) {
8103 NumDependences = ConstantInt::get(Int32, 0);
8104 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
8105 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
8106 }
// The bool is passed to the runtime as an i32 (0 or 1).
8107 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
8108 Value *Args[] = {
8109 Ident, ThreadId, InteropVar, Device,
8110 NumDependences, DependenceAddress, HaveNowaitClauseVal};
8111
8112 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
8113
8114 return createRuntimeFunctionCall(Fn, Args);
8115}
8116
8119 llvm::ConstantInt *Size, const llvm::Twine &Name) {
8122
// Emits a call to __kmpc_threadprivate_cached with (ident, thread id,
// Pointer, Size, cache variable) and returns it. The cache variable is an
// internal variable of type Int8PtrPtr obtained (or created) under Name.
8123 uint32_t SrcLocStrSize;
8124 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8125 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8126 Value *ThreadId = getOrCreateThreadID(Ident);
8127 Constant *ThreadPrivateCache =
8128 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
8129 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
8130
8131 Function *Fn =
8132 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
8133
8134 return createRuntimeFunctionCall(Fn, Args);
8135}
8136
8138 const LocationDescription &Loc,
// Emits the prologue of a target kernel:
//  1. records team/thread bounds on the kernel function,
//  2. creates the "<kernel>_dynamic_environment" and
//     "<kernel>_kernel_environment" globals,
//  3. calls __kmpc_target_init and branches on its result: all-ones (-1)
//     continues in "user_code.entry", anything else returns through
//     "worker.exit".
// Returns an insertion point at the start of "user_code.entry", or Loc.IP
// unchanged when updateToLocation(Loc) fails.
8140 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
8141 "expected num_threads and num_teams to be specified");
8142
8143 if (!updateToLocation(Loc))
8144 return Loc.IP;
8145
8146 uint32_t SrcLocStrSize;
8147 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8148 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
// Note: despite its name, IsSPMDVal holds the full Attrs.ExecFlags value as
// an i8, not a boolean.
8149 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
8150 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
8151 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD &&
8152 Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD_NO_LOOP);
8153 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
8154 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
8155
8156 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
8157 Function *Kernel = DebugKernelWrapper;
8158
8159 // We need to strip the debug prefix to get the correct kernel name.
// Although called a prefix, "_debug__" is matched and removed at the END of
// the function name (ends_with / drop_back). When present, the function
// named by the remaining string is treated as the real kernel.
8160 StringRef KernelName = Kernel->getName();
8161 const std::string DebugPrefix = "_debug__";
8162 if (KernelName.ends_with(DebugPrefix)) {
8163 KernelName = KernelName.drop_back(DebugPrefix.length());
8164 Kernel = M.getFunction(KernelName);
8165 assert(Kernel && "Expected the real kernel to exist");
8166 }
8167
8168 // Manifest the launch configuration in the metadata matching the kernel
8169 // environment.
8170 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
8171 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
8172
8173 // If MaxThreads is not set and needs adjustment, select the maximum between
8174 // the default workgroup size and the MinThreads value.
// Only the first entry of Attrs.MaxThreads is consulted; a negative value is
// treated as "not set".
8175 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
8176 if (MaxThreadsVal < 0 && UseDefaultMaxThreads) {
8177 if (hasGridValue(T)) {
8178 MaxThreadsVal =
8179 std::max(int32_t(getGridValue(T, Kernel).GV_Default_WG_Size),
8180 Attrs.MinThreads);
8181 } else {
8182 MaxThreadsVal = Attrs.MinThreads;
8183 }
8184 }
8185
8186 if (MaxThreadsVal > 0)
8187 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
8188
8189 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
8190 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
8191 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
8192 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
8193 Constant *ReductionDataSize =
8194 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
8195 Constant *ReductionBufferLength =
8196 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
8197
8199 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
8200 const DataLayout &DL = Fn->getDataLayout();
8201
// NOTE(review): a Twine is kept in a named local here; it refers to
// KernelName and a string literal, both of which outlive its use below --
// confirm this stays true if the expression is changed.
8202 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
8203 Constant *DynamicEnvironmentInitializer =
8204 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
8205 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
8206 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
8207 DynamicEnvironmentInitializer, DynamicEnvironmentName,
8208 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
8209 DL.getDefaultGlobalsAddressSpace());
8210 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
8211
// The global may live in a non-default address space; cast it to the
// expected pointer type when the types differ. This local shadows the
// DynamicEnvironment type name used above.
8212 Constant *DynamicEnvironment =
8213 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
8214 ? DynamicEnvironmentGV
8215 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
8216 DynamicEnvironmentPtr);
8217
// Field order matters: the deinit code in this file patches the reduction
// fields by index ({0, 7} and {0, 8}).
8218 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
8219 ConfigurationEnvironment, {
8220 UseGenericStateMachineVal,
8221 MayUseNestedParallelismVal,
8222 IsSPMDVal,
8223 MinThreads,
8224 MaxThreads,
8225 MinTeams,
8226 MaxTeams,
8227 ReductionDataSize,
8228 ReductionBufferLength,
8229 });
8230 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
8231 KernelEnvironment, {
8232 ConfigurationEnvironmentInitializer,
8233 Ident,
8234 DynamicEnvironment,
8235 });
8236 std::string KernelEnvironmentName =
8237 (KernelName + "_kernel_environment").str();
8238 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
8239 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
8240 KernelEnvironmentInitializer, KernelEnvironmentName,
8241 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
8242 DL.getDefaultGlobalsAddressSpace());
8243 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
8244
8245 Constant *KernelEnvironment =
8246 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
8247 ? KernelEnvironmentGV
8248 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
8249 KernelEnvironmentPtr);
// The launch environment is taken from the LAST argument of the function
// currently being emitted into, and cast to the runtime function's second
// parameter type if necessary.
8250 Value *KernelLaunchEnvironment =
8251 DebugKernelWrapper->getArg(DebugKernelWrapper->arg_size() - 1);
8252 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
8253 KernelLaunchEnvironment =
8254 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
8255 ? KernelLaunchEnvironment
8256 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
8257 KernelLaunchEnvParamTy);
8258 CallInst *ThreadKind = createRuntimeFunctionCall(
8259 Fn, {KernelEnvironment, KernelLaunchEnvironment});
8260
8261 Value *ExecUserCode = Builder.CreateICmpEQ(
8262 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
8263 "exec_user_code");
8264
8265 // ThreadKind = __kmpc_target_init(...)
8266 // if (ThreadKind == -1)
8267 // user_code
8268 // else
8269 // return;
8270
// A temporary unreachable marks the split point; it and the branch created
// by the split are both erased once the conditional branch is in place.
8271 auto *UI = Builder.CreateUnreachable();
8272 BasicBlock *CheckBB = UI->getParent();
8273 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
8274
8275 BasicBlock *WorkerExitBB = BasicBlock::Create(
8276 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
8277 Builder.SetInsertPoint(WorkerExitBB);
8278 Builder.CreateRetVoid();
8279
8280 auto *CheckBBTI = CheckBB->getTerminator();
8281 Builder.SetInsertPoint(CheckBBTI);
8282 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
8283
8284 CheckBBTI->eraseFromParent();
8285 UI->eraseFromParent();
8286
8287 // Continue in the "user_code" block, see diagram above and in
8288 // openmp/libomptarget/deviceRTLs/common/include/target.h .
8289 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
8290}
8291
8293 int32_t TeamsReductionDataSize,
8294 int32_t TeamsReductionBufferLength) {
// Emits the kernel epilogue using the __kmpc_target_deinit runtime function
// and, when both reduction parameters are non-zero, rewrites the reduction
// fields in the initializer of the "<kernel>_kernel_environment" global.
// Does nothing when updateToLocation(Loc) fails.
8295 if (!updateToLocation(Loc))
8296 return;
8297
8299 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
8300
8302
// The environment is only patched when BOTH values are non-zero.
8303 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
8304 return;
8305
8306 Function *Kernel = Builder.GetInsertBlock()->getParent();
8307 // We need to strip the debug prefix to get the correct kernel name.
// Although called a prefix, "_debug__" is matched and removed at the END of
// the function name (ends_with / drop_back).
8308 StringRef KernelName = Kernel->getName();
8309 const std::string DebugPrefix = "_debug__";
8310 if (KernelName.ends_with(DebugPrefix))
8311 KernelName = KernelName.drop_back(DebugPrefix.length());
8312 auto *KernelEnvironmentGV =
8313 M.getNamedGlobal((KernelName + "_kernel_environment").str());
8314 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
8315 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
// Index path {0, 7} / {0, 8}: member 0 of the kernel environment is the
// configuration struct; its members 7 and 8 are the reduction data size and
// reduction buffer length as laid out by the target-init code in this file.
8316 auto *NewInitializer = ConstantFoldInsertValueInstruction(
8317 KernelEnvironmentInitializer,
8318 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
8319 NewInitializer = ConstantFoldInsertValueInstruction(
8320 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
8321 {0, 8});
8322 KernelEnvironmentGV->setInitializer(NewInitializer);
8323}
8324
8325static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
8326 bool Min) {
8327 if (Kernel.hasFnAttribute(Name)) {
8328 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
8329 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
8330 }
8331 Kernel.addFnAttr(Name, llvm::utostr(Value));
8332}
8333
8334std::pair<int32_t, int32_t>
8336 int32_t ThreadLimit =
8337 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
8338
8339 if (T.isAMDGPU()) {
8340 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
8341 if (!Attr.isValid() || !Attr.isStringAttribute())
8342 return {0, ThreadLimit};
8343 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
8344 int32_t LB, UB;
8345 if (!llvm::to_integer(UBStr, UB, 10))
8346 return {0, ThreadLimit};
8347 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
8348 if (!llvm::to_integer(LBStr, LB, 10))
8349 return {0, UB};
8350 return {LB, UB};
8351 }
8352
8353 if (Kernel.hasFnAttribute(NVVMAttr::MaxNTID)) {
8354 int32_t UB = Kernel.getFnAttributeAsParsedInteger(NVVMAttr::MaxNTID);
8355 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
8356 }
8357 return {0, ThreadLimit};
8358}
8359
8361 Function &Kernel, int32_t LB,
8362 int32_t UB) {
8363 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
8364
8365 if (T.isAMDGPU()) {
8366 Kernel.addFnAttr("amdgpu-flat-work-group-size",
8367 llvm::utostr(LB) + "," + llvm::utostr(UB));
8368 return;
8369 }
8370
8372}
8373
8374std::pair<int32_t, int32_t>
8376 // TODO: Read from backend annotations if available.
8377 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
8378}
8379
8381 int32_t LB, int32_t UB) {
8382 if (UB > 0) {
8383 if (T.isNVPTX())
8385 if (T.isAMDGPU())
8386 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(UB) + ",1,1");
8387 }
8388
8389 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
8390}
8391
8392void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
8393 Function *OutlinedFn) {
8394 if (Config.isTargetDevice()) {
8396 // TODO: Determine if DSO local can be set to true.
8397 OutlinedFn->setDSOLocal(false);
8399 if (T.isAMDGCN())
8401 else if (T.isNVPTX())
8403 else if (T.isSPIRV())
8405 }
8406}
8407
8408Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
8409 StringRef EntryFnIDName) {
8410 if (Config.isTargetDevice()) {
8411 assert(OutlinedFn && "The outlined function must exist if embedded");
8412 return OutlinedFn;
8413 }
8414
8415 return new GlobalVariable(
8416 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
8417 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
8418}
8419
8420Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
8421 StringRef EntryFnName) {
8422 if (OutlinedFn)
8423 return OutlinedFn;
8424
8425 assert(!M.getGlobalVariable(EntryFnName, true) &&
8426 "Named kernel already exists?");
8427 return new GlobalVariable(
8428 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
8429 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
8430}
8431
8433 TargetRegionEntryInfo &EntryInfo,
8434 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
8435 Function *&OutlinedFn, Constant *&OutlinedFnID) {
8436
8437 SmallString<64> EntryFnName;
8438 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
8439
8440 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
8441 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
8442 if (!CBResult)
8443 return CBResult.takeError();
8444 OutlinedFn = *CBResult;
8445 } else {
8446 OutlinedFn = nullptr;
8447 }
8448
8449 // If this target outline function is not an offload entry, we don't need to
8450 // register it. This may be in the case of a false if clause, or if there are
8451 // no OpenMP targets.
8452 if (!IsOffloadEntry)
8453 return Error::success();
8454
8455 std::string EntryFnIDName =
8456 Config.isTargetDevice()
8457 ? std::string(EntryFnName)
8458 : createPlatformSpecificName({EntryFnName, "region_id"});
8459
8460 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
8461 EntryFnName, EntryFnIDName);
8462 return Error::success();
8463}
8464
8466 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
8467 StringRef EntryFnName, StringRef EntryFnIDName) {
8468 if (OutlinedFn)
8469 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
8470 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
8471 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
8472 OffloadInfoManager.registerTargetRegionEntryInfo(
8473 EntryInfo, EntryAddr, OutlinedFnID,
8475 return OutlinedFnID;
8476}
8477
8479 const LocationDescription &Loc, InsertPointTy AllocaIP,
8480 InsertPointTy CodeGenIP, ArrayRef<BasicBlock *> DeallocBlocks,
8481 Value *DeviceID, Value *IfCond, TargetDataInfo &Info,
8482 GenMapInfoCallbackTy GenMapInfoCB, CustomMapperCallbackTy CustomMapperCB,
8483 omp::RuntimeFunction *MapperFunc,
8485 BodyGenTy BodyGenType)>
8486 BodyGenCB,
8487 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
8488 if (!updateToLocation(Loc))
8489 return InsertPointTy();
8490
8491 Builder.restoreIP(CodeGenIP);
8492
8493 bool IsStandAlone = !BodyGenCB;
8494 MapInfosTy *MapInfo;
8495 // Generate the code for the opening of the data environment. Capture all the
8496 // arguments of the runtime call by reference because they are used in the
8497 // closing of the region.
8498 auto BeginThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
8499 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
8500 MapInfo = &GenMapInfoCB(Builder.saveIP());
8501 if (Error Err = emitOffloadingArrays(
8502 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
8503 /*IsNonContiguous=*/true, DeviceAddrCB))
8504 return Err;
8505
8506 TargetDataRTArgs RTArgs;
8508
8509 // Emit the number of elements in the offloading arrays.
8510 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
8511
8512 // Source location for the ident struct
8513 if (!SrcLocInfo) {
8514 uint32_t SrcLocStrSize;
8515 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8516 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8517 }
8518
8519 SmallVector<llvm::Value *, 13> OffloadingArgs = {
8520 SrcLocInfo, DeviceID,
8521 PointerNum, RTArgs.BasePointersArray,
8522 RTArgs.PointersArray, RTArgs.SizesArray,
8523 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
8524 RTArgs.MappersArray};
8525
8526 if (IsStandAlone) {
8527 assert(MapperFunc && "MapperFunc missing for standalone target data");
8528
8529 auto TaskBodyCB = [&](Value *, Value *,
8531 if (Info.HasNoWait) {
8532 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
8536 }
8537
8539 OffloadingArgs);
8540
8541 if (Info.HasNoWait) {
8542 BasicBlock *OffloadContBlock =
8543 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
8544 Function *CurFn = Builder.GetInsertBlock()->getParent();
8545 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
8546 Builder.restoreIP(Builder.saveIP());
8547 }
8548 return Error::success();
8549 };
8550
8551 bool RequiresOuterTargetTask = Info.HasNoWait;
8552 if (!RequiresOuterTargetTask)
8553 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
8554 /*TargetTaskAllocaIP=*/{}));
8555 else
8556 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
8557 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
8558 } else {
8559 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
8560 omp::OMPRTL___tgt_target_data_begin_mapper);
8561
8562 createRuntimeFunctionCall(BeginMapperFunc, OffloadingArgs);
8563
8564 for (auto DeviceMap : Info.DevicePtrInfoMap) {
8565 if (isa<AllocaInst>(DeviceMap.second.second)) {
8566 auto *LI =
8567 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
8568 Builder.CreateStore(LI, DeviceMap.second.second);
8569 }
8570 }
8571
8572 // If device pointer privatization is required, emit the body of the
8573 // region here. It will have to be duplicated: with and without
8574 // privatization.
8575 InsertPointOrErrorTy AfterIP =
8576 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
8577 if (!AfterIP)
8578 return AfterIP.takeError();
8579 Builder.restoreIP(*AfterIP);
8580 }
8581 return Error::success();
8582 };
8583
8584 // If we need device pointer privatization, we need to emit the body of the
8585 // region with no privatization in the 'else' branch of the conditional.
8586 // Otherwise, we don't have to do anything.
8587 auto BeginElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
8588 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
8589 InsertPointOrErrorTy AfterIP =
8590 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
8591 if (!AfterIP)
8592 return AfterIP.takeError();
8593 Builder.restoreIP(*AfterIP);
8594 return Error::success();
8595 };
8596
8597 // Generate code for the closing of the data region.
8598 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
8599 ArrayRef<BasicBlock *> DeallocBlocks) {
8600 TargetDataRTArgs RTArgs;
8601 Info.EmitDebug = !MapInfo->Names.empty();
8602 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
8603
8604 // Emit the number of elements in the offloading arrays.
8605 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
8606
8607 // Source location for the ident struct
8608 if (!SrcLocInfo) {
8609 uint32_t SrcLocStrSize;
8610 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8611 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8612 }
8613
8614 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
8615 PointerNum, RTArgs.BasePointersArray,
8616 RTArgs.PointersArray, RTArgs.SizesArray,
8617 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
8618 RTArgs.MappersArray};
8619 Function *EndMapperFunc =
8620 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
8621
8622 createRuntimeFunctionCall(EndMapperFunc, OffloadingArgs);
8623 return Error::success();
8624 };
8625
8626 // We don't have to do anything to close the region if the if clause evaluates
8627 // to false.
8628 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
8629 ArrayRef<BasicBlock *> DeallocBlocks) {
8630 return Error::success();
8631 };
8632
8633 Error Err = [&]() -> Error {
8634 if (BodyGenCB) {
8635 Error Err = [&]() {
8636 if (IfCond)
8637 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
8638 return BeginThenGen(AllocaIP, Builder.saveIP(), DeallocBlocks);
8639 }();
8640
8641 if (Err)
8642 return Err;
8643
8644 // If we don't require privatization of device pointers, we emit the body
8645 // in between the runtime calls. This avoids duplicating the body code.
8646 InsertPointOrErrorTy AfterIP =
8647 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
8648 if (!AfterIP)
8649 return AfterIP.takeError();
8650 restoreIPandDebugLoc(Builder, *AfterIP);
8651
8652 if (IfCond)
8653 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
8654 return EndThenGen(AllocaIP, Builder.saveIP(), DeallocBlocks);
8655 }
8656 if (IfCond)
8657 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
8658 return BeginThenGen(AllocaIP, Builder.saveIP(), DeallocBlocks);
8659 }();
8660
8661 if (Err)
8662 return Err;
8663
8664 return Builder.saveIP();
8665}
8666
8669 bool IsGPUDistribute) {
8670 assert((IVSize == 32 || IVSize == 64) &&
8671 "IV size is not compatible with the omp runtime");
8672 RuntimeFunction Name;
8673 if (IsGPUDistribute)
8674 Name = IVSize == 32
8675 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
8676 : omp::OMPRTL___kmpc_distribute_static_init_4u)
8677 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
8678 : omp::OMPRTL___kmpc_distribute_static_init_8u);
8679 else
8680 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
8681 : omp::OMPRTL___kmpc_for_static_init_4u)
8682 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
8683 : omp::OMPRTL___kmpc_for_static_init_8u);
8684
8685 return getOrCreateRuntimeFunction(M, Name);
8686}
8687
8689 bool IVSigned) {
8690 assert((IVSize == 32 || IVSize == 64) &&
8691 "IV size is not compatible with the omp runtime");
8692 RuntimeFunction Name = IVSize == 32
8693 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
8694 : omp::OMPRTL___kmpc_dispatch_init_4u)
8695 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
8696 : omp::OMPRTL___kmpc_dispatch_init_8u);
8697
8698 return getOrCreateRuntimeFunction(M, Name);
8699}
8700
8702 bool IVSigned) {
8703 assert((IVSize == 32 || IVSize == 64) &&
8704 "IV size is not compatible with the omp runtime");
8705 RuntimeFunction Name = IVSize == 32
8706 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
8707 : omp::OMPRTL___kmpc_dispatch_next_4u)
8708 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
8709 : omp::OMPRTL___kmpc_dispatch_next_8u);
8710
8711 return getOrCreateRuntimeFunction(M, Name);
8712}
8713
8715 bool IVSigned) {
8716 assert((IVSize == 32 || IVSize == 64) &&
8717 "IV size is not compatible with the omp runtime");
8718 RuntimeFunction Name = IVSize == 32
8719 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
8720 : omp::OMPRTL___kmpc_dispatch_fini_4u)
8721 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
8722 : omp::OMPRTL___kmpc_dispatch_fini_8u);
8723
8724 return getOrCreateRuntimeFunction(M, Name);
8725}
8726
8728 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
8729}
8730
8732 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
8733 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
8734
8735 DISubprogram *NewSP = Func->getSubprogram();
8736 if (!NewSP)
8737 return;
8738
8740
8741 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
8742 DILocalVariable *&NewVar = RemappedVariables[OldVar];
8743 // Only use cached variable if the arg number matches. This is important
8744 // so that DIVariable created for privatized variables are not discarded.
8745 if (NewVar && (arg == NewVar->getArg()))
8746 return NewVar;
8747
8749 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
8750 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
8751 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
8752 return NewVar;
8753 };
8754
8755 auto UpdateDebugRecord = [&](auto *DR) {
8756 DILocalVariable *OldVar = DR->getVariable();
8757 unsigned ArgNo = 0;
8758 for (auto Loc : DR->location_ops()) {
8759 auto Iter = ValueReplacementMap.find(Loc);
8760 if (Iter != ValueReplacementMap.end()) {
8761 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
8762 ArgNo = std::get<1>(Iter->second) + 1;
8763 }
8764 }
8765 if (ArgNo != 0)
8766 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
8767 };
8768
8770 auto MoveDebugRecordToCorrectBlock = [&](DbgVariableRecord *DVR) {
8771 if (DVR->getNumVariableLocationOps() != 1u) {
8772 DVR->setKillLocation();
8773 return;
8774 }
8775 Value *Loc = DVR->getVariableLocationOp(0u);
8776 BasicBlock *CurBB = DVR->getParent();
8777 BasicBlock *RequiredBB = nullptr;
8778
8779 if (Instruction *LocInst = dyn_cast<Instruction>(Loc))
8780 RequiredBB = LocInst->getParent();
8781 else if (isa<llvm::Argument>(Loc))
8782 RequiredBB = &DVR->getFunction()->getEntryBlock();
8783
8784 if (RequiredBB && RequiredBB != CurBB) {
8785 assert(!RequiredBB->empty());
8786 RequiredBB->insertDbgRecordBefore(DVR->clone(),
8787 RequiredBB->back().getIterator());
8788 DVRsToDelete.push_back(DVR);
8789 }
8790 };
8791
8792 // The location and scope of variable intrinsics and records still point to
8793 // the parent function of the target region. Update them.
8794 for (Instruction &I : instructions(Func)) {
8796 "Unexpected debug intrinsic");
8797 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
8798 UpdateDebugRecord(&DVR);
8799 MoveDebugRecordToCorrectBlock(&DVR);
8800 }
8801 }
8802 for (auto *DVR : DVRsToDelete)
8803 DVR->getMarker()->MarkedInstr->dropOneDbgRecord(DVR);
8804 // An extra argument is passed to the device. Create the debug data for it.
8805 if (OMPBuilder.Config.isTargetDevice()) {
8806 DICompileUnit *CU = NewSP->getUnit();
8807 Module *M = Func->getParent();
8808 DIBuilder DB(*M, true, CU);
8809 DIType *VoidPtrTy =
8810 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
8811 unsigned ArgNo = Func->arg_size();
8812 DILocalVariable *Var = DB.createParameterVariable(
8813 NewSP, "dyn_ptr", ArgNo, NewSP->getFile(), /*LineNo=*/0, VoidPtrTy,
8814 /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
8815 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
8816 Argument *LastArg = Func->getArg(Func->arg_size() - 1);
8817 DB.insertDeclare(LastArg, Var, DB.createExpression(), Loc,
8818 &(*Func->begin()));
8819 }
8820}
8821
8823 if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
8824 return cast<Operator>(V)->getOperand(0);
8825 return V;
8826}
8827
8829 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8831 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
8834 SmallVector<Type *> ParameterTypes;
8835 if (OMPBuilder.Config.isTargetDevice()) {
8836 // All parameters to target devices are passed as pointers
8837 // or i64. This assumes 64-bit address spaces/pointers.
8838 for (auto &Arg : Inputs)
8839 ParameterTypes.push_back(Arg->getType()->isPointerTy()
8840 ? Arg->getType()
8841 : Type::getInt64Ty(Builder.getContext()));
8842 } else {
8843 for (auto &Arg : Inputs)
8844 ParameterTypes.push_back(Arg->getType());
8845 }
8846
8847 // The implicit dyn_ptr argument is always the last parameter on both host
8848 // and device so the argument counts match without runtime manipulation.
8849 auto *PtrTy = PointerType::getUnqual(Builder.getContext());
8850 ParameterTypes.push_back(PtrTy);
8851
8852 auto BB = Builder.GetInsertBlock();
8853 auto M = BB->getModule();
8854 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
8855 /*isVarArg*/ false);
8856 auto Func =
8857 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
8858
8859 // Forward target-cpu and target-features function attributes from the
8860 // original function to the new outlined function.
8861 Function *ParentFn = Builder.GetInsertBlock()->getParent();
8862
8863 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
8864 if (TargetCpuAttr.isStringAttribute())
8865 Func->addFnAttr(TargetCpuAttr);
8866
8867 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
8868 if (TargetFeaturesAttr.isStringAttribute())
8869 Func->addFnAttr(TargetFeaturesAttr);
8870
8871 if (OMPBuilder.Config.isTargetDevice()) {
8872 Value *ExecMode =
8873 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
8874 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
8875 }
8876
8877 // Save insert point.
8878 IRBuilder<>::InsertPointGuard IPG(Builder);
8879 // We will generate the entries in the outlined function but the debug
8880 // location may still be pointing to the parent function. Reset it now.
8881 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
8882
8883 // Generate the region into the function.
8884 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
8885 Builder.SetInsertPoint(EntryBB);
8886
8887 // Insert target init call in the device compilation pass.
8888 if (OMPBuilder.Config.isTargetDevice())
8889 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
8890
8891 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
8892
8893 // As we embed the user code in the middle of our target region after we
8894 // generate entry code, we must move what allocas we can into the entry
8895 // block to avoid possible breaking optimisations for device
8896 if (OMPBuilder.Config.isTargetDevice())
8898
8899 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "target.exit");
8900 BasicBlock *OutlinedBodyBB =
8901 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
8903 Builder.saveIP(),
8904 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()),
8905 ExitBB);
8906 if (!AfterIP)
8907 return AfterIP.takeError();
8908 Builder.SetInsertPoint(ExitBB);
8909
8910 // Insert target deinit call in the device compilation pass.
8911 if (OMPBuilder.Config.isTargetDevice())
8912 OMPBuilder.createTargetDeinit(Builder);
8913
8914 // Insert return instruction.
8915 Builder.CreateRetVoid();
8916
8917 // New Alloca IP at entry point of created device function.
8918 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
8919 auto AllocaIP = Builder.saveIP();
8920
8921 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
8922
8923 // Do not include the artificial dyn_ptr argument.
8924 const auto &ArgRange = make_range(Func->arg_begin(), Func->arg_end() - 1);
8925
8927
8928 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
8929 // Things like GEP's can come in the form of Constants. Constants and
8930 // ConstantExpr's do not have access to the knowledge of what they're
8931 // contained in, so we must dig a little to find an instruction so we
8932 // can tell if they're used inside of the function we're outlining. We
8933 // also replace the original constant expression with a new instruction
8934 // equivalent; an instruction as it allows easy modification in the
8935 // following loop, as we can now know the constant (instruction) is
8936 // owned by our target function and replaceUsesOfWith can now be invoked
8937 // on it (cannot do this with constants it seems). A brand new one also
8938 // allows us to be cautious as it is perhaps possible the old expression
8939 // was used inside of the function but exists and is used externally
8940 // (unlikely by the nature of a Constant, but still).
8941 // NOTE: We cannot remove dead constants that have been rewritten to
8942 // instructions at this stage, we run the risk of breaking later lowering
8943 // by doing so as we could still be in the process of lowering the module
8944 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
8945 // constants we have created rewritten versions of.
8946 if (auto *Const = dyn_cast<Constant>(Input))
8947 convertUsersOfConstantsToInstructions(Const, Func, false);
8948
8949 // Collect users before iterating over them to avoid invalidating the
8950 // iteration in case a user uses Input more than once (e.g. a call
8951 // instruction).
8952 SetVector<User *> Users(Input->users().begin(), Input->users().end());
8953 // Collect all the instructions
8955 if (auto *Instr = dyn_cast<Instruction>(User))
8956 if (Instr->getFunction() == Func)
8957 Instr->replaceUsesOfWith(Input, InputCopy);
8958 };
8959
8960 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
8961
8962 // Rewrite uses of input values to parameters.
8963 for (auto InArg : zip(Inputs, ArgRange)) {
8964 Value *Input = std::get<0>(InArg);
8965 Argument &Arg = std::get<1>(InArg);
8966 Value *InputCopy = nullptr;
8967
8968 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = ArgAccessorFuncCB(
8969 Arg, Input, InputCopy, AllocaIP, Builder.saveIP(),
8970 OpenMPIRBuilder::InsertPointTy(ExitBB, ExitBB->begin()));
8971 if (!AfterIP)
8972 return AfterIP.takeError();
8973 Builder.restoreIP(*AfterIP);
8974 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
8975
8976 // In certain cases a Global may be set up for replacement, however, this
8977 // Global may be used in multiple arguments to the kernel, just segmented
8978 // apart, for example, if we have a global array, that is sectioned into
8979 // multiple mappings (technically not legal in OpenMP, but there is a case
8980 // in Fortran for Common Blocks where this is necessary), we will end up
8981 // with GEP's into this array inside the kernel, that refer to the Global
8982 // but are technically separate arguments to the kernel for all intents and
8983 // purposes. If we have mapped a segment that requires a GEP into the 0-th
8984 // index, it will fold into a reference to the Global, if we then encounter
8985 // this folded GEP during replacement all of the references to the
8986 // Global in the kernel will be replaced with the argument we have generated
8987 // that corresponds to it, including any other GEP's that refer to the
8988 // Global that may be other arguments. This will invalidate all of the other
8989 // preceding mapped arguments that refer to the same global that may be
8990 // separate segments. To prevent this, we defer global processing until all
8991 // other processing has been performed.
8994 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
8995 continue;
8996 }
8997
8999 continue;
9000
9001 ReplaceValue(Input, InputCopy, Func);
9002 }
9003
9004 // Replace all of our deferred Input values, currently just Globals.
9005 for (auto Deferred : DeferredReplacement)
9006 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
9007
9008 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
9009 ValueReplacementMap);
9010 return Func;
9011}
9012/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
9013/// of pointers containing shared data between the parent task and the created
9014/// task.
9016 IRBuilderBase &Builder,
9017 Value *TaskWithPrivates,
9018 Type *TaskWithPrivatesTy) {
9019
9020 Type *TaskTy = OMPIRBuilder.Task;
9021 LLVMContext &Ctx = Builder.getContext();
9022 Value *TaskT =
9023 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
9024 Value *Shareds = TaskT;
9025 // TaskWithPrivatesTy can be one of the following
9026 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
9027 // %struct.privates }
9028 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
9029 //
9030 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
9031 // its first member has to be the task descriptor. TaskTy is the type of the
9032 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
9033 // first member of TaskT, gives us the pointer to shared data.
9034 if (TaskWithPrivatesTy != TaskTy)
9035 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
9036 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
9037}
9038/// Create an entry point for a target task.
9039/// It will have the following signature:
9040/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
9041/// This function is called from emitTargetTask once the
9042/// code to launch the target kernel has been outlined already.
9043/// NumOffloadingArrays is the number of offloading arrays that we need to copy
9044/// into the task structure so that the deferred target task can access this
9045/// data even after the stack frame of the generating task has been rolled
9046/// back. Offloading arrays contain base pointers, pointers, sizes etc
9047/// of the data that the target kernel will access. These in effect are the
9048/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
9050 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
9051 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
9052 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
9053
9054 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
9055 // This is because PrivatesTy is the type of the structure in which
9056 // we pass the offloading arrays to the deferred target task.
9057 assert((!NumOffloadingArrays || PrivatesTy) &&
9058 "PrivatesTy cannot be nullptr when there are offloadingArrays"
9059 "to privatize");
9060
9061 Module &M = OMPBuilder.M;
9062 // KernelLaunchFunction is the target launch function, i.e.
9063 // the function that sets up kernel arguments and calls
9064 // __tgt_target_kernel to launch the kernel on the device.
9065 //
9066 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
9067
9068 // StaleCI is the CallInst which is the call to the outlined
9069 // target kernel launch function. If there are local live-in values
9070 // that the outlined function uses then these are aggregated into a structure
9071 // which is passed as the second argument. If there are no local live-in
9072 // values or if all values used by the outlined kernel are global variables,
9073 // then there's only one argument, the threadID. So, StaleCI can be
9074 //
9075 // %structArg = alloca { ptr, ptr }, align 8
9076 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
9077 // store ptr %20, ptr %gep_, align 8
9078 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
9079 // store ptr %21, ptr %gep_8, align 8
9080 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
9081 //
9082 // OR
9083 //
9084 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
9086 StaleCI->getIterator());
9087
9088 LLVMContext &Ctx = StaleCI->getParent()->getContext();
9089
9090 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
9091 Type *TaskPtrTy = OMPBuilder.TaskPtr;
9092 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
9093
9094 auto ProxyFnTy =
9095 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
9096 /* isVarArg */ false);
9097 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
9098 ".omp_target_task_proxy_func",
9099 Builder.GetInsertBlock()->getModule());
9100 Value *ThreadId = ProxyFn->getArg(0);
9101 Value *TaskWithPrivates = ProxyFn->getArg(1);
9102 ThreadId->setName("thread.id");
9103 TaskWithPrivates->setName("task");
9104
9105 bool HasShareds = SharedArgsOperandNo > 0;
9106 bool HasOffloadingArrays = NumOffloadingArrays > 0;
9107 BasicBlock *EntryBB =
9108 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
9109 Builder.SetInsertPoint(EntryBB);
9110
9111 SmallVector<Value *> KernelLaunchArgs;
9112 KernelLaunchArgs.reserve(StaleCI->arg_size());
9113 KernelLaunchArgs.push_back(ThreadId);
9114
9115 if (HasOffloadingArrays) {
9116 assert(TaskTy != TaskWithPrivatesTy &&
9117 "If there are offloading arrays to pass to the target"
9118 "TaskTy cannot be the same as TaskWithPrivatesTy");
9119 (void)TaskTy;
9120 Value *Privates =
9121 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
9122 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
9123 KernelLaunchArgs.push_back(
9124 Builder.CreateStructGEP(PrivatesTy, Privates, i));
9125 }
9126
9127 if (HasShareds) {
9128 auto *ArgStructAlloca =
9129 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
9130 assert(ArgStructAlloca &&
9131 "Unable to find the alloca instruction corresponding to arguments "
9132 "for extracted function");
9133 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
9134 std::optional<TypeSize> ArgAllocSize =
9135 ArgStructAlloca->getAllocationSize(M.getDataLayout());
9136 assert(ArgStructType && ArgAllocSize &&
9137 "Unable to determine size of arguments for extracted function");
9138 uint64_t StructSize = ArgAllocSize->getFixedValue();
9139
9140 AllocaInst *NewArgStructAlloca =
9141 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
9142
9143 Value *SharedsSize = Builder.getInt64(StructSize);
9144
9146 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
9147
9148 Builder.CreateMemCpy(
9149 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
9150 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
9151 KernelLaunchArgs.push_back(NewArgStructAlloca);
9152 }
9153 OMPBuilder.createRuntimeFunctionCall(KernelLaunchFunction, KernelLaunchArgs);
9154 Builder.CreateRetVoid();
9155 return ProxyFn;
9156}
9158
9159 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
9160 return GEP->getSourceElementType();
9161 if (auto *Alloca = dyn_cast<AllocaInst>(V))
9162 return Alloca->getAllocatedType();
9163
9164 llvm_unreachable("Unhandled Instruction type");
9165 return nullptr;
9166}
9167// This function returns a struct that has at most two members.
9168// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
9169// descriptor. The second member, if needed, is a struct containing arrays
9170// that need to be passed to the offloaded target kernel. For example,
9171// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
9172// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
9173// respectively, then the types created by this function are
9174//
9175// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
9176// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
9177// %struct.privates }
9178// %struct.task_with_privates is returned by this function.
9179// If there aren't any offloading arrays to pass to the target kernel,
9180// %struct.kmp_task_ompbuilder_t is returned.
9181static StructType *
9183 ArrayRef<Value *> OffloadingArraysToPrivatize) {
9184
9185 if (OffloadingArraysToPrivatize.empty())
9186 return OMPIRBuilder.Task;
9187
9188 SmallVector<Type *, 4> StructFieldTypes;
9189 for (Value *V : OffloadingArraysToPrivatize) {
9190 assert(V->getType()->isPointerTy() &&
9191 "Expected pointer to array to privatize. Got a non-pointer value "
9192 "instead");
9193 Type *ArrayTy = getOffloadingArrayType(V);
9194 assert(ArrayTy && "ArrayType cannot be nullptr");
9195 StructFieldTypes.push_back(ArrayTy);
9196 }
9197 StructType *PrivatesStructTy =
9198 StructType::create(StructFieldTypes, "struct.privates");
9199 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
9200 "struct.task_with_privates");
9201}
9203 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
9204 TargetRegionEntryInfo &EntryInfo,
9206 Function *&OutlinedFn, Constant *&OutlinedFnID,
9210
9211 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
9212 [&](StringRef EntryFnName) {
9213 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
9214 EntryFnName, Inputs, CBFunc,
9215 ArgAccessorFuncCB);
9216 };
9217
9218 return OMPBuilder.emitTargetRegionFunction(
9219 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
9220 OutlinedFnID);
9221}
9222
9224 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
9226 const DependenciesInfo &Dependencies, const TargetDataRTArgs &RTArgs,
9227 bool HasNoWait) {
9228
9229 // The following explains the code-gen scenario for the `target` directive. A
9230 // similar scenario is followed for other device-related directives (e.g.
9231 // `target enter data`), but in a simpler fashion since we only need to emit
9232 // a task that encapsulates the proper runtime call.
9233 //
9234 // When we arrive at this function, the target region itself has been
9235 // outlined into the function OutlinedFn.
9236 // So at this point, for
9237 // --------------------------------------------------------------
9238 // void user_code_that_offloads(...) {
9239 // omp target depend(..) map(from:a) map(to:b) private(i)
9240 // do i = 1, 10
9241 // a(i) = b(i) + n
9242 // }
9243 //
9244 // --------------------------------------------------------------
9245 //
9246 // we have
9247 //
9248 // --------------------------------------------------------------
9249 //
9250 // void user_code_that_offloads(...) {
9251 // %.offload_baseptrs = alloca [2 x ptr], align 8
9252 // %.offload_ptrs = alloca [2 x ptr], align 8
9253 // %.offload_mappers = alloca [2 x ptr], align 8
9254 // ;; target region has been outlined and now we need to
9255 // ;; offload to it via a target task.
9256 // }
9257 // void outlined_device_function(ptr a, ptr b, ptr n) {
9258 // n = *n_ptr;
9259 // do i = 1, 10
9260 // a(i) = b(i) + n
9261 // }
9262 //
9263 // We have to now do the following
9264 // (i) Make an offloading call to outlined_device_function using the OpenMP
9265 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
9266 // emitted by emitKernelLaunch
9267 // (ii) Create a task entry point function that calls kernel_launch_function
9268 // and is the entry point for the target task. See
9269 // '@.omp_target_task_proxy_func' in the pseudocode below.
9270 // (iii) Create a task with the task entry point created in (ii)
9271 //
9272 // That is we create the following
9273 // struct task_with_privates {
9274 // struct kmp_task_ompbuilder_t task_struct;
9275 // struct privates {
9276 // [2 x ptr] ; baseptrs
9277 // [2 x ptr] ; ptrs
9278 // [2 x i64] ; sizes
9279 // }
9280 // }
9281 // void user_code_that_offloads(...) {
9282 // %.offload_baseptrs = alloca [2 x ptr], align 8
9283 // %.offload_ptrs = alloca [2 x ptr], align 8
9284 // %.offload_sizes = alloca [2 x i64], align 8
9285 //
9286 // %structArg = alloca { ptr, ptr, ptr }, align 8
9287 // %structArg[0] = a
9288 // %structArg[1] = b
9289 // %structArg[2] = &n
9290 //
9291 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
9292 // sizeof(kmp_task_ompbuilder_t),
9293 // sizeof(structArg),
9294 // @.omp_target_task_proxy_func,
9295 // ...)
9296 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
9297 // sizeof(structArg))
9298 // memcpy(target_task_with_privates->privates->baseptrs,
9299 // offload_baseptrs, sizeof(offload_baseptrs))
9300 // memcpy(target_task_with_privates->privates->ptrs,
9301 // offload_ptrs, sizeof(offload_ptrs))
9302 // memcpy(target_task_with_privates->privates->sizes,
9303 // offload_sizes, sizeof(offload_sizes))
9304 // dependencies_array = ...
9305 // ;; if nowait not present
9306 // call @__kmpc_omp_wait_deps(..., dependencies_array)
9307 // call @__kmpc_omp_task_begin_if0(...)
9308 // call @.omp_target_task_proxy_func(i32 thread_id, ptr
9309 // %target_task_with_privates)
9310 // call @__kmpc_omp_task_complete_if0(...)
9311 // }
9312 //
9313 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
9314 // ptr %task) {
9315 // %structArg = alloca {ptr, ptr, ptr}
9316 // %task_ptr = getelementptr(%task, 0, 0)
9317 // %shared_data = load (getelementptr %task_ptr, 0, 0)
9318 // memcpy(%structArg, %shared_data, sizeof(%structArg))
9319 //
9320 // %offloading_arrays = getelementptr(%task, 0, 1)
9321 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
9322 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
9323 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
9324 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
9325 // %offload_sizes, %structArg)
9326 // }
9327 //
9328 // We need the proxy function because the signature of the task entry point
9329 // expected by kmpc_omp_task is always the same and will be different from
9330 // that of the kernel_launch function.
9331 //
9332 // kernel_launch_function is generated by emitKernelLaunch and has the
9333 // always_inline attribute. For this example, it'll look like so:
9334 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
9335 // %offload_sizes, %structArg) alwaysinline {
9336 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
9337 // ; load aggregated data from %structArg
9338 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
9339 // ; offload_sizes
9340 // call i32 @__tgt_target_kernel(...,
9341 // outlined_device_function,
9342 // ptr %kernel_args)
9343 // }
9344 // void outlined_device_function(ptr a, ptr b, ptr n) {
9345 // n = *n_ptr;
9346 // do i = 1, 10
9347 // a(i) = b(i) + n
9348 // }
9349 //
9350 BasicBlock *TargetTaskBodyBB =
9351 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
9352 BasicBlock *TargetTaskAllocaBB =
9353 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
9354
9355 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
9356 TargetTaskAllocaBB->begin());
9357 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
9358
9359 auto OI = std::make_unique<OutlineInfo>();
9360 OI->EntryBB = TargetTaskAllocaBB;
9361 OI->OuterAllocBB = AllocaIP.getBlock();
9362
9363 // Add the thread ID argument.
9365 OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9366 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
9367
9368 // Generate the task body which will subsequently be outlined.
9369 Builder.restoreIP(TargetTaskBodyIP);
9370 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
9371 return Err;
9372
9373 // The outliner (CodeExtractor) extracts a sequence or vector of blocks that
9374 // it is given. These blocks are enumerated by
9375 // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
9376 // to be outside the region. In other words, OI.ExitBlock is expected to be
9377 // the start of the region after the outlining. We used to set OI.ExitBlock
9378 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
9379 // except when the task body is a single basic block. In that case,
9380 // OI.ExitBlock is set to the single task body block and will get left out of
9381 // the outlining process. So, simply create a new empty block to which we
9382 // unconditionally branch from where TaskBodyCB left off.
9383 OI->ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
9384 emitBlock(OI->ExitBB, Builder.GetInsertBlock()->getParent(),
9385 /*IsFinished=*/true);
9386
9387 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
9388 bool NeedsTargetTask = HasNoWait && DeviceID;
9389 if (NeedsTargetTask) {
9390 for (auto *V :
9391 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
9392 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
9393 RTArgs.SizesArray}) {
9395 OffloadingArraysToPrivatize.push_back(V);
9396 OI->ExcludeArgsFromAggregate.push_back(V);
9397 }
9398 }
9399 }
9400 OI->PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
9401 DeviceID, OffloadingArraysToPrivatize](
9402 Function &OutlinedFn) mutable {
9403 assert(OutlinedFn.hasOneUse() &&
9404 "there must be a single user for the outlined function");
9405
9406 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
9407
9408 // The first argument of StaleCI is always the thread id.
9409 // The next few arguments are the pointers to offloading arrays
9410 // if any. (see OffloadingArraysToPrivatize)
9411 // Finally, all other local values that are live-in into the outlined region
9412 // end up in a structure whose pointer is passed as the last argument. This
9413 // piece of data is passed in the "shared" field of the task structure. So,
9414 // we know we have to pass shareds to the task if the number of arguments is
9415 // greater than OffloadingArraysToPrivatize.size() + 1. The 1 is for the
9416 // thread id. Further, for safety, we assert that the number of arguments of
9417 // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2
9418 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
9419 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
9420 assert((!HasShareds ||
9421 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
9422 "Wrong number of arguments for StaleCI when shareds are present");
9423 int SharedArgOperandNo =
9424 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
9425
9426 StructType *TaskWithPrivatesTy =
9427 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
9428 StructType *PrivatesTy = nullptr;
9429
9430 if (!OffloadingArraysToPrivatize.empty())
9431 PrivatesTy =
9432 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
9433
9435 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
9436 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
9437
9438 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
9439 << "\n");
9440
9441 Builder.SetInsertPoint(StaleCI);
9442
9443 // Gather the arguments for emitting the runtime call.
9444 uint32_t SrcLocStrSize;
9445 Constant *SrcLocStr =
9447 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
9448
9449 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
9450 //
9451 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
9452 // the DeviceID to the deferred task and also since
9453 // @__kmpc_omp_target_task_alloc creates an untied/async task.
9454 Function *TaskAllocFn =
9455 !NeedsTargetTask
9456 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
9458 OMPRTL___kmpc_omp_target_task_alloc);
9459
9460 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
9461 // call.
9462 Value *ThreadID = getOrCreateThreadID(Ident);
9463
9464 // Argument - `sizeof_kmp_task_t` (TaskSize)
9465 // TaskSize refers to the size in bytes of kmp_task_t data structure
9466 // plus any other data to be passed to the target task, if any, which
9467 // is packed into a struct. kmp_task_t and the struct so created are
9468 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
9469 Value *TaskSize = Builder.getInt64(
9470 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
9471
9472 // Argument - `sizeof_shareds` (SharedsSize)
9473 // SharedsSize refers to the shareds array size in the kmp_task_t data
9474 // structure.
9475 Value *SharedsSize = Builder.getInt64(0);
9476 if (HasShareds) {
9477 auto *ArgStructAlloca =
9478 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
9479 assert(ArgStructAlloca &&
9480 "Unable to find the alloca instruction corresponding to arguments "
9481 "for extracted function");
9482 std::optional<TypeSize> ArgAllocSize =
9483 ArgStructAlloca->getAllocationSize(M.getDataLayout());
9484 assert(ArgAllocSize &&
9485 "Unable to determine size of arguments for extracted function");
9486 SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
9487 }
9488
9489 // Argument - `flags`
9490 // Task is tied iff (Flags & 1) == 1.
9491 // Task is untied iff (Flags & 1) == 0.
9492 // Task is final iff (Flags & 2) == 2.
9493 // Task is not final iff (Flags & 2) == 0.
9494 // A target task is not final and is untied.
9495 Value *Flags = Builder.getInt32(0);
9496
9497 // Emit the @__kmpc_omp_task_alloc runtime call
9498 // The runtime call returns a pointer to an area where the task captured
9499 // variables must be copied before the task is run (TaskData)
9500 CallInst *TaskData = nullptr;
9501
9502 SmallVector<llvm::Value *> TaskAllocArgs = {
9503 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
9504 /*flags=*/Flags,
9505 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
9506 /*task_func=*/ProxyFn};
9507
9508 if (NeedsTargetTask) {
9509 assert(DeviceID && "Expected non-empty device ID.");
9510 TaskAllocArgs.push_back(DeviceID);
9511 }
9512
9513 TaskData = createRuntimeFunctionCall(TaskAllocFn, TaskAllocArgs);
9514
9515 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
9516 if (HasShareds) {
9517 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
9519 *this, Builder, TaskData, TaskWithPrivatesTy);
9520 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
9521 SharedsSize);
9522 }
9523 if (!OffloadingArraysToPrivatize.empty()) {
9524 Value *Privates =
9525 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
9526 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
9527 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
9528 [[maybe_unused]] Type *ArrayType =
9529 getOffloadingArrayType(PtrToPrivatize);
9530 assert(ArrayType && "ArrayType cannot be nullptr");
9531
9532 Type *ElementType = PrivatesTy->getElementType(i);
9533 assert(ElementType == ArrayType &&
9534 "ElementType should match ArrayType");
9535 (void)ArrayType;
9536
9537 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
9538 Builder.CreateMemCpy(
9539 Dst, Alignment, PtrToPrivatize, Alignment,
9540 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
9541 }
9542 }
9543
9544 Value *DepArray = nullptr;
9545 Value *NumDeps = nullptr;
9546 if (Dependencies.DepArray) {
9547 DepArray = Dependencies.DepArray;
9548 NumDeps = Dependencies.NumDeps;
9549 } else if (!Dependencies.Deps.empty()) {
9550 DepArray = emitTaskDependencies(*this, Dependencies.Deps);
9551 NumDeps = Builder.getInt32(Dependencies.Deps.size());
9552 }
9553
9554 // ---------------------------------------------------------------
9555 // V5.2 13.8 target construct
9556 // If the nowait clause is present, execution of the target task
9557 // may be deferred. If the nowait clause is not present, the target task is
9558 // an included task.
9559 // ---------------------------------------------------------------
9560 // The above means that the lack of a nowait on the target construct
9561 // translates to '#pragma omp task if(0)'
9562 if (!NeedsTargetTask) {
9563 if (DepArray) {
9564 Function *TaskWaitFn =
9565 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
9567 TaskWaitFn,
9568 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
9569 /*ndeps=*/NumDeps,
9570 /*dep_list=*/DepArray,
9571 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
9572 /*noalias_dep_list=*/
9574 }
9575 // Included task.
9576 Function *TaskBeginFn =
9577 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
9578 Function *TaskCompleteFn =
9579 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
9580 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
9581 CallInst *CI = createRuntimeFunctionCall(ProxyFn, {ThreadID, TaskData});
9582 CI->setDebugLoc(StaleCI->getDebugLoc());
9583 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
9584 } else if (DepArray) {
9585 // HasNoWait - meaning the task may be deferred. Call
9586 // __kmpc_omp_task_with_deps if there are dependencies,
9587 // else call __kmpc_omp_task
9588 Function *TaskFn =
9589 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
9591 TaskFn,
9592 {Ident, ThreadID, TaskData, NumDeps, DepArray,
9593 ConstantInt::get(Builder.getInt32Ty(), 0),
9595 } else {
9596 // Emit the @__kmpc_omp_task runtime call to spawn the task
9597 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
9598 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
9599 }
9600
9601 StaleCI->eraseFromParent();
9602 for (Instruction *I : llvm::reverse(ToBeDeleted))
9603 I->eraseFromParent();
9604 };
9605 addOutlineInfo(std::move(OI));
9606
9607 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
9608 << *(Builder.GetInsertBlock()) << "\n");
9609 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
9610 << *(Builder.GetInsertBlock()->getParent()->getParent())
9611 << "\n");
9612 return Builder.saveIP();
9613}
9614
9616 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
9617 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
9618 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
9619 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
9620 if (Error Err =
9621 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
9622 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
9623 return Err;
9624 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
9625 return Error::success();
9626}
9627
9628static void emitTargetCall(
9629 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
9634 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
9638 const OpenMPIRBuilder::DependenciesInfo &Dependencies, bool HasNoWait,
9639 Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
9640 // Generate a function call to the host fallback implementation of the target
9641 // region. This is called by the host when no offload entry was generated for
9642 // the target region and when the offloading call fails at runtime.
9643 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
9645 Builder.restoreIP(IP);
9646 // Ensure the host fallback has the same dyn_ptr ABI as the device.
9647 SmallVector<Value *> FallbackArgs(Args.begin(), Args.end());
9648 FallbackArgs.push_back(
9649 Constant::getNullValue(PointerType::getUnqual(Builder.getContext())));
9650 OMPBuilder.createRuntimeFunctionCall(OutlinedFn, FallbackArgs);
9651 return Builder.saveIP();
9652 };
9653
9654 bool HasDependencies = !Dependencies.empty();
9655 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
9656
9658
9659 auto TaskBodyCB =
9660 [&](Value *DeviceID, Value *RTLoc,
9661 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
9662 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
9663 // produce any.
9665 // emitKernelLaunch makes the necessary runtime call to offload the
9666 // kernel. We then outline all that code into a separate function
9667 // ('kernel_launch_function' in the pseudo code above). This function is
9668 // then called by the target task proxy function (see
9669 // '@.omp_target_task_proxy_func' in the pseudo code above)
9670 // '@.omp_target_task_proxy_func' is generated by
9671 // emitTargetTaskProxyFunction.
9672 if (OutlinedFnID && DeviceID)
9673 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
9674 EmitTargetCallFallbackCB, KArgs,
9675 DeviceID, RTLoc, TargetTaskAllocaIP);
9676
9677 // We only need to do the outlining if `DeviceID` is set to avoid calling
9678 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
9679 // generating the `else` branch of an `if` clause.
9680 //
9681 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
9682 // In this case, we execute the host implementation directly.
9683 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
9684 }());
9685
9686 OMPBuilder.Builder.restoreIP(AfterIP);
9687 return Error::success();
9688 };
9689
9690 auto &&EmitTargetCallElse =
9691 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9693 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
9694 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
9695 // produce any.
9697 if (RequiresOuterTargetTask) {
9698 // Arguments that are intended to be directly forwarded to an
9699 // emitKernelLaunch call are passed as nullptr, since
9700 // OutlinedFnID=nullptr results in that call not being done.
9702 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
9703 /*RTLoc=*/nullptr, AllocaIP,
9704 Dependencies, EmptyRTArgs, HasNoWait);
9705 }
9706 return EmitTargetCallFallbackCB(Builder.saveIP());
9707 }());
9708
9709 Builder.restoreIP(AfterIP);
9710 return Error::success();
9711 };
9712
9713 auto &&EmitTargetCallThen =
9714 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9716 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
9717 Info.HasNoWait = HasNoWait;
9718 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
9719
9721 if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
9722 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
9723 /*IsNonContiguous=*/true,
9724 /*ForEndCall=*/false))
9725 return Err;
9726
9727 SmallVector<Value *, 3> NumTeamsC;
9728 for (auto [DefaultVal, RuntimeVal] :
9729 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
9730 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
9731 : Builder.getInt32(DefaultVal));
9732
9733 // Calculate number of threads: 0 if no clauses specified, otherwise it is
9734 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
9735 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
9736 if (Clause)
9737 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
9738 /*isSigned=*/false);
9739 return Clause;
9740 };
9741 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
9742 if (Clause)
9743 Result =
9744 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
9745 Result, Clause)
9746 : Clause;
9747 };
9748
9749 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
9750 // the NUM_THREADS clause is overridden by THREAD_LIMIT.
9751 SmallVector<Value *, 3> NumThreadsC;
9752 Value *MaxThreadsClause =
9753 RuntimeAttrs.TeamsThreadLimit.size() == 1
9754 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
9755 : nullptr;
9756
9757 for (auto [TeamsVal, TargetVal] : zip_equal(
9758 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
9759 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
9760 Value *NumThreads = InitMaxThreadsClause(TargetVal);
9761
9762 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
9763 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
9764
9765 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
9766 }
9767
9768 unsigned NumTargetItems = Info.NumberOfPtrs;
9769 uint32_t SrcLocStrSize;
9770 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
9771 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
9772 llvm::omp::IdentFlag(0), 0);
9773
9774 Value *TripCount = RuntimeAttrs.LoopTripCount
9775 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
9776 Builder.getInt64Ty(),
9777 /*isSigned=*/false)
9778 : Builder.getInt64(0);
9779
9780 // Request zero groupprivate bytes by default.
9781 if (!DynCGroupMem)
9782 DynCGroupMem = Builder.getInt32(0);
9783
9785 NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem,
9786 HasNoWait, DynCGroupMemFallback);
9787
9788 // Assume no error was returned because TaskBodyCB and
9789 // EmitTargetCallFallbackCB don't produce any.
9791 // The presence of certain clauses on the target directive requires the
9792 // explicit generation of the target task.
9793 if (RequiresOuterTargetTask)
9794 return OMPBuilder.emitTargetTask(TaskBodyCB, RuntimeAttrs.DeviceID,
9795 RTLoc, AllocaIP, Dependencies,
9796 KArgs.RTArgs, Info.HasNoWait);
9797
9798 return OMPBuilder.emitKernelLaunch(
9799 Builder, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
9800 RuntimeAttrs.DeviceID, RTLoc, AllocaIP);
9801 }());
9802
9803 Builder.restoreIP(AfterIP);
9804 return Error::success();
9805 };
9806
9807 // If we don't have an ID for the target region, it means an offload entry
9808 // wasn't created. In this case we just run the host fallback directly and
9809 // ignore any potential 'if' clauses.
9810 if (!OutlinedFnID) {
9811 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP(), DeallocBlocks));
9812 return;
9813 }
9814
9815 // If there's no 'if' clause, only generate the kernel launch code path.
9816 if (!IfCond) {
9817 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP(), DeallocBlocks));
9818 return;
9819 }
9820
9821 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
9822 EmitTargetCallElse, AllocaIP));
9823}
9824
9826 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
9827 InsertPointTy CodeGenIP, ArrayRef<BasicBlock *> DeallocBlocks,
9828 TargetDataInfo &Info, TargetRegionEntryInfo &EntryInfo,
9829 const TargetKernelDefaultAttrs &DefaultAttrs,
9830 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
9831 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
9834 CustomMapperCallbackTy CustomMapperCB, const DependenciesInfo &Dependencies,
9835 bool HasNowait, Value *DynCGroupMem,
9836 OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
9837
9838 if (!updateToLocation(Loc))
9839 return InsertPointTy();
9840
9841 Builder.restoreIP(CodeGenIP);
9842
9843 Function *OutlinedFn;
9844 Constant *OutlinedFnID = nullptr;
9845 // The target region is outlined into its own function. The LLVM IR for
9846 // the target region itself is generated using the callbacks CBFunc
9847 // and ArgAccessorFuncCB
9849 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
9850 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
9851 return Err;
9852
9853 // If we are not on the target device, then we need to generate code
9854 // to make a remote call (offload) to the previously outlined function
9855 // that represents the target region. Do that now.
9856 if (!Config.isTargetDevice())
9857 emitTargetCall(*this, Builder, AllocaIP, DeallocBlocks, Info, DefaultAttrs,
9858 RuntimeAttrs, IfCond, OutlinedFn, OutlinedFnID, Inputs,
9859 GenMapInfoCB, CustomMapperCB, Dependencies, HasNowait,
9860 DynCGroupMem, DynCGroupMemFallback);
9861 return Builder.saveIP();
9862}
9863
9864std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
9865 StringRef FirstSeparator,
9866 StringRef Separator) {
9867 SmallString<128> Buffer;
9868 llvm::raw_svector_ostream OS(Buffer);
9869 StringRef Sep = FirstSeparator;
9870 for (StringRef Part : Parts) {
9871 OS << Sep << Part;
9872 Sep = Separator;
9873 }
9874 return OS.str().str();
9875}
9876
9877std::string
9879 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
9880 Config.separator());
9881}
9882
9884 Type *Ty, const StringRef &Name, std::optional<unsigned> AddressSpace) {
9885 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
9886 if (Elem.second) {
9887 assert(Elem.second->getValueType() == Ty &&
9888 "OMP internal variable has different type than requested");
9889 } else {
9890 // TODO: investigate the appropriate linkage type used for the global
9891 // variable for possibly changing that to internal or private, or maybe
9892 // create different versions of the function for different OMP internal
9893 // variables.
9894 const DataLayout &DL = M.getDataLayout();
9895 // TODO: Investigate why AMDGPU expects AS 0 for globals even though the
9896 // default global AS is 1.
9897 // See double-target-call-with-declare-target.f90 and
9898 // declare-target-vars-in-target-region.f90 libomptarget
9899 // tests.
9900 unsigned AddressSpaceVal = AddressSpace ? *AddressSpace
9901 : M.getTargetTriple().isAMDGPU()
9902 ? 0
9903 : DL.getDefaultGlobalsAddressSpace();
9904 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
9907 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
9908 Constant::getNullValue(Ty), Elem.first(),
9909 /*InsertBefore=*/nullptr,
9910 GlobalValue::NotThreadLocal, AddressSpaceVal);
9911 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
9912 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpaceVal);
9913 GV->setAlignment(std::max(TypeAlign, PtrAlign));
9914 Elem.second = GV;
9915 }
9916
9917 return Elem.second;
9918}
9919
9920Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
9921 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
9922 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
9923 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
9924}
9925
9927 LLVMContext &Ctx = Builder.getContext();
9928 Value *Null =
9929 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
9930 Value *SizeGep =
9931 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
9932 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
9933 return SizePtrToInt;
9934}
9935
9938 std::string VarName) {
9939 llvm::Constant *MaptypesArrayInit =
9940 llvm::ConstantDataArray::get(M.getContext(), Mappings);
9941 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
9942 M, MaptypesArrayInit->getType(),
9943 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
9944 VarName);
9945 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
9946 return MaptypesArrayGlobal;
9947}
9948
// NOTE(review): the first signature line (function name and the Loc
// parameter) is absent from this listing.
//
// Allocates the three argument arrays for a mapper call at AllocaIP and stores
// them into MapperAllocas:
//   - ".offload_baseptrs": NumOperands elements of Int8Ptr
//   - ".offload_ptrs":     NumOperands elements of Int8Ptr
//   - ".offload_sizes":    NumOperands elements of Int64
// Returns without emitting anything when updateToLocation(Loc) fails.
9950 InsertPointTy AllocaIP,
9951 unsigned NumOperands,
9952 struct MapperAllocas &MapperAllocas) {
9953 if (!updateToLocation(Loc))
9954 return;
9955
9956 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9957 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
// Move the builder to the alloca insertion point before creating the arrays.
9958 Builder.restoreIP(AllocaIP);
9959 AllocaInst *ArgsBase = Builder.CreateAlloca(
9960 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
9961 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
9962 ".offload_ptrs");
9963 AllocaInst *ArgSizes = Builder.CreateAlloca(
9964 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
// NOTE(review): listing line 9965 is missing at this point.
9966 MapperAllocas.ArgsBase = ArgsBase;
9967 MapperAllocas.Args = Args;
9968 MapperAllocas.ArgSizes = ArgSizes;
9969}
9970
// NOTE(review): two signature lines are absent from this listing (the one
// naming the function and Loc, and the one declaring MapperAllocas).
//
// Emits a call to MapperFunc with nine arguments, in this order: SrcLocInfo,
// DeviceID as i64, NumOperands as i32, pointers to element 0 of the three
// arrays held in MapperAllocas (ArgsBase, Args, ArgSizes), MaptypesArg,
// MapnamesArg, and a trailing null pointer.
// Returns without emitting anything when updateToLocation(Loc) fails.
9972 Function *MapperFunc, Value *SrcLocInfo,
9973 Value *MaptypesArg, Value *MapnamesArg,
9975 int64_t DeviceID, unsigned NumOperands) {
9976 if (!updateToLocation(Loc))
9977 return;
9978
// The array types are rebuilt here from NumOperands; they are only used as the
// GEP source element types below.
9979 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9980 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
// {0, 0} indexing yields the address of the first element of each array.
9981 Value *ArgsBaseGEP =
9982 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
9983 {Builder.getInt32(0), Builder.getInt32(0)});
9984 Value *ArgsGEP =
9985 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
9986 {Builder.getInt32(0), Builder.getInt32(0)});
9987 Value *ArgSizesGEP =
9988 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
9989 {Builder.getInt32(0), Builder.getInt32(0)});
// Null is passed as the last call argument.
9990 Value *NullPtr =
9991 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
9992 createRuntimeFunctionCall(MapperFunc, {SrcLocInfo, Builder.getInt64(DeviceID),
9993 Builder.getInt32(NumOperands),
9994 ArgsBaseGEP, ArgsGEP, ArgSizesGEP,
9995 MaptypesArg, MapnamesArg, NullPtr});
9996}
9997
// NOTE(review): the first signature line (function name and leading
// parameter) is absent from this listing; the body uses a builder named
// Builder.
//
// Fills RTArgs with the pointer arguments derived from the arrays recorded in
// Info.RTArgs:
//   - When Info.NumberOfPtrs is zero, all six RTArgs fields are set to null.
//   - Otherwise each of BasePointersArray, PointersArray, SizesArray and
//     MapTypesArray becomes a GEP to element 0 of the corresponding array.
//   - MapNamesArray is null unless Info.EmitDebug is set.
//   - MappersArray is null unless Info.HasMapper is set.
// ForEndCall selects Info.RTArgs.MapTypesArrayEnd for the map types when that
// array exists; the assert requires Info.separateBeginEndCalls() in that case.
9999 TargetDataRTArgs &RTArgs,
10000 TargetDataInfo &Info,
10001 bool ForEndCall) {
10002 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
10003 "expected region end call to runtime only when end call is separate");
// All pointer aliases below are the same unqualified pointer type; the
// separate names only document what each argument points to.
10004 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
10005 auto VoidPtrTy = UnqualPtrTy;
10006 auto VoidPtrPtrTy = UnqualPtrTy;
10007 auto Int64Ty = Type::getInt64Ty(M.getContext());
10008 auto Int64PtrTy = UnqualPtrTy;
10009
// Nothing is mapped: pass null for every array.
10010 if (!Info.NumberOfPtrs) {
10011 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
10012 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
10013 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
10014 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
10015 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
10016 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
10017 return;
10018 }
10019
10020 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
10021 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
10022 Info.RTArgs.BasePointersArray,
10023 /*Idx0=*/0, /*Idx1=*/0);
10024 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
10025 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
10026 /*Idx0=*/0,
10027 /*Idx1=*/0);
10028 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
10029 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
10030 /*Idx0=*/0, /*Idx1=*/0);
// For the end call, prefer the dedicated end-of-region map types array when
// one was created; otherwise fall back to the regular map types array.
10031 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
10032 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
10033 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
10034 : Info.RTArgs.MapTypesArray,
10035 /*Idx0=*/0,
10036 /*Idx1=*/0);
10037
10038 // Only emit the mapper information arrays if debug information is
10039 // requested.
10040 if (!Info.EmitDebug)
10041 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
10042 else
10043 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
10044 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
10045 /*Idx0=*/0,
10046 /*Idx1=*/0);
10047 // If there is no user-defined mapper, set the mapper array to nullptr to
10048 // avoid an unnecessary data privatization
10049 if (!Info.HasMapper)
10050 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
10051 else
10052 RTArgs.MappersArray =
10053 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
10054}
10055
// NOTE(review): several lines are absent from this listing: the first
// signature line (function name and AllocaIP), the declaration that binds
// NonContigInfo to CombinedInfo.NonContigInfo, and the declaration of DimTy
// (the "struct.descriptor_dim" type of three i64 fields).
//
// For every entry I of NonContigInfo.Dims whose dimension count is not 1,
// allocates an array of descriptor_dim records at AllocaIP, fills it at
// CodeGenIP from NonContigInfo.Offsets/Counts/Strides, and stores the array's
// address into slot I of Info.RTArgs.PointersArray.
10057 InsertPointTy CodeGenIP,
10058 MapInfosTy &CombinedInfo,
10059 TargetDataInfo &Info) {
10061 CombinedInfo.NonContigInfo;
10062
10063 // Build an array of struct descriptor_dim and then assign it to
10064 // offload_args.
10065 //
10066 // struct descriptor_dim {
10067 // uint64_t offset;
10068 // uint64_t count;
10069 // uint64_t stride
10070 // };
10071 Type *Int64Ty = Builder.getInt64Ty();
10073 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
10074 "struct.descriptor_dim");
10075
// Field indices of descriptor_dim, in declaration order.
10076 enum { OffsetFD = 0, CountFD, StrideFD };
10077 // We need two index variable here since the size of "Dims" is the same as
10078 // the size of Components, however, the size of offset, count, and stride is
10079 // equal to the size of base declaration that is non-contiguous.
// I indexes Dims (and the pointers array); L indexes Offsets/Counts/Strides
// and only advances for entries that are actually emitted.
10080 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
10081 // Skip emitting ir if dimension size is 1 since it cannot be
10082 // non-contiguous.
10083 if (NonContigInfo.Dims[I] == 1)
10084 continue;
10085 Builder.restoreIP(AllocaIP);
10086 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
10087 AllocaInst *DimsAddr =
10088 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
10089 Builder.restoreIP(CodeGenIP);
10090 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
// Record II of the emitted array is filled from source index EE - II - 1,
// i.e. the source vectors are read in reverse order.
10091 unsigned RevIdx = EE - II - 1;
10092 Value *DimsLVal = Builder.CreateInBoundsGEP(
10093 ArrayTy, DimsAddr, {Builder.getInt64(0), Builder.getInt64(II)});
10094 // Offset
// NOTE(review): the store alignments in this loop are computed from the type
// of the field address (OffsetLVal/CountLVal), not from the i64 value being
// stored — confirm that is intended.
10095 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
10096 Builder.CreateAlignedStore(
10097 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
10098 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
10099 // Count
10100 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
10101 Builder.CreateAlignedStore(
10102 NonContigInfo.Counts[L][RevIdx], CountLVal,
10103 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
10104 // Stride
// NOTE(review): the alignment below is taken from CountLVal rather than
// StrideLVal. This looks like a copy-paste slip; both are field addresses, so
// the resulting alignment is presumably the same — confirm and tidy up.
10105 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
10106 Builder.CreateAlignedStore(
10107 NonContigInfo.Strides[L][RevIdx], StrideLVal,
10108 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
10109 }
10110 // args[I] = &dims
10111 Builder.restoreIP(CodeGenIP);
10112 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
10113 DimsAddr, Builder.getPtrTy());
10114 Value *P = Builder.CreateConstInBoundsGEP2_32(
10115 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
10116 Info.RTArgs.PointersArray, 0, I);
10117 Builder.CreateAlignedStore(
10118 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
10119 ++L;
10120 }
10121}
10122
/// Emits the conditional "array init" (\p IsInit true) or "array delete"
/// (\p IsInit false) section of a user-defined mapper function \p MapperFn.
///
/// A conditional branch is emitted at the current insert point: when the
/// condition holds, control enters a new body block that pushes one component
/// covering the whole array through __tgt_push_mapper_component; otherwise it
/// goes to \p ExitBB. The builder is left positioned in the body block after
/// the runtime call.
///
/// Condition:
///   - init:   (Size > 1 || Base != Begin) && OMP_MAP_DELETE bit clear
///   - delete: (Size > 1)                  && OMP_MAP_DELETE bit set
///
/// \p Size is multiplied by \p ElementSize to form the byte size passed to the
/// runtime. The map type passed has OMP_MAP_TO and OMP_MAP_FROM cleared and
/// OMP_MAP_IMPLICIT set.
///
/// NOTE(review): two lines are absent from this listing: the declaration of
/// BodyBB (listing line 10130) and the head of the final runtime call (listing
/// line 10179).
10123void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
10124 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
10125 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
10126 BasicBlock *ExitBB, bool IsInit) {
10127 StringRef Prefix = IsInit ? ".init" : ".del";
10128
10129 // Evaluate if this is an array section.
10131 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
// Signed comparison: more than one element means this is an array section.
// The value name "omp.arrayinit.isarray" is used for both init and delete.
10132 Value *IsArray =
10133 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
10134 Value *DeleteBit = Builder.CreateAnd(
10135 MapType,
10136 Builder.getInt64(
10137 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10138 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
10139 Value *DeleteCond;
10140 Value *Cond;
10141 if (IsInit) {
10142 // base != begin?
// Despite its name, BaseIsBegin holds the result of Base != Begin.
10143 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
10144 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
10145 DeleteCond = Builder.CreateIsNull(
10146 DeleteBit,
10147 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
10148 } else {
10149 Cond = IsArray;
10150 DeleteCond = Builder.CreateIsNotNull(
10151 DeleteBit,
10152 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
10153 }
10154 Cond = Builder.CreateAnd(Cond, DeleteCond);
10155 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
10156
10157 emitBlock(BodyBB, MapperFn);
10158 // Get the array size by multiplying element size and element number (i.e., \p
10159 // Size).
10160 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
10161 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
10162 // memory allocation/deletion purpose only.
10163 Value *MapTypeArg = Builder.CreateAnd(
10164 MapType,
10165 Builder.getInt64(
10166 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10167 OpenMPOffloadMappingFlags::OMP_MAP_TO |
10168 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
// Additionally mark the pushed component as implicit.
10169 MapTypeArg = Builder.CreateOr(
10170 MapTypeArg,
10171 Builder.getInt64(
10172 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10173 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
10174
10175 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
10176 // data structure.
10177 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
10178 ArraySize, MapTypeArg, MapName};
10180 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
10181 OffloadingArgs);
10182}
10183
// NOTE(review): several lines are absent from this listing: the leading
// signature lines (function name, return type, head of the GenMapInfoCB
// parameter), the call that creates MapperFn (listing line 10204), and the
// head of the runtime call in the final else branch (listing line 10391).
//
// Builds a user-defined mapper function of type
//   void(ptr, ptr, ptr, i64, i64, ptr)
// whose arguments are read below as: mapper handle, base pointer, begin
// pointer, size in bytes, map type, and map name.
//
// Shape of the generated function:
//   entry            : convert the byte size to an element count, compute the
//                      end pointer, emit the conditional array-init section
//   omp.arraymap.head: skip the loop when begin == end
//   omp.arraymap.body: for each element, obtain the map infos from
//                      GenMapInfoCB and push every component (through a child
//                      mapper when CustomMapperCB returns one, otherwise
//                      through __tgt_push_mapper_component)
//   omp.arraymap.exit: emit the conditional array-delete section
//   omp.done         : return
//
// The builder's insert point is saved before generation and restored before
// the successful return of MapperFn. Errors from GenMapInfoCB or
// CustomMapperCB are returned to the caller.
10186 llvm::Value *BeginArg)>
10187 GenMapInfoCB,
10188 Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB,
10189 bool PreserveMemberOfFlags) {
10190 SmallVector<Type *> Params;
10191 Params.emplace_back(Builder.getPtrTy());
10192 Params.emplace_back(Builder.getPtrTy());
10193 Params.emplace_back(Builder.getPtrTy());
10194 Params.emplace_back(Builder.getInt64Ty());
10195 Params.emplace_back(Builder.getInt64Ty());
10196 Params.emplace_back(Builder.getPtrTy());
10197
10198 auto *FnTy =
10199 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
10200
// NOTE(review): TyStr and Out are not referenced again in the visible lines;
// they may be used by the missing listing line 10204 or be leftovers.
10201 SmallString<64> TyStr;
10202 raw_svector_ostream Out(TyStr);
10203 Function *MapperFn =
10205 MapperFn->addFnAttr(Attribute::NoInline);
10206 MapperFn->addFnAttr(Attribute::NoUnwind);
10207 MapperFn->addParamAttr(0, Attribute::NoUndef);
10208 MapperFn->addParamAttr(1, Attribute::NoUndef);
10209 MapperFn->addParamAttr(2, Attribute::NoUndef);
10210 MapperFn->addParamAttr(3, Attribute::NoUndef);
10211 MapperFn->addParamAttr(4, Attribute::NoUndef);
10212 MapperFn->addParamAttr(5, Attribute::NoUndef);
10213
10214 // Start the mapper function code generation.
10215 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
// Remember the caller's insert point; it is restored just before returning
// MapperFn at the end of this function.
10216 auto SavedIP = Builder.saveIP();
10217 Builder.SetInsertPoint(EntryBB);
10218
10219 Value *MapperHandle = MapperFn->getArg(0);
10220 Value *BaseIn = MapperFn->getArg(1);
10221 Value *BeginIn = MapperFn->getArg(2);
10222 Value *Size = MapperFn->getArg(3);
10223 Value *MapType = MapperFn->getArg(4);
10224 Value *MapName = MapperFn->getArg(5);
10225
10226 // Compute the starting and end addresses of array elements.
10227 // Prepare common arguments for array initiation and deletion.
10228 // Convert the size in bytes into the number of array elements.
10229 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
// From here on Size is an element count, not a byte count. The division is
// emitted as exact.
10230 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
10231 Value *PtrBegin = BeginIn;
10232 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
10233
10234 // Emit array initiation if this is an array section and \p MapType indicates
10235 // that memory allocation is required.
10236 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
10237 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
10238 MapType, MapName, ElementSize, HeadBB,
10239 /*IsInit=*/true);
10240
10241 // Emit a for loop to iterate through SizeArg of elements and map all of them.
10242
10243 // Emit the loop header block.
10244 emitBlock(HeadBB, MapperFn);
10245 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
10246 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
10247 // Evaluate whether the initial condition is satisfied.
10248 Value *IsEmpty =
10249 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
10250 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
10251
10252 // Emit the loop body block.
10253 emitBlock(BodyBB, MapperFn);
// LastBB tracks the block from which the loop back-edge will originate; it is
// updated to each component's end block inside the loop below.
10254 BasicBlock *LastBB = BodyBB;
10255 PHINode *PtrPHI =
10256 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
10257 PtrPHI->addIncoming(PtrBegin, HeadBB);
10258
10259 // Get map clause information. Fill up the arrays with all mapped variables.
10260 MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
// NOTE(review): this early return (and the one after CustomMapperCB below)
// leaves the builder inside the partially generated MapperFn; SavedIP is only
// restored on the success path.
10261 if (!Info)
10262 return Info.takeError();
10263
10264 // Call the runtime API __tgt_mapper_num_components to get the number of
10265 // pre-existing components.
10266 Value *OffloadingArgs[] = {MapperHandle};
10267 Value *PreviousSize = createRuntimeFunctionCall(
10268 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
10269 OffloadingArgs);
// Shift the component count into the position given by getFlagMemberOffset()
// so it can be added to a map type's MEMBER_OF field below.
10270 Value *ShiftedPreviousSize =
10271 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
10272
10273 // Fill up the runtime mapper handle for all components.
10274 for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
10275 Value *CurBaseArg = Info->BasePointers[I];
10276 Value *CurBeginArg = Info->Pointers[I];
10277 Value *CurSizeArg = Info->Sizes[I];
// A null name is passed when no names were provided at all.
10278 Value *CurNameArg = Info->Names.size()
10279 ? Info->Names[I]
10280 : Constant::getNullValue(Builder.getPtrTy());
10281
10282 // Extract the MEMBER_OF field from the map type.
10283 Value *OriMapType = Builder.getInt64(
10284 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10285 Info->Types[I]));
10286 Value *MemberMapType;
// With PreserveMemberOfFlags, only entries that already carry MEMBER_OF bits
// are offset by the number of pre-existing components; other entries keep
// their original map type. Without it, every entry is offset.
10287 if (PreserveMemberOfFlags) {
10288 constexpr uint64_t MemberOfMask =
10289 static_cast<uint64_t>(OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
10290 uint64_t OrigFlags =
10291 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10292 Info->Types[I]);
10293 bool HasMemberOf = (OrigFlags & MemberOfMask) != 0;
10294 if (HasMemberOf)
10295 MemberMapType = Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
10296 else
10297 MemberMapType = OriMapType;
10298 } else {
10299 MemberMapType = Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
10300 }
10301
10302 // Combine the map type inherited from user-defined mapper with that
10303 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
10304 // bits of the \a MapType, which is the input argument of the mapper
10305 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
10306 // bits of MemberMapType.
10307 // [OpenMP 5.0], 1.2.6. map-type decay.
10308 // | alloc | to | from | tofrom | release | delete
10309 // ----------------------------------------------------------
10310 // alloc | alloc | alloc | alloc | alloc | release | delete
10311 // to | alloc | to | alloc | to | release | delete
10312 // from | alloc | alloc | from | from | release | delete
10313 // tofrom | alloc | to | from | tofrom | release | delete
// LeftToFrom keeps only the TO/FROM bits of the incoming MapType; the chain of
// blocks below dispatches on whether it is 0 (alloc), TO only, FROM only, or
// anything else (tofrom).
10314 Value *LeftToFrom = Builder.CreateAnd(
10315 MapType,
10316 Builder.getInt64(
10317 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10318 OpenMPOffloadMappingFlags::OMP_MAP_TO |
10319 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10320 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
10321 BasicBlock *AllocElseBB =
10322 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
10323 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
10324 BasicBlock *ToElseBB =
10325 BasicBlock::Create(M.getContext(), "omp.type.to.else");
10326 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
10327 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
10328 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
10329 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
10330 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
10331 emitBlock(AllocBB, MapperFn);
10332 Value *AllocMapType = Builder.CreateAnd(
10333 MemberMapType,
10334 Builder.getInt64(
10335 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10336 OpenMPOffloadMappingFlags::OMP_MAP_TO |
10337 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10338 Builder.CreateBr(EndBB);
10339 emitBlock(AllocElseBB, MapperFn);
10340 Value *IsTo = Builder.CreateICmpEQ(
10341 LeftToFrom,
10342 Builder.getInt64(
10343 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10344 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
10345 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
10346 // In case of to, clear OMP_MAP_FROM.
10347 emitBlock(ToBB, MapperFn);
10348 Value *ToMapType = Builder.CreateAnd(
10349 MemberMapType,
10350 Builder.getInt64(
10351 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10352 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10353 Builder.CreateBr(EndBB);
10354 emitBlock(ToElseBB, MapperFn);
10355 Value *IsFrom = Builder.CreateICmpEQ(
10356 LeftToFrom,
10357 Builder.getInt64(
10358 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10359 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10360 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
10361 // In case of from, clear OMP_MAP_TO.
10362 emitBlock(FromBB, MapperFn);
10363 Value *FromMapType = Builder.CreateAnd(
10364 MemberMapType,
10365 Builder.getInt64(
10366 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10367 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
10368 // In case of tofrom, do nothing.
// No explicit branch is created for FromBB here; emitBlock(EndBB, ...) is
// relied on to add the fall-through branch from the unterminated FromBB.
10369 emitBlock(EndBB, MapperFn);
10370 LastBB = EndBB;
// Merge the four outcomes. The ToElseBB edge is the tofrom case, which keeps
// MemberMapType unchanged.
10371 PHINode *CurMapType =
10372 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
10373 CurMapType->addIncoming(AllocMapType, AllocBB);
10374 CurMapType->addIncoming(ToMapType, ToBB);
10375 CurMapType->addIncoming(FromMapType, FromBB);
10376 CurMapType->addIncoming(MemberMapType, ToElseBB);
10377
10378 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
10379 CurSizeArg, CurMapType, CurNameArg};
10380
// Ask the client whether component I has its own mapper function.
10381 auto ChildMapperFn = CustomMapperCB(I);
10382 if (!ChildMapperFn)
10383 return ChildMapperFn.takeError();
10384 if (*ChildMapperFn) {
10385 // Call the corresponding mapper function.
10386 createRuntimeFunctionCall(*ChildMapperFn, OffloadingArgs)
10387 ->setDoesNotThrow();
10388 } else {
10389 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
10390 // data structure.
10392 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
10393 OffloadingArgs);
10394 }
10395 }
10396
10397 // Update the pointer to point to the next element that needs to be mapped,
10398 // and check whether we have mapped all elements.
10399 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
10400 "omp.arraymap.next");
// The back-edge comes from LastBB: BodyBB itself when there were no
// components, otherwise the last component's "omp.type.end" block.
10401 PtrPHI->addIncoming(PtrNext, LastBB);
10402 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
10403 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
10404 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
10405
10406 emitBlock(ExitBB, MapperFn);
10407 // Emit array deletion if this is an array section and \p MapType indicates
10408 // that deletion is required.
10409 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
10410 MapType, MapName, ElementSize, DoneBB,
10411 /*IsInit=*/false);
10412
10413 // Emit the function exit block.
10414 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
10415
10416 Builder.CreateRetVoid();
10417 Builder.restoreIP(SavedIP);
10418 return MapperFn;
10419}
10420
// NOTE(review): several lines are absent from this listing: the first
// signature line (function name and return type), the declaration of the
// Mapping container (listing line 10519), and the right-hand side of the
// MapNamesArray assignment in the "no names" branch (listing line 10536).
//
// Materializes the offloading argument arrays described by CombinedInfo and
// records them in Info.RTArgs:
//   - base pointers, pointers and mappers: stack arrays created at AllocaIP
//     and filled element by element at CodeGenIP;
//   - sizes: a constant global when every size is a plain constant, a stack
//     array when every size needs runtime evaluation, otherwise a stack array
//     initialised by memcpy from the constant global and patched with the
//     runtime values;
//   - map types: a constant global, plus a second one with OMP_MAP_PRESENT
//     cleared for the region end when begin/end calls are separate and any
//     entry had that bit;
//   - map names: a global when CombinedInfo.Names is non-empty.
// Returns Error::success() unless CustomMapperCB reports an error.
10422 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
10423 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
10424 bool IsNonContiguous,
10425 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
10426
10427 // Reset the array information.
10428 Info.clearArrayInfo();
10429 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
10430
// Nothing to map: leave the arrays cleared.
10431 if (Info.NumberOfPtrs == 0)
10432 return Error::success();
10433
10434 Builder.restoreIP(AllocaIP);
10435 // Detect if we have any capture size requiring runtime evaluation of the
10436 // size so that a constant array could be eventually used.
10437 ArrayType *PointerArrayType =
10438 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
10439
10440 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
10441 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
10442
10443 Info.RTArgs.PointersArray = Builder.CreateAlloca(
10444 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
10445 AllocaInst *MappersArray = Builder.CreateAlloca(
10446 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
10447 Info.RTArgs.MappersArray = MappersArray;
10448
10449 // If we don't have any VLA types or other types that require runtime
10450 // evaluation, we can use a constant array for the map sizes, otherwise we
10451 // need to fill up the arrays as we do for the pointers.
10452 Type *Int64Ty = Builder.getInt64Ty();
// ConstSizes holds the compile-time size of each entry (0 as placeholder);
// RuntimeSizes marks the entries whose size must be stored at run time.
10453 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
10454 ConstantInt::get(Int64Ty, 0));
10455 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
10456 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
10457 bool IsNonContigEntry =
10458 IsNonContiguous &&
10459 (static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10460 CombinedInfo.Types[I] &
10461 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG) != 0);
10462 // For NON_CONTIG entries, ArgSizes stores the dimension count (number of
10463 // descriptor_dim records), not the byte size.
10464 if (IsNonContigEntry) {
10465 assert(I < CombinedInfo.NonContigInfo.Dims.size() &&
10466 "Index must be in-bounds for NON_CONTIG Dims array");
10467 const uint64_t DimCount = CombinedInfo.NonContigInfo.Dims[I];
10468 assert(DimCount > 0 && "NON_CONTIG DimCount must be > 0");
10469 ConstSizes[I] = ConstantInt::get(Int64Ty, DimCount);
10470 continue;
10471 }
// Only plain constants are folded into the constant array; constant
// expressions and global values are treated as runtime sizes.
10472 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
10473 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
10474 ConstSizes[I] = CI;
10475 continue;
10476 }
10477 }
10478 RuntimeSizes.set(I);
10479 }
10480
// Case 1: every size is a runtime value, so only a stack array is needed.
10481 if (RuntimeSizes.all()) {
10482 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
10483 Info.RTArgs.SizesArray = Builder.CreateAlloca(
10484 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
10485 restoreIPandDebugLoc(Builder, CodeGenIP);
10486 } else {
10487 auto *SizesArrayInit = ConstantArray::get(
10488 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
10489 std::string Name = createPlatformSpecificName({"offload_sizes"});
10490 auto *SizesArrayGbl =
10491 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
10492 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
10493 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
10494
// Case 2: no runtime sizes at all, so the constant global is used directly.
10495 if (!RuntimeSizes.any()) {
10496 Info.RTArgs.SizesArray = SizesArrayGbl;
10497 } else {
// Case 3: mixed. Copy the constants into a stack buffer; the runtime entries
// are overwritten in the per-pointer loop further down.
10498 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
10499 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
10500 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
10501 AllocaInst *Buffer = Builder.CreateAlloca(
10502 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
10503 Buffer->setAlignment(OffloadSizeAlign);
10504 restoreIPandDebugLoc(Builder, CodeGenIP);
// NOTE(review): the destination alignment passed to the memcpy is the
// preferred alignment of Buffer's own (pointer) type, while the alloca was
// given OffloadSizeAlign just above — confirm which one is intended.
10505 Builder.CreateMemCpy(
10506 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
10507 SizesArrayGbl, OffloadSizeAlign,
10508 Builder.getIntN(
10509 IndexSize,
10510 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
10511
10512 Info.RTArgs.SizesArray = Buffer;
10513 }
10514 restoreIPandDebugLoc(Builder, CodeGenIP);
10515 }
10516
10517 // The map types are always constant so we don't need to generate code to
10518 // fill arrays. Instead, we create an array constant.
10520 for (auto mapFlag : CombinedInfo.Types)
10521 Mapping.push_back(
10522 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10523 mapFlag));
10524 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
10525 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
10526 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
10527
10528 // The information types are only built if provided.
10529 if (!CombinedInfo.Names.empty()) {
10530 auto *MapNamesArrayGbl = createOffloadMapnames(
10531 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
10532 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
10533 Info.EmitDebug = true;
10534 } else {
10535 Info.RTArgs.MapNamesArray =
10537 Info.EmitDebug = false;
10538 }
10539
10540 // If there's a present map type modifier, it must not be applied to the end
10541 // of a region, so generate a separate map type array in that case.
10542 if (Info.separateBeginEndCalls()) {
10543 bool EndMapTypesDiffer = false;
// Mapping is edited in place: OMP_MAP_PRESENT is stripped from each entry that
// has it. The begin-region global was already created from the unmodified
// values above.
10544 for (uint64_t &Type : Mapping) {
10545 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10546 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
10547 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10548 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
10549 EndMapTypesDiffer = true;
10550 }
10551 }
10552 if (EndMapTypesDiffer) {
10553 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
10554 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
10555 }
10556 }
10557
// Fill the base pointer, pointer, runtime size and mapper slots for each
// mapped entry.
10558 PointerType *PtrTy = Builder.getPtrTy();
10559 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
10560 Value *BPVal = CombinedInfo.BasePointers[I];
10561 Value *BP = Builder.CreateConstInBoundsGEP2_32(
10562 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
10563 0, I);
10564 Builder.CreateAlignedStore(BPVal, BP,
10565 M.getDataLayout().getPrefTypeAlign(PtrTy));
10566
10567 if (Info.requiresDevicePointerInfo()) {
// For DeviceInfoTy::Pointer a fresh pointer-sized alloca is created at
// AllocaIP and recorded with the base-pointer slot; for
// DeviceInfoTy::Address the slot itself is recorded for both members.
// DeviceAddrCB, when provided, is notified with the recorded second value.
10568 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
10569 CodeGenIP = Builder.saveIP();
10570 Builder.restoreIP(AllocaIP);
10571 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
10572 Builder.restoreIP(CodeGenIP);
10573 if (DeviceAddrCB)
10574 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
10575 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
10576 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
10577 if (DeviceAddrCB)
10578 DeviceAddrCB(I, BP);
10579 }
10580 }
10581
10582 Value *PVal = CombinedInfo.Pointers[I];
10583 Value *P = Builder.CreateConstInBoundsGEP2_32(
10584 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
10585 I);
10586 // TODO: Check alignment correct.
10587 Builder.CreateAlignedStore(PVal, P,
10588 M.getDataLayout().getPrefTypeAlign(PtrTy));
10589
10590 if (RuntimeSizes.test(I)) {
10591 Value *S = Builder.CreateConstInBoundsGEP2_32(
10592 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
10593 /*Idx0=*/0,
10594 /*Idx1=*/I);
// The size is cast to i64 as a signed value before being stored.
// NOTE(review): the store alignment is derived from PtrTy although the stored
// value is an i64 — confirm this is intended.
10595 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
10596 Int64Ty,
10597 /*isSigned=*/true),
10598 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
10599 }
10600 // Fill up the mapper array.
10601 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
// Default to a null mapper; replaced when CustomMapperCB supplies one.
10602 Value *MFunc = ConstantPointerNull::get(PtrTy);
10603
10604 auto CustomMFunc = CustomMapperCB(I);
10605 if (!CustomMFunc)
10606 return CustomMFunc.takeError();
10607 if (*CustomMFunc)
10608 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
10609
10610 Value *MAddr = Builder.CreateInBoundsGEP(
10611 PointerArrayType, MappersArray,
10612 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
10613 Builder.CreateAlignedStore(
10614 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
10615 }
10616
// NOTE(review): the NumberOfPtrs == 0 test below cannot be true here because
// of the early return near the top of the function.
10617 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
10618 Info.NumberOfPtrs == 0)
10619 return Error::success();
10620 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
10621 return Error::success();
10622}
10623
// NOTE(review): the signature line is absent from this listing; the body uses
// a block named Target.
//
// Terminates the current insert block with an unconditional branch to Target,
// unless there is no current block or it already has a terminator. In every
// case the builder's insertion point is cleared afterwards.
10625 BasicBlock *CurBB = Builder.GetInsertBlock();
10626
10627 if (!CurBB || CurBB->hasTerminator()) {
10628 // If there is no insert point or the previous block is already
10629 // terminated, don't touch it.
10630 } else {
10631 // Otherwise, create a fall-through branch.
10632 Builder.CreateBr(Target);
10633 }
10634
10635 Builder.ClearInsertionPoint();
10636}
10637
// NOTE(review): the first signature line is absent from this listing; the
// body uses a block named BB and a function named CurFn.
//
// Ends the current block with a fall-through branch to BB (via emitBranch),
// inserts BB into CurFn and makes it the builder's insert block. When
// IsFinished is set and BB has no uses after the fall-through branch was
// considered, BB is erased instead of being inserted.
10639 bool IsFinished) {
// Captured before emitBranch, which clears the builder's insertion point.
10640 BasicBlock *CurBB = Builder.GetInsertBlock();
10641
10642 // Fall out of the current block (if necessary).
10643 emitBranch(BB);
10644
// NOTE(review): BB has not been inserted into CurFn by this function at this
// point; confirm eraseFromParent() is valid for a block without a parent.
10645 if (IsFinished && BB->use_empty()) {
10646 BB->eraseFromParent();
10647 return;
10648 }
10649
10650 // Place the block after the current block, if possible, or else at
10651 // the end of the function.
10652 if (CurBB && CurBB->getParent())
10653 CurFn->insert(std::next(CurBB->getIterator()), BB);
10654 else
10655 CurFn->insert(CurFn->end(), BB);
10656 Builder.SetInsertPoint(BB);
10657}
10658
// NOTE(review): the first signature line is absent from this listing; the
// body uses a condition value named Cond and a callback named ThenGen.
//
// Emits an if/else on Cond:
//   - When Cond is a ConstantInt, only the selected callback runs (ThenGen for
//     a non-zero value, ElseGen for zero) and no branch is emitted.
//   - Otherwise blocks "omp_if.then", "omp_if.else" and "omp_if.end" are
//     created, ThenGen and ElseGen populate the two arms, and both arms branch
//     to the end block.
// Each callback receives AllocaIP, the current insert point and DeallocBlocks.
// An error from either callback is returned immediately.
10660 BodyGenCallbackTy ElseGen,
10661 InsertPointTy AllocaIP,
10662 ArrayRef<BasicBlock *> DeallocBlocks) {
10663 // If the condition constant folds and can be elided, try to avoid emitting
10664 // the condition and the dead arm of the if/else.
10665 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
10666 auto CondConstant = CI->getSExtValue();
10667 if (CondConstant)
10668 return ThenGen(AllocaIP, Builder.saveIP(), DeallocBlocks);
10669
10670 return ElseGen(AllocaIP, Builder.saveIP(), DeallocBlocks);
10671 }
10672
10673 Function *CurFn = Builder.GetInsertBlock()->getParent();
10674
10675 // Otherwise, the condition did not fold, or we couldn't elide it. Just
10676 // emit the conditional branch.
10677 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
10678 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
10679 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
10680 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
10681 // Emit the 'then' code.
10682 emitBlock(ThenBlock, CurFn);
10683 if (Error Err = ThenGen(AllocaIP, Builder.saveIP(), DeallocBlocks))
10684 return Err;
10685 emitBranch(ContBlock);
10686 // Emit the 'else' code if present.
10687 // There is no need to emit line number for unconditional branch.
10688 emitBlock(ElseBlock, CurFn);
10689 if (Error Err = ElseGen(AllocaIP, Builder.saveIP(), DeallocBlocks))
10690 return Err;
10691 // There is no need to emit line number for unconditional branch.
10692 emitBranch(ContBlock);
10693 // Emit the continuation block for code after the if.
10694 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
10695 return Error::success();
10696}
10697
/// Decides whether a flush is needed after an atomic operation of kind \p AK
/// performed with ordering \p AO, emits it through emitFlush(\p Loc) if so,
/// and returns whether a flush was emitted.
///
/// A flush ordering (FlushAO) is chosen per case but is currently unused:
/// emitFlush is called with \p Loc only (see the TODO in the body).
///
/// NOTE(review): many lines are absent from this listing: the start of the
/// assert on \p AO, the declaration of FlushAO, the conditions guarding the
/// Read and Write/Compare/Update cases, and the case labels inside the
/// Capture switch. The exact orderings that trigger a flush cannot be read
/// from the visible lines.
10698bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
10699 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
10702 "Unexpected Atomic Ordering.");
10703
10704 bool Flush = false;
10706
10707 switch (AK) {
// Read: when the (missing) condition holds, an Acquire flush is selected.
10708 case Read:
10711 FlushAO = AtomicOrdering::Acquire;
10712 Flush = true;
10713 }
10714 break;
// Write/Compare/Update: when the (missing) condition holds, a Release flush is
// selected.
10715 case Write:
10716 case Compare:
10717 case Update:
10720 FlushAO = AtomicOrdering::Release;
10721 Flush = true;
10722 }
10723 break;
// Capture: the flush ordering depends on AO; the case labels are missing from
// this listing.
10724 case Capture:
10725 switch (AO) {
10727 FlushAO = AtomicOrdering::Acquire;
10728 Flush = true;
10729 break;
10731 FlushAO = AtomicOrdering::Release;
10732 Flush = true;
10733 break;
10737 Flush = true;
10738 break;
10739 default:
10740 // do nothing - leave silently.
10741 break;
10742 }
10743 }
10744
10745 if (Flush) {
10746 // Currently Flush RT call still doesn't take memory_ordering, so for when
10747 // that happens, this tries to do the resolution of which atomic ordering
10748 // to use with but issue the flush call
10749 // TODO: pass `FlushAO` after memory ordering support is added
10750 (void)FlushAO;
10751 emitFlush(Loc);
10752 }
10753
10754 // for AO == AtomicOrdering::Monotonic and all other case combinations
10755 // do nothing
10756 return Flush;
10757}
10758
10762 AtomicOrdering AO, InsertPointTy AllocaIP) {
// Atomic read: load the value behind X.Var atomically with ordering AO and
// store it to V.Var. (The first lines of the signature are missing from this
// listing; presumably this is OpenMPIRBuilder::createAtomicRead -- confirm.)
//
// Three lowering strategies are used, selected by X.ElemTy:
//  - integer: a plain atomic load;
//  - struct:  an atomic-load libcall emitted through AtomicInfo;
//  - other:   an atomic load of an integer with X.ElemTy's scalar bit width,
//             cast back with bitcast (floating point) or inttoptr.
//
// Returns the insertion point after the emitted code, or Loc.IP when
// updateToLocation(Loc) fails.
10763 if (!updateToLocation(Loc))
10764 return Loc.IP;
10765
10766 assert(X.Var->getType()->isPointerTy() &&
10767 "OMP Atomic expects a pointer to target memory");
10768 Type *XElemTy = X.ElemTy;
// NOTE(review): the condition also admits struct types although the message
// only mentions scalars.
10769 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10770 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10771 "OMP atomic read expected a scalar type");
10772
10773 Value *XRead = nullptr;
10774
10775 if (XElemTy->isIntegerTy()) {
10776 LoadInst *XLD =
10777 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
10778 XLD->setAtomic(AO);
10779 XRead = cast<Value>(XLD);
10780 } else if (XElemTy->isStructTy()) {
10781 // FIXME: Add checks to ensure __atomic_load is emitted iff the
10782 // target does not support `atomicrmw` of the size of the struct
// OldVal is a scratch load: it only supplies the alignment and the module's
// DataLayout for AtomicInfo and is erased once the libcall has been emitted.
10783 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10784 OldVal->setAtomic(AO);
10785 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10786 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10787 OpenMPIRBuilder::AtomicInfo atomicInfo(
10788 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10789 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10790 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
10791 XRead = AtomicLoadRes.first;
10792 OldVal->eraseFromParent();
10793 } else {
10794 // We need to perform atomic op as integer
10795 IntegerType *IntCastTy =
10796 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10797 LoadInst *XLoad =
10798 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
10799 XLoad->setAtomic(AO);
10800 if (XElemTy->isFloatingPointTy()) {
10801 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
10802 } else {
10803 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
10804 }
10805 }
// The flush (if any) is emitted before the value is stored to V.
10806 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
10807 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
10808 return Builder.saveIP();
10809}
10810
10813 AtomicOpValue &X, Value *Expr,
10814 AtomicOrdering AO, InsertPointTy AllocaIP) {
// Atomic write: store Expr to the location X.Var atomically with ordering AO.
// (The first lines of the signature are missing from this listing; presumably
// this is OpenMPIRBuilder::createAtomicWrite -- confirm.)
//
// Lowering by X.ElemTy:
//  - integer: a plain atomic store of Expr;
//  - struct:  an atomic-store libcall emitted through AtomicInfo;
//  - other:   Expr is bitcast to an integer with X.ElemTy's scalar bit width
//             and stored atomically.
//
// Returns the insertion point after the emitted code, or Loc.IP when
// updateToLocation(Loc) fails.
10815 if (!updateToLocation(Loc))
10816 return Loc.IP;
10817
10818 assert(X.Var->getType()->isPointerTy() &&
10819 "OMP Atomic expects a pointer to target memory");
10820 Type *XElemTy = X.ElemTy;
// NOTE(review): the condition also admits struct types although the message
// only mentions scalars.
10821 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10822 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10823 "OMP atomic write expected a scalar type");
10824
10825 if (XElemTy->isIntegerTy()) {
10826 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
10827 XSt->setAtomic(AO);
10828 } else if (XElemTy->isStructTy()) {
// OldVal is a scratch load: it only supplies the alignment and the module's
// DataLayout for AtomicInfo and is erased once the libcall has been emitted.
10829 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10830 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10831 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10832 OpenMPIRBuilder::AtomicInfo atomicInfo(
10833 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10834 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10835 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
10836 OldVal->eraseFromParent();
10837 } else {
10838 // We need to bitcast and perform atomic op as integers
// NOTE(review): the assert above lets pointer element types reach this
// branch, yet only a bitcast is emitted here, whereas the atomic-read
// counterpart distinguishes floating point (bitcast) from pointer (inttoptr).
// Confirm whether pointer-typed Expr is expected to work on this path.
10839 IntegerType *IntCastTy =
10840 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10841 Value *ExprCast =
10842 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
10843 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
10844 XSt->setAtomic(AO);
10845 }
10846
10847 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
10848 return Builder.saveIP();
10849}
10850
10853 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
10854 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
10855 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
// Atomic update: apply RMWOp / UpdateOp to the location X.Var with ordering
// AO. (The first lines of the signature are missing from this listing;
// presumably this is OpenMPIRBuilder::createAtomicUpdate -- confirm.)
//
// The actual lowering is delegated to emitAtomicUpdate; this wrapper only
// validates its inputs, forwards the arguments, propagates an error from the
// update and emits the trailing flush when required.
//
// Returns the insertion point after the emitted code, Loc.IP when
// updateToLocation(Loc) fails, or the error reported by emitAtomicUpdate.
10856 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
10857 if (!updateToLocation(Loc))
10858 return Loc.IP;
10859
// The sanity checks below are wrapped in LLVM_DEBUG.
10860 LLVM_DEBUG({
10861 Type *XTy = X.Var->getType();
10862 assert(XTy->isPointerTy() &&
10863 "OMP Atomic expects a pointer to target memory");
10864 Type *XElemTy = X.ElemTy;
10865 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10866 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10867 "OMP atomic update expected a scalar or struct type");
10868 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10869 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
10870 "OpenMP atomic does not support LT or GT operations");
10871 });
10872
10873 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10874 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
10875 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10876 if (!AtomicResult)
10877 return AtomicResult.takeError();
10878 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
10879 return Builder.saveIP();
10880}
10881
10882// FIXME: Duplicating AtomicExpand
/// Emit the ordinary (non-atomic) instruction that computes the result of
/// applying \p RMWOp to \p Src1 and \p Src2.
///
/// emitAtomicUpdate uses this to materialize the updated value next to an
/// atomicrmw, which itself only yields the old value.
///
/// \param Src1  Left operand (the value previously held by the atomic location).
/// \param Src2  Right operand (the update expression).
/// \param RMWOp Operation to mirror; operations without a plain-instruction
///              equivalent here reach llvm_unreachable.
/// \returns The value produced by the emitted instruction.
10883Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
10884 AtomicRMWInst::BinOp RMWOp) {
10885 switch (RMWOp) {
10886 case AtomicRMWInst::Add:
10887 return Builder.CreateAdd(Src1, Src2);
10888 case AtomicRMWInst::Sub:
10889 return Builder.CreateSub(Src1, Src2);
10890 case AtomicRMWInst::And:
10891 return Builder.CreateAnd(Src1, Src2);
// The case label for the next return is not present in this listing
// (presumably AtomicRMWInst::Nand -- confirm). atomicrmw nand computes
// ~(a & b), i.e. a bitwise complement, so the And result has to go through
// CreateNot; CreateNeg would emit the arithmetic negation 0 - (a & b).
10893 return Builder.CreateNot(Builder.CreateAnd(Src1, Src2));
10894 case AtomicRMWInst::Or:
10895 return Builder.CreateOr(Src1, Src2);
10896 case AtomicRMWInst::Xor:
10897 return Builder.CreateXor(Src1, Src2);
10902 case AtomicRMWInst::Max:
10903 case AtomicRMWInst::Min:
10916 llvm_unreachable("Unsupported atomic update operation");
10917 }
10918 llvm_unreachable("Unsupported atomic update operation");
10919}
10920
/// Emit the IR that atomically updates the location \p X (of element type
/// \p XElemTy) and return the pair (value observed before the update, updated
/// value).
///
/// Three lowering strategies are used:
///  - a single atomicrmw when \p RMWOp is one of the directly supported
///    operations and \p XElemTy is an integer type;
///  - for struct types, a retry loop built from the atomic load and
///    compare-exchange libcalls provided by AtomicInfo;
///  - otherwise, a retry loop around an integer cmpxchg, with the element
///    value cast to and from an integer of the same scalar bit width.
/// In the two loop forms the new value is produced by invoking \p UpdateOp on
/// the current value; an error returned by \p UpdateOp is propagated.
///
/// NOTE(review): several lines of this function (one parameter line, some case
/// labels, and the definitions of `Failure` and `ExitTI`) are missing from
/// this listing.
10921Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
10922 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
10924 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
10925 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10926 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2.
10927 bool emitRMWOp = false;
10928 switch (RMWOp) {
10929 case AtomicRMWInst::Add:
10930 case AtomicRMWInst::And:
10932 case AtomicRMWInst::Or:
10933 case AtomicRMWInst::Xor:
// XElemTy is converted to bool here: true whenever it is non-null.
10935 emitRMWOp = XElemTy;
10936 break;
10937 case AtomicRMWInst::Sub:
10938 emitRMWOp = (IsXBinopExpr && XElemTy);
10939 break;
10940 default:
10941 emitRMWOp = false;
10942 }
// Only integer element types use the direct atomicrmw form.
10943 emitRMWOp &= XElemTy->isIntegerTy();
10944
10945 std::pair<Value *, Value *> Res;
10946 if (emitRMWOp) {
10947 AtomicRMWInst *RMWInst =
10948 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
10949 if (T.isAMDGPU()) {
// Attach AMDGPU metadata derived from the caller-provided flags.
10950 if (IsIgnoreDenormalMode)
10951 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
10952 llvm::MDNode::get(Builder.getContext(), {}));
10953 if (!IsFineGrainedMemory)
10954 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
10955 llvm::MDNode::get(Builder.getContext(), {}));
10956 if (!IsRemoteMemory)
10957 RMWInst->setMetadata("amdgpu.no.remote.memory",
10958 llvm::MDNode::get(Builder.getContext(), {}));
10959 }
10960 Res.first = RMWInst;
10961 // The updated value is only needed for non-postfix captures. Generate it
10962 // anyway for consistency with the else part; any DCE pass removes it if
10963 // unused. AtomicRMWInst::Xchg does not have a corresponding instruction.
10964 if (RMWOp == AtomicRMWInst::Xchg)
10965 Res.second = Res.first;
10966 else
10967 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
10968 } else if (XElemTy->isStructTy()) {
// OldVal is a scratch load: it supplies alignment, type and DataLayout for
// AtomicInfo and the PHI, and is erased further down.
10969 LoadInst *OldVal =
10970 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
10971 OldVal->setAtomic(AO);
10972 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
10973 unsigned LoadSize = LoadDL.getTypeStoreSize(XElemTy);
10974
10975 OpenMPIRBuilder::AtomicInfo atomicInfo(
10976 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10977 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
10978 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
// Split the current block into CurBB -> ContBB (retry loop) -> ExitBB. If the
// block has no terminator yet, a temporary unreachable is created so that it
// can be split; it is removed again at the end of this branch.
10979 BasicBlock *CurBB = Builder.GetInsertBlock();
10980 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
10981 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10982 BasicBlock *ExitBB =
10983 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
10984 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
10985 X->getName() + ".atomic.cont");
10986 ContBB->getTerminator()->eraseFromParent();
// The temporary holding the updated value is allocated at AllocaIP.
10987 Builder.restoreIP(AllocaIP);
10988 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
10989 NewAtomicAddr->setName(X->getName() + "x.new.val");
10990 Builder.SetInsertPoint(ContBB);
// The PHI carries the current value of X: the initial atomic load on entry,
// the value reloaded after a compare-exchange attempt on the back edge.
10991 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
10992 PHI->addIncoming(AtomicLoadRes.first, CurBB);
10993 Value *OldExprVal = PHI;
10994 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
10995 if (!CBResult)
10996 return CBResult.takeError();
10997 Value *Upd = *CBResult;
10998 Builder.CreateStore(Upd, NewAtomicAddr);
11001 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
11002 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
11003 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
11004 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
// Leave the loop on success, retry otherwise.
11005 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
11006 OldVal->eraseFromParent();
11007 Res.first = OldExprVal;
11008 Res.second = Upd;
11009
// Continue emitting in the exit block, dropping the temporary terminator if
// one was created above.
11010 if (UnreachableInst *ExitTI =
11012 CurBBTI->eraseFromParent();
11013 Builder.SetInsertPoint(ExitBB);
11014 } else {
11015 Builder.SetInsertPoint(ExitTI);
11016 }
11017 } else {
// Generic path: operate on an integer of the element's scalar bit width.
11018 IntegerType *IntCastTy =
11019 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
11020 LoadInst *OldVal =
11021 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
11022 OldVal->setAtomic(AO);
11023 // CurBB
11024 // | /---\
11025 // ContBB |
11026 // | \---/
11027 // ExitBB
11028 BasicBlock *CurBB = Builder.GetInsertBlock();
11029 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
11030 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
11031 BasicBlock *ExitBB =
11032 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
11033 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
11034 X->getName() + ".atomic.cont");
11035 ContBB->getTerminator()->eraseFromParent();
11036 Builder.restoreIP(AllocaIP);
11037 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
11038 NewAtomicAddr->setName(X->getName() + "x.new.val");
11039 Builder.SetInsertPoint(ContBB);
11040 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
11041 PHI->addIncoming(OldVal, CurBB);
11042 bool IsIntTy = XElemTy->isIntegerTy();
11043 Value *OldExprVal = PHI;
// Hand the callback a value of the element type rather than the raw integer.
11044 if (!IsIntTy) {
11045 if (XElemTy->isFloatingPointTy()) {
11046 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
11047 X->getName() + ".atomic.fltCast");
11048 } else {
11049 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
11050 X->getName() + ".atomic.ptrCast");
11051 }
11052 }
11053
11054 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
11055 if (!CBResult)
11056 return CBResult.takeError();
11057 Value *Upd = *CBResult;
// Round-trip the updated value through the temporary so that it can be
// reloaded as an integer for the cmpxchg.
11058 Builder.CreateStore(Upd, NewAtomicAddr);
11059 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
11062 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
11063 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
11064 Result->setVolatile(VolatileX);
11065 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
11066 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
11067 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
11068 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
11069
11070 Res.first = OldExprVal;
11071 Res.second = Upd;
11072
11073 // Set the insertion point in the exit block.
11074 if (UnreachableInst *ExitTI =
11076 CurBBTI->eraseFromParent();
11077 Builder.SetInsertPoint(ExitBB);
11078 } else {
11079 Builder.SetInsertPoint(ExitTI);
11080 }
11081 }
11082
11083 return Res;
11084}
11085
11088 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
11089 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
11090 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
11091 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
// Atomic capture: atomically update X and store either its previous or its
// updated value to V.Var. (The first lines of the signature are missing from
// this listing; presumably this is OpenMPIRBuilder::createAtomicCapture --
// confirm.)
//
// The update itself is delegated to emitAtomicUpdate, which yields the pair
// (old value, updated value); IsPostfixUpdate selects the old value,
// otherwise the updated one is captured.
//
// Returns the insertion point after the emitted code, Loc.IP when
// updateToLocation(Loc) fails, or the error reported by emitAtomicUpdate.
11092 if (!updateToLocation(Loc))
11093 return Loc.IP;
11094
// The sanity checks below are wrapped in LLVM_DEBUG.
11095 LLVM_DEBUG({
11096 Type *XTy = X.Var->getType();
11097 assert(XTy->isPointerTy() &&
11098 "OMP Atomic expects a pointer to target memory");
11099 Type *XElemTy = X.ElemTy;
11100 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
11101 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
11102 "OMP atomic capture expected a scalar or struct type");
11103 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
11104 "OpenMP atomic does not support LT or GT operations");
11105 });
11106
11107 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
11108 // 'x' is simply atomically rewritten with 'expr'.
11109 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
11110 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
11111 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
11112 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
11113 if (!AtomicResult)
11114 return AtomicResult.takeError();
11115 Value *CapturedVal =
11116 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
11117 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
11118
11119 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
11120 return Builder.saveIP();
11121}
11122
11126 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
11127 bool IsFailOnly) {
// Convenience overload: forwards every argument to the createAtomicCompare
// overload that additionally takes an explicit failure ordering.
// NOTE(review): the first lines of the signature and the statement defining
// `Failure` are missing from this listing -- confirm how it is derived.
11128
11130 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
11131 IsPostfixUpdate, IsFailOnly, Failure);
11132}
11133
11137 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
11138 bool IsFailOnly, AtomicOrdering Failure) {
// Emit an OpenMP `atomic compare` on X. (The first lines of the signature are
// missing from this listing; presumably this is the createAtomicCompare
// overload with an explicit failure ordering -- confirm.)
//
//  - Op == EQ: a cmpxchg of X against E with D as the new value. Non-integer
//    operands are bitcast to an integer with X.ElemTy's scalar bit width
//    first. When HandleFPNegZero is set and E is not an integer, additional
//    control flow makes NaN operands always fail and lets +0.0 and -0.0
//    compare equal.
//  - Op == MAX / MIN: a single atomicrmw whose operation is derived from Op,
//    IsXBinopExpr, X.IsSigned and whether E is an integer.
//
// If V.Var is set, the captured value is stored to it. If R.Var is set (EQ
// only), the success flag is sign- or zero-extended to R.ElemTy and stored.
//
// Returns the insertion point after the emitted code, or Loc.IP when
// updateToLocation(Loc) fails.
//
// NOTE(review): HandleFPNegZero and NewOp are not declared in the visible
// part of this listing -- confirm where they come from.
11139
11140 if (!updateToLocation(Loc))
11141 return Loc.IP;
11142
11143 assert(X.Var->getType()->isPointerTy() &&
11144 "OMP atomic expects a pointer to target memory");
11145 // compare capture
11146 if (V.Var) {
11147 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
11148 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
11149 }
11150
11151 bool IsInteger = E->getType()->isIntegerTy();
11152
11153 if (Op == OMPAtomicCompareOp::EQ) {
11154 // OldValue and SuccessOrFail are set below and used in the shared V.Var /
11155 // R.Var handling.
11156 Value *OldValue = nullptr;
11157 Value *SuccessOrFail = nullptr;
11158
11159 if (!IsInteger && HandleFPNegZero) {
11160 // IEEE 754 special cases for cmpxchg (which is bitwise):
11161 // 1. -0.0 == +0.0 but they have different bit patterns.
11162 // 2. NaN != NaN but identical NaN bit patterns would match.
11163 //
11164 // CurBB:
11165 // %e_int = bitcast E to intN
11166 // %d_int = bitcast D to intN
11167 // %x_curr = load atomic intN, X
11168 // %x_fp = bitcast %x_curr to FP
11169 // %e_is_nan = fcmp uno E, E
11170 // %x_is_nan = fcmp uno %x_fp, %x_fp
11171 // %either_nan = or %e_is_nan, %x_is_nan
11172 // br %either_nan, NaNBB, NotNaNBB
11173 // NaNBB: ; NaN == anything is always false
11174 // br ExitBB
11175 // NotNaNBB:
11176 // %x_is_zero = fcmp oeq %x_fp, 0.0
11177 // %e_is_zero = fcmp oeq E, 0.0
11178 // %both_zero = and %x_is_zero, %e_is_zero
11179 // br %both_zero, ZeroBB, NormalBB
11180 // ZeroBB: ; both ±0.0 → x = d
11181 // cmpxchg X, %x_curr, %d_int
11182 // br ExitBB
11183 // NormalBB: ; original path
11184 // cmpxchg X, %e_int, %d_int
11185 // br ExitBB
11186 // ExitBB:
11187 // phi merge
11188 IntegerType *IntCastTy =
11189 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
11190 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
11191 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
11192
11193 // Load X atomically.
11194 LoadInst *XCurr = Builder.CreateLoad(IntCastTy, X.Var,
11195 X.Var->getName() + ".atomic.load");
11197 Value *XFP = Builder.CreateBitCast(XCurr, X.ElemTy);
11198
11199 // IEEE 754: NaN != NaN, but cmpxchg would succeed if E and X have
11200 // the same NaN bit pattern. Skip cmpxchg when either is NaN.
11201 Value *EIsNaN = Builder.CreateFCmpUNO(E, E, "atomic.e.isnan");
11202 Value *XIsNaN = Builder.CreateFCmpUNO(XFP, XFP, "atomic.x.isnan");
11203 Value *EitherNaN = Builder.CreateOr(EIsNaN, XIsNaN, "atomic.either.nan");
11204
// If the current block has no terminator yet, a temporary unreachable is
// created so that the block can be split; it is removed again below.
11205 BasicBlock *CurBB = Builder.GetInsertBlock();
11206 Function *F = CurBB->getParent();
11207 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
11208 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
11209 BasicBlock *ExitBB =
11210 CurBB->splitBasicBlock(CurBBTI, X.Var->getName() + ".atomic.exit");
11212 M.getContext(), X.Var->getName() + ".atomic.nan", F, ExitBB);
11213 BasicBlock *NotNaNBB = BasicBlock::Create(
11214 M.getContext(), X.Var->getName() + ".atomic.notnan", F, ExitBB);
11216 M.getContext(), X.Var->getName() + ".atomic.zero", F, ExitBB);
11217 BasicBlock *NormalBB = BasicBlock::Create(
11218 M.getContext(), X.Var->getName() + ".atomic.normal", F, ExitBB);
11219
11220 // If either E or X is NaN → NaNBB (always fails), else check for ±0.0.
11221 CurBB->getTerminator()->eraseFromParent();
11222 Builder.SetInsertPoint(CurBB);
11223 Builder.CreateCondBr(EitherNaN, NaNBB, NotNaNBB);
11224
11225 // NaNBB: NaN == anything is always false; skip cmpxchg.
11226 Builder.SetInsertPoint(NaNBB);
11227 Builder.CreateBr(ExitBB);
11228
11229 // NotNaNBB: check both X and E for ±0.0.
11230 Builder.SetInsertPoint(NotNaNBB);
11231 Value *XIsZero =
11232 Builder.CreateFCmpOEQ(XFP, ConstantFP::getZero(X.ElemTy),
11233 X.Var->getName() + ".atomic.xiszero");
11234 Value *EIsZero = Builder.CreateFCmpOEQ(E, ConstantFP::getZero(X.ElemTy),
11235 "atomic.e.iszero");
11236 Value *BothZero = Builder.CreateAnd(XIsZero, EIsZero, "atomic.both.zero");
11237 Builder.CreateCondBr(BothZero, ZeroBB, NormalBB);
11238
11239 // ZeroBB: cmpxchg with X's loaded bit-pattern.
11240 Builder.SetInsertPoint(ZeroBB);
11241 AtomicCmpXchgInst *ResZero = Builder.CreateAtomicCmpXchg(
11242 X.Var, XCurr, DBCast, MaybeAlign(), AO, Failure);
11243 Value *OldZero = Builder.CreateExtractValue(ResZero, /*Idxs=*/0);
11244 Value *OkZero = Builder.CreateExtractValue(ResZero, /*Idxs=*/1);
11245 Builder.CreateBr(ExitBB);
11246
11247 // NormalBB: original bitwise cmpxchg.
11248 Builder.SetInsertPoint(NormalBB);
11249 AtomicCmpXchgInst *ResNormal = Builder.CreateAtomicCmpXchg(
11250 X.Var, EBCast, DBCast, MaybeAlign(), AO, Failure);
11251 Value *OldNormal = Builder.CreateExtractValue(ResNormal, /*Idxs=*/0);
11252 Value *OkNormal = Builder.CreateExtractValue(ResNormal, /*Idxs=*/1);
11253 Builder.CreateBr(ExitBB);
11254
11255 // ExitBB: merge results from NaN, Zero, and Normal paths.
11256 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
11257 PHINode *OldIntPHI =
11258 Builder.CreatePHI(IntCastTy, 3, X.Var->getName() + ".atomic.old");
11259 OldIntPHI->addIncoming(XCurr, NaNBB);
11260 OldIntPHI->addIncoming(OldZero, ZeroBB);
11261 OldIntPHI->addIncoming(OldNormal, NormalBB);
11262 PHINode *SuccessPHI = Builder.CreatePHI(Builder.getInt1Ty(), 3,
11263 X.Var->getName() + ".atomic.ok");
11264 SuccessPHI->addIncoming(Builder.getFalse(), NaNBB);
11265 SuccessPHI->addIncoming(OkZero, ZeroBB);
11266 SuccessPHI->addIncoming(OkNormal, NormalBB);
11267
// Drop the temporary terminator if one was created above; otherwise continue
// right after the PHIs of the exit block.
11268 if (isa<UnreachableInst>(ExitBB->getTerminator())) {
11269 CurBBTI->eraseFromParent();
11270 Builder.SetInsertPoint(ExitBB);
11271 } else {
11272 Builder.SetInsertPoint(&*ExitBB->getFirstNonPHIIt());
11273 }
11274
11275 OldValue = Builder.CreateBitCast(OldIntPHI, X.ElemTy,
11276 X.Var->getName() + ".atomic.old.fp");
11277 SuccessOrFail = SuccessPHI;
11278 } else {
// Plain path: a single cmpxchg, on integer-cast operands when E is not an
// integer.
11279 AtomicCmpXchgInst *Result = nullptr;
11280 if (!IsInteger) {
11281 IntegerType *IntCastTy =
11282 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
11283 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
11284 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
11285 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast,
11286 MaybeAlign(), AO, Failure);
11287 } else {
11288 Result =
11289 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
11290 }
11291
11292 if (V.Var) {
11293 OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
11294 if (!IsInteger)
11295 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
11296 assert(OldValue->getType() == V.ElemTy &&
11297 "OldValue and V must be of same type");
11298 if (IsPostfixUpdate) {
11299 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
11300 } else {
11301 SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
11302 if (IsFailOnly) {
// Fail-only capture: V is written only when the cmpxchg failed, so the store
// lives in its own block (ContBB) that is bypassed on success.
11303 BasicBlock *CurBB = Builder.GetInsertBlock();
11304 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
11305 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
11306 BasicBlock *ExitBB = CurBB->splitBasicBlock(
11307 CurBBTI, X.Var->getName() + ".atomic.exit");
11308 BasicBlock *ContBB = CurBB->splitBasicBlock(
11309 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
11310 ContBB->getTerminator()->eraseFromParent();
11311 CurBB->getTerminator()->eraseFromParent();
11312
11313 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
11314
11315 Builder.SetInsertPoint(ContBB);
11316 Builder.CreateStore(OldValue, V.Var);
11317 Builder.CreateBr(ExitBB);
11318
11319 if (UnreachableInst *ExitTI =
11321 CurBBTI->eraseFromParent();
11322 Builder.SetInsertPoint(ExitBB);
11323 } else {
11324 Builder.SetInsertPoint(ExitTI);
11325 }
11326 } else {
// Capture E on success and the old value of X on failure.
11327 Value *CapturedValue =
11328 Builder.CreateSelect(SuccessOrFail, E, OldValue);
11329 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
11330 }
11331 }
11332 }
11333 // The comparison result has to be stored.
11334 if (R.Var) {
11335 assert(R.Var->getType()->isPointerTy() &&
11336 "r.var must be of pointer type");
11337 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
11338
11339 Value *SuccessFailureVal =
11340 Builder.CreateExtractValue(Result, /*Idxs=*/1);
11341 Value *ResultCast =
11342 R.IsSigned ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
11343 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
11344 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
11345 }
11346 }
11347
11348 // For the HandleFPNegZero path, handle V.Var and R.Var using the
11349 // pre-computed OldValue and SuccessOrFail.
11350 if (HandleFPNegZero && !IsInteger) {
11351 if (V.Var) {
11352 assert(OldValue->getType() == V.ElemTy &&
11353 "OldValue and V must be of same type");
11354 if (IsPostfixUpdate) {
11355 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
11356 } else {
11357 if (IsFailOnly) {
// Same fail-only block structure as in the plain path above.
11358 BasicBlock *CurBB = Builder.GetInsertBlock();
11359 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
11360 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
11361 BasicBlock *ExitBB = CurBB->splitBasicBlock(
11362 CurBBTI, X.Var->getName() + ".atomic.exit");
11363 BasicBlock *ContBB = CurBB->splitBasicBlock(
11364 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
11365 ContBB->getTerminator()->eraseFromParent();
11366 CurBB->getTerminator()->eraseFromParent();
11367
11368 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
11369
11370 Builder.SetInsertPoint(ContBB);
11371 Builder.CreateStore(OldValue, V.Var);
11372 Builder.CreateBr(ExitBB);
11373
11374 if (UnreachableInst *ExitTI =
11376 CurBBTI->eraseFromParent();
11377 Builder.SetInsertPoint(ExitBB);
11378 } else {
11379 Builder.SetInsertPoint(ExitTI);
11380 }
11381 } else {
11382 Value *CapturedValue =
11383 Builder.CreateSelect(SuccessOrFail, E, OldValue);
11384 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
11385 }
11386 }
11387 }
11388 // The comparison result has to be stored.
11389 if (R.Var) {
11390 assert(R.Var->getType()->isPointerTy() &&
11391 "r.var must be of pointer type");
11392 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
11393
11394 Value *ResultCast = R.IsSigned
11395 ? Builder.CreateSExt(SuccessOrFail, R.ElemTy)
11396 : Builder.CreateZExt(SuccessOrFail, R.ElemTy);
11397 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
11398 }
11399 }
11400 } else {
11401 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
11402 "Op should be either max or min at this point");
11403 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
11404
11405 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
11406 // Let's take max as example.
11407 // OpenMP form:
11408 // x = x > expr ? expr : x;
11409 // LLVM form:
11410 // *ptr = *ptr > val ? *ptr : val;
11411 // We need to transform to LLVM form.
11412 // x = x <= expr ? x : expr;
// NOTE(review): the second operand of each conditional expression below is
// missing from this listing.
11414 if (IsXBinopExpr) {
11415 if (IsInteger) {
11416 if (X.IsSigned)
11417 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
11419 else
11420 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
11422 } else {
11423 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
11425 }
11426 } else {
11427 if (IsInteger) {
11428 if (X.IsSigned)
11429 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
11431 else
11432 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
11434 } else {
11435 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
11437 }
11438 }
11439
11440 AtomicRMWInst *OldValue =
11441 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
11442 if (V.Var) {
11443 Value *CapturedValue = nullptr;
11444 if (IsPostfixUpdate) {
11445 CapturedValue = OldValue;
11446 } else {
// Non-postfix capture: redo the comparison non-atomically on the old value to
// decide which of E and the old value to store to V.
11447 CmpInst::Predicate Pred;
11448 switch (NewOp) {
11449 case AtomicRMWInst::Max:
11450 Pred = CmpInst::ICMP_SGT;
11451 break;
11453 Pred = CmpInst::ICMP_UGT;
11454 break;
11456 Pred = CmpInst::FCMP_OGT;
11457 break;
11458 case AtomicRMWInst::Min:
11459 Pred = CmpInst::ICMP_SLT;
11460 break;
11462 Pred = CmpInst::ICMP_ULT;
11463 break;
11465 Pred = CmpInst::FCMP_OLT;
11466 break;
11467 default:
11468 llvm_unreachable("unexpected comparison op");
11469 }
11470 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
11471 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
11472 }
11473 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
11474 }
11475 }
11476
11477 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
11478
11479 return Builder.saveIP();
11480}
11481
11484 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
11485 Value *NumTeamsUpper, Value *ThreadLimit,
11486 Value *IfExpr) {
// Emit an OpenMP `teams` region. (The first lines of the signature are
// missing from this listing; presumably this is OpenMPIRBuilder::createTeams
// -- confirm.)
//
// The code at the insertion point is split into teams.alloca / teams.body /
// teams.exit blocks, the body is generated through BodyGenCB, and the region
// from teams.alloca to teams.exit is registered for outlining through
// addOutlineInfo. When not compiling for a target device, the num_teams,
// thread_limit and if clauses are turned into the arguments of a
// __kmpc_push_num_teams_51 runtime call, and a post-outline callback replaces
// the call to the outlined function with a __kmpc_fork_teams runtime call.
//
// Returns an insertion point in teams.exit, a default-constructed
// InsertPointTy when updateToLocation(Loc) fails, or the error produced by
// BodyGenCB.
11487 if (!updateToLocation(Loc))
11488 return InsertPointTy();
11489
11490 uint32_t SrcLocStrSize;
11491 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
11492 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
11493 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
11494
11495 // Outer allocation basicblock is the entry block of the current function.
// If code is currently being emitted into that entry block, split first so
// that the teams region starts in a block of its own.
11496 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
11497 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
11498 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
11499 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
11500 }
11501
11502 // The current basic block is split into four basic blocks. After outlining,
11503 // they will be mapped as follows:
11504 // ```
11505 // def current_fn() {
11506 // current_basic_block:
11507 // br label %teams.exit
11508 // teams.exit:
11509 // ; instructions after teams
11510 // }
11511 //
11512 // def outlined_fn() {
11513 // teams.alloca:
11514 // br label %teams.body
11515 // teams.body:
11516 // ; instructions within teams body
11517 // }
11518 // ```
11519 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
11520 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
11521 BasicBlock *AllocaBB =
11522 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
11523
11524 bool SubClausesPresent =
11525 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
11526 // Push num_teams
11527 if (!Config.isTargetDevice() && SubClausesPresent) {
11528 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
11529 "if lowerbound is non-null, then upperbound must also be non-null "
11530 "for bounds on num_teams");
11531
// Missing bounds default to 0 (upper) and to the upper bound (lower).
11532 if (NumTeamsUpper == nullptr)
11533 NumTeamsUpper = Builder.getInt32(0);
11534
11535 if (NumTeamsLower == nullptr)
11536 NumTeamsLower = NumTeamsUpper;
11537
11538 if (IfExpr) {
11539 assert(IfExpr->getType()->isIntegerTy() &&
11540 "argument to if clause must be an integer value");
11541
11542 // upper = ifexpr ? upper : 1
11543 if (IfExpr->getType() != Int1)
11544 IfExpr = Builder.CreateICmpNE(IfExpr,
11545 ConstantInt::get(IfExpr->getType(), 0));
11546 NumTeamsUpper = Builder.CreateSelect(
11547 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
11548
11549 // lower = ifexpr ? lower : 1
11550 NumTeamsLower = Builder.CreateSelect(
11551 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
11552 }
11553
11554 if (ThreadLimit == nullptr)
11555 ThreadLimit = Builder.getInt32(0);
11556
11557 // The __kmpc_push_num_teams_51 function expects int32 as the arguments. So,
11558 // truncate or sign extend the passed values to match the int32 parameters.
11559 Value *NumTeamsLowerInt32 =
11560 Builder.CreateSExtOrTrunc(NumTeamsLower, Builder.getInt32Ty());
11561 Value *NumTeamsUpperInt32 =
11562 Builder.CreateSExtOrTrunc(NumTeamsUpper, Builder.getInt32Ty());
11563 Value *ThreadLimitInt32 =
11564 Builder.CreateSExtOrTrunc(ThreadLimit, Builder.getInt32Ty());
11565
11566 Value *ThreadNum = getOrCreateThreadID(Ident);
11567
// NOTE(review): the line that opens this runtime call is missing from the
// listing.
11569 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
11570 {Ident, ThreadNum, NumTeamsLowerInt32, NumTeamsUpperInt32,
11571 ThreadLimitInt32});
11572 }
11573 // Generate the body of teams.
11574 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
11575 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
11576 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP, ExitBB))
11577 return Err;
11578
11579 auto OI = std::make_unique<OutlineInfo>();
11580 OI->EntryBB = AllocaBB;
11581 OI->ExitBB = ExitBB;
11582 OI->OuterAllocBB = &OuterAllocaBB;
11583
11584 // Insert fake values for global tid and bound tid.
// NOTE(review): the declaration of ToBeDeleted is missing from this listing.
11586 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
11587 OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal(
11588 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
11589 OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal(
11590 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
11591
11592 auto HostPostOutlineCB = [this, Ident,
11593 ToBeDeleted](Function &OutlinedFn) mutable {
11594 // The stale call instruction will be replaced with a new call instruction
11595 // for runtime call with the outlined function.
11596
11597 assert(OutlinedFn.hasOneUse() &&
11598 "there must be a single user for the outlined function");
11599 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
11600 ToBeDeleted.push_back(StaleCI);
11601
11602 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
11603 "Outlined function must have two or three arguments only");
11604
// A third argument, when present, is named "data" and forwarded to the
// runtime call below.
11605 bool HasShared = OutlinedFn.arg_size() == 3;
11606
11607 OutlinedFn.getArg(0)->setName("global.tid.ptr");
11608 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
11609 if (HasShared)
11610 OutlinedFn.getArg(2)->setName("data");
11611
11612 // Call to the runtime function for teams in the current function.
11613 assert(StaleCI && "Error while outlining - no CallInst user found for the "
11614 "outlined function.");
11615 Builder.SetInsertPoint(StaleCI);
11616 SmallVector<Value *> Args = {
11617 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
11618 if (HasShared)
11619 Args.push_back(StaleCI->getArgOperand(2));
11622 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
11623 Args);
11624
// Erase the stale call and the fake tid values, most recently added first.
11625 for (Instruction *I : llvm::reverse(ToBeDeleted))
11626 I->eraseFromParent();
11627 };
11628
// The host post-outline callback is only installed when not compiling for a
// target device.
11629 if (!Config.isTargetDevice())
11630 OI->PostOutlineCB = HostPostOutlineCB;
11631
11632 addOutlineInfo(std::move(OI));
11633
11634 Builder.SetInsertPoint(ExitBB);
11635
11636 return Builder.saveIP();
11637}
11638
11640 const LocationDescription &Loc, InsertPointTy OuterAllocIP,
11641 ArrayRef<BasicBlock *> OuterDeallocBlocks, BodyGenCallbackTy BodyGenCB) {
11642 if (!updateToLocation(Loc))
11643 return InsertPointTy();
11644
11645 BasicBlock *OuterAllocaBB = OuterAllocIP.getBlock();
11646
11647 if (OuterAllocaBB == Builder.GetInsertBlock()) {
11648 BasicBlock *BodyBB =
11649 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
11650 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
11651 }
11652 BasicBlock *ExitBB =
11653 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
11654 BasicBlock *BodyBB =
11655 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
11656 BasicBlock *AllocaBB =
11657 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
11658
11659 // Generate the body of distribute clause
11660 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
11661 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
11662 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP, ExitBB))
11663 return Err;
11664
11665 // When using target we use different runtime functions which require a
11666 // callback.
11667 if (Config.isTargetDevice()) {
11668 auto OI = std::make_unique<OutlineInfo>();
11669 OI->OuterAllocBB = OuterAllocIP.getBlock();
11670 OI->EntryBB = AllocaBB;
11671 OI->ExitBB = ExitBB;
11672 OI->OuterDeallocBBs.reserve(OuterDeallocBlocks.size());
11673 copy(OuterDeallocBlocks, OI->OuterDeallocBBs.end());
11674
11675 addOutlineInfo(std::move(OI));
11676 }
11677 Builder.SetInsertPoint(ExitBB);
11678
11679 return Builder.saveIP();
11680}
11681
11684 std::string VarName) {
11685 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
11687 Names.size()),
11688 Names);
11689 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
11690 M, MapNamesArrayInit->getType(),
11691 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
11692 VarName);
11693 return MapNamesArrayGlobal;
11694}
11695
11696// Create all simple and struct types exposed by the runtime and remember
11697// the llvm::PointerTypes of them for easy access later.
//
// \param M Module that supplies the LLVMContext and the data layout (the
//          latter is queried for the program address space).
//
// The member assignments themselves are produced by the OMP_* macros below,
// which are defined right before OMPKinds.def is included. The macro bodies
// refer to the locals `Ctx`, `T`, `DefaultTargetAS` and `ProgramAS` by name,
// so those identifiers must not be renamed.
// NOTE(review): entries in OMPKinds.def presumably also reference `Ctx` in
// their initializer expressions - confirm before touching the locals.
11698void OpenMPIRBuilder::initializeTypes(Module &M) {
11699 LLVMContext &Ctx = M.getContext();
// Scratch variable used by OMP_STRUCT_TYPE for the looked-up/created struct.
11700 StructType *T;
// Address space used for pointers to array and struct types.
11701 unsigned DefaultTargetAS = Config.getDefaultTargetAS();
// Address space used for pointers to function types.
11702 unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace();
// Plain type: store the given initializer expression in VarName.
11703#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
// Array type: create <VarName>Ty and a pointer type <VarName>PtrTy in the
// default target address space.
11704#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
11705 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
11706 VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
// Function type: create VarName and a pointer type <VarName>Ptr in the
// program address space.
11707#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
11708 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
11709 VarName##Ptr = PointerType::get(Ctx, ProgramAS);
// Struct type: reuse the struct named StructName if the context already has
// one, otherwise create it; also create <VarName>Ptr in the default target
// address space.
11710#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
11711 T = StructType::getTypeByName(Ctx, StructName); \
11712 if (!T) \
11713 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
11714 VarName = T; \
11715 VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
// Expand the macros above for the entries listed in OMPKinds.def.
11716#include "llvm/Frontend/OpenMP/OMPKinds.def"
11717}
11718
11721 SmallVectorImpl<BasicBlock *> &BlockVector) {
11723 BlockSet.insert(EntryBB);
11724 BlockSet.insert(ExitBB);
11725
11726 Worklist.push_back(EntryBB);
11727 while (!Worklist.empty()) {
11728 BasicBlock *BB = Worklist.pop_back_val();
11729 BlockVector.push_back(BB);
11730 for (BasicBlock *SuccBB : successors(BB))
11731 if (BlockSet.insert(SuccBB).second)
11732 Worklist.push_back(SuccBB);
11733 }
11734}
11735
11736std::unique_ptr<CodeExtractor>
11738 bool ArgsInZeroAddressSpace,
11739 Twine Suffix) {
11740 return std::make_unique<CodeExtractor>(
11741 Blocks, /* DominatorTree */ nullptr,
11742 /* AggregateArgs */ true,
11743 /* BlockFrequencyInfo */ nullptr,
11744 /* BranchProbabilityInfo */ nullptr,
11745 /* AssumptionCache */ nullptr,
11746 /* AllowVarArgs */ true,
11747 /* AllowAlloca */ true,
11748 /* AllocationBlock*/ OuterAllocBB,
11749 /* DeallocationBlocks */ ArrayRef<BasicBlock *>(),
11750 /* Suffix */ Suffix.str(), ArgsInZeroAddressSpace);
11751}
11752
11753std::unique_ptr<CodeExtractor> DeviceSharedMemOutlineInfo::createCodeExtractor(
11754 ArrayRef<BasicBlock *> Blocks, bool ArgsInZeroAddressSpace, Twine Suffix) {
11755 return std::make_unique<DeviceSharedMemCodeExtractor>(
11756 OMPBuilder, Blocks, /* DominatorTree */ nullptr,
11757 /* AggregateArgs */ true,
11758 /* BlockFrequencyInfo */ nullptr,
11759 /* BranchProbabilityInfo */ nullptr,
11760 /* AssumptionCache */ nullptr,
11761 /* AllowVarArgs */ true,
11762 /* AllowAlloca */ true,
11763 /* AllocationBlock*/ OuterAllocBB,
11764 /* DeallocationBlocks */ OuterDeallocBBs.empty()
11766 : OuterDeallocBBs,
11767 /* Suffix */ Suffix.str(), ArgsInZeroAddressSpace);
11768}
11769
11771 uint64_t Size, int32_t Flags,
11773 StringRef Name) {
11774 if (!Config.isGPU()) {
11777 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
11778 return;
11779 }
11780 // TODO: Add support for global variables on the device after declare target
11781 // support.
11782 Function *Fn = dyn_cast<Function>(Addr);
11783 if (!Fn)
11784 return;
11785
11786 // Add a function attribute for the kernel.
11787 Fn->addFnAttr("kernel");
11788 if (T.isAMDGCN())
11789 Fn->addFnAttr("uniform-work-group-size");
11790 Fn->addFnAttr(Attribute::MustProgress);
11791}
11792
11793// We only generate metadata for functions that contain target regions.
11796
11797 // If there are no entries, we don't need to do anything.
11798 if (OffloadInfoManager.empty())
11799 return;
11800
11801 LLVMContext &C = M.getContext();
11804 16>
11805 OrderedEntries(OffloadInfoManager.size());
11806
11807 // Auxiliary methods to create metadata values and strings.
11808 auto &&GetMDInt = [this](unsigned V) {
11809 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
11810 };
11811
11812 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
11813
11814 // Create the offloading info metadata node.
11815 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
11816 auto &&TargetRegionMetadataEmitter =
11817 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
11818 const TargetRegionEntryInfo &EntryInfo,
11820 // Generate metadata for target regions. Each entry of this metadata
11821 // contains:
11822 // - Entry 0 -> Kind of this type of metadata (0).
11823 // - Entry 1 -> Device ID of the file where the entry was identified.
11824 // - Entry 2 -> File ID of the file where the entry was identified.
11825 // - Entry 3 -> Mangled name of the function where the entry was
11826 // identified.
11827 // - Entry 4 -> Line in the file where the entry was identified.
11828 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
11829 // - Entry 6 -> Order the entry was created.
11830 // The first element of the metadata node is the kind.
11831 Metadata *Ops[] = {
11832 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
11833 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
11834 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
11835 GetMDInt(E.getOrder())};
11836
11837 // Save this entry in the right position of the ordered entries array.
11838 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
11839
11840 // Add metadata to the named metadata node.
11841 MD->addOperand(MDNode::get(C, Ops));
11842 };
11843
11844 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
11845
11846 // Create a function that emits metadata for each device global variable entry.
11847 auto &&DeviceGlobalVarMetadataEmitter =
11848 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
11849 StringRef MangledName,
11851 // Generate metadata for global variables. Each entry of this metadata
11852 // contains:
11853 // - Entry 0 -> Kind of this type of metadata (1).
11854 // - Entry 1 -> Mangled name of the variable.
11855 // - Entry 2 -> Declare target kind.
11856 // - Entry 3 -> Order the entry was created.
11857 // The first element of the metadata node is the kind.
11858 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
11859 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
11860
11861 // Save this entry in the right position of the ordered entries array.
11862 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
11863 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
11864
11865 // Add metadata to the named metadata node.
11866 MD->addOperand(MDNode::get(C, Ops));
11867 };
11868
11869 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
11870 DeviceGlobalVarMetadataEmitter);
11871
11872 for (const auto &E : OrderedEntries) {
11873 assert(E.first && "All ordered entries must exist!");
11874 if (const auto *CE =
11876 E.first)) {
11877 if (!CE->getID() || !CE->getAddress()) {
11878 // Do not blame the entry if the parent function is not emitted.
11879 TargetRegionEntryInfo EntryInfo = E.second;
11880 StringRef FnName = EntryInfo.ParentName;
11881 if (!M.getNamedValue(FnName))
11882 continue;
11883 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
11884 continue;
11885 }
11886 createOffloadEntry(CE->getID(), CE->getAddress(),
11887 /*Size=*/0, CE->getFlags(),
11889 } else if (const auto *CE = dyn_cast<
11891 E.first)) {
11894 CE->getFlags());
11895 switch (Flags) {
11898 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
11899 continue;
11900 if (!CE->getAddress()) {
11901 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
11902 continue;
11903 }
11904 // The variable has no definition - no need to add the entry.
11905 if (CE->getVarSize() == 0)
11906 continue;
11907 break;
11909 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
11910 (!Config.isTargetDevice() && CE->getAddress())) &&
11911 "Declaret target link address is set.");
11912 if (Config.isTargetDevice())
11913 continue;
11914 if (!CE->getAddress()) {
11916 continue;
11917 }
11918 break;
11921 if (!CE->getAddress()) {
11922 ErrorFn(EMIT_MD_GLOBAL_VAR_INDIRECT_ERROR, E.second);
11923 continue;
11924 }
11925 break;
11926 default:
11927 break;
11928 }
11929
11930 // Hidden or internal symbols on the device are not externally visible.
11931 // We should not attempt to register them by creating an offloading
11932 // entry. Indirect variables are handled separately on the device.
11933 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
11934 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
11935 (Flags !=
11937 Flags != OffloadEntriesInfoManager::
11938 OMPTargetGlobalVarEntryIndirectVTable))
11939 continue;
11940
11941 // Indirect globals need to use a special name that doesn't match the name
11942 // of the associated host global.
11944 Flags ==
11946 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11947 Flags, CE->getLinkage(), CE->getVarName());
11948 else
11949 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11950 Flags, CE->getLinkage());
11951
11952 } else {
11953 llvm_unreachable("Unsupported entry kind.");
11954 }
11955 }
11956
11957 // Emit requires directive globals to a special entry so the runtime can
11958 // register them when the device image is loaded.
11959 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
11960 // entries should be redesigned to better suit this use-case.
11961 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
11965 ".requires", /*Size=*/0,
11967 Config.getRequiresFlags());
11968}
11969
11972 unsigned FileID, unsigned Line, unsigned Count) {
11973 raw_svector_ostream OS(Name);
11974 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
11975 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
11976 if (Count)
11977 OS << "_" << Count;
11978}
11979
11981 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
11982 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
11984 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
11985 EntryInfo.Line, NewCount);
11986}
11987
11990 vfs::FileSystem &VFS,
11991 StringRef ParentName) {
11992 sys::fs::UniqueID ID(0xdeadf17e, 0);
11993 auto FileIDInfo = CallBack();
11994 uint64_t FileID = 0;
11995 if (ErrorOr<vfs::Status> Status = VFS.status(std::get<0>(FileIDInfo))) {
11996 ID = Status->getUniqueID();
11997 FileID = Status->getUniqueID().getFile();
11998 } else {
11999 // If the inode ID could not be determined, create a hash value from
12000 // the current file name and use that as an ID.
12001 FileID = hash_value(std::get<0>(FileIDInfo));
12002 }
12003
12004 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
12005 std::get<1>(FileIDInfo));
12006}
12007
12009 unsigned Offset = 0;
12010 for (uint64_t Remain =
12011 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
12013 !(Remain & 1); Remain = Remain >> 1)
12014 Offset++;
12015 return Offset;
12016}
12017
12020 // Shift left by getFlagMemberOffset() bits.
12021 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
12022 << getFlagMemberOffset());
12023}
12024
12027 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
12028 // If the entry is PTR_AND_OBJ but has not been marked with the special
12029 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
12030 // marked as MEMBER_OF.
12031 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
12033 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
12036 return;
12037
12038 // Entries with ATTACH are not members-of anything. They are handled
12039 // separately by the runtime after other maps have been handled.
12040 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
12042 return;
12043
12044 // Reset the placeholder value to prepare the flag for the assignment of the
12045 // proper MEMBER_OF value.
12046 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
12047 Flags |= MemberOfFlag;
12048}
12049
12053 bool IsDeclaration, bool IsExternallyVisible,
12054 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
12055 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
12056 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
12057 std::function<Constant *()> GlobalInitializer,
12058 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
12059 // TODO: convert this to utilise the IRBuilder Config rather than
12060 // a passed down argument.
12061 if (OpenMPSIMD)
12062 return nullptr;
12063
12066 CaptureClause ==
12068 Config.hasRequiresUnifiedSharedMemory())) {
12069 SmallString<64> PtrName;
12070 {
12071 raw_svector_ostream OS(PtrName);
12072 OS << MangledName;
12073 if (!IsExternallyVisible)
12074 OS << format("_%x", EntryInfo.FileID);
12075 OS << "_decl_tgt_ref_ptr";
12076 }
12077
12078 Value *Ptr = M.getNamedValue(PtrName);
12079
12080 if (!Ptr) {
12081 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
12082 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
12083
12084 auto *GV = cast<GlobalVariable>(Ptr);
12085 GV->setLinkage(GlobalValue::WeakAnyLinkage);
12086
12087 if (!Config.isTargetDevice()) {
12088 if (GlobalInitializer)
12089 GV->setInitializer(GlobalInitializer());
12090 else
12091 GV->setInitializer(GlobalValue);
12092 }
12093
12095 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
12096 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
12097 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
12098 }
12099
12100 return cast<Constant>(Ptr);
12101 }
12102
12103 return nullptr;
12104}
12105
12109 bool IsDeclaration, bool IsExternallyVisible,
12110 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
12111 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
12112 std::vector<Triple> TargetTriple,
12113 std::function<Constant *()> GlobalInitializer,
12114 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
12115 Constant *Addr) {
12117 (TargetTriple.empty() && !Config.isTargetDevice()))
12118 return;
12119
12121 StringRef VarName;
12122 int64_t VarSize;
12124
12126 CaptureClause ==
12128 !Config.hasRequiresUnifiedSharedMemory()) {
12130 VarName = MangledName;
12131 GlobalValue *LlvmVal = M.getNamedValue(VarName);
12132
12133 if (!IsDeclaration)
12134 VarSize = divideCeil(
12135 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
12136 else
12137 VarSize = 0;
12138 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
12139
12140 // This is a workaround carried over from Clang which prevents undesired
12141 // optimisation of internal variables.
12142 if (Config.isTargetDevice() &&
12143 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
12144 // Do not create a "ref-variable" if the original is not also available
12145 // on the host.
12146 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
12147 return;
12148
12149 std::string RefName = createPlatformSpecificName({VarName, "ref"});
12150
12151 if (!M.getNamedValue(RefName)) {
12152 Constant *AddrRef =
12153 getOrCreateInternalVariable(Addr->getType(), RefName);
12154 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
12155 GvAddrRef->setConstant(true);
12156 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
12157 GvAddrRef->setInitializer(Addr);
12158 GeneratedRefs.push_back(GvAddrRef);
12159 }
12160 }
12161 } else {
12164 else
12166
12167 if (Config.isTargetDevice()) {
12168 VarName = (Addr) ? Addr->getName() : "";
12169 Addr = nullptr;
12170 } else {
12172 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
12173 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
12174 LlvmPtrTy, GlobalInitializer, VariableLinkage);
12175 VarName = (Addr) ? Addr->getName() : "";
12176 }
12177 VarSize = M.getDataLayout().getPointerSize();
12179 }
12180
12181 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
12182 Flags, Linkage);
12183}
12184
12185/// Loads all the offload entries information from the host IR
12186/// metadata.
12188 // If we are in target mode, load the metadata from the host IR. This code has
12189 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
12190
12191 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
12192 if (!MD)
12193 return;
12194
12195 for (MDNode *MN : MD->operands()) {
12196 auto &&GetMDInt = [MN](unsigned Idx) {
12197 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
12198 return cast<ConstantInt>(V->getValue())->getZExtValue();
12199 };
12200
12201 auto &&GetMDString = [MN](unsigned Idx) {
12202 auto *V = cast<MDString>(MN->getOperand(Idx));
12203 return V->getString();
12204 };
12205
12206 switch (GetMDInt(0)) {
12207 default:
12208 llvm_unreachable("Unexpected metadata!");
12209 break;
12210 case OffloadEntriesInfoManager::OffloadEntryInfo::
12211 OffloadingEntryInfoTargetRegion: {
12212 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
12213 /*DeviceID=*/GetMDInt(1),
12214 /*FileID=*/GetMDInt(2),
12215 /*Line=*/GetMDInt(4),
12216 /*Count=*/GetMDInt(5));
12217 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
12218 /*Order=*/GetMDInt(6));
12219 break;
12220 }
12221 case OffloadEntriesInfoManager::OffloadEntryInfo::
12222 OffloadingEntryInfoDeviceGlobalVar:
12223 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
12224 /*MangledName=*/GetMDString(1),
12226 /*Flags=*/GetMDInt(2)),
12227 /*Order=*/GetMDInt(3));
12228 break;
12229 }
12230 }
12231}
12232
12234 StringRef HostFilePath) {
12235 if (HostFilePath.empty())
12236 return;
12237
12238 auto Buf = VFS.getBufferForFile(HostFilePath);
12239 if (std::error_code Err = Buf.getError()) {
12240 report_fatal_error(("error opening host file from host file path inside of "
12241 "OpenMPIRBuilder: " +
12242 Err.message())
12243 .c_str());
12244 }
12245
12246 LLVMContext Ctx;
12248 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
12249 if (std::error_code Err = M.getError()) {
12251 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
12252 .c_str());
12253 }
12254
12255 loadOffloadInfoMetadata(*M.get());
12256}
12257
12260 llvm::StringRef Name) {
12261 Builder.restoreIP(Loc.IP);
12262
12263 BasicBlock *CurBB = Builder.GetInsertBlock();
12264 assert(CurBB &&
12265 "expected a valid insertion block for creating an iterator loop");
12266 Function *F = CurBB->getParent();
12267
12268 InsertPointTy SplitIP = Builder.saveIP();
12269 if (SplitIP.getPoint() == CurBB->end())
12270 if (Instruction *Terminator = CurBB->getTerminatorOrNull())
12271 SplitIP = InsertPointTy(CurBB, Terminator->getIterator());
12272
12273 BasicBlock *ContBB =
12274 splitBB(SplitIP, /*CreateBranch=*/false,
12275 Builder.getCurrentDebugLocation(), "omp.it.cont");
12276
12277 CanonicalLoopInfo *CLI =
12278 createLoopSkeleton(Builder.getCurrentDebugLocation(), TripCount, F,
12279 /*PreInsertBefore=*/ContBB,
12280 /*PostInsertBefore=*/ContBB, Name);
12281
12282 // Enter loop from original block.
12283 redirectTo(CurBB, CLI->getPreheader(), Builder.getCurrentDebugLocation());
12284
12285 // Remove the unconditional branch inserted by createLoopSkeleton in the body
12286 if (Instruction *T = CLI->getBody()->getTerminatorOrNull())
12287 T->eraseFromParent();
12288
12289 InsertPointTy BodyIP = CLI->getBodyIP();
12290 if (llvm::Error Err = BodyGen(BodyIP, CLI->getIndVar()))
12291 return Err;
12292
12293 // Body must either fallthrough to the latch or branch directly to it.
12294 if (Instruction *BodyTerminator = CLI->getBody()->getTerminatorOrNull()) {
12295 auto *BodyBr = dyn_cast<UncondBrInst>(BodyTerminator);
12296 if (!BodyBr || BodyBr->getSuccessor() != CLI->getLatch()) {
12298 "iterator bodygen must terminate the canonical body with an "
12299 "unconditional branch to the loop latch",
12301 }
12302 } else {
12303 // Ensure we end the loop body by jumping to the latch.
12304 Builder.SetInsertPoint(CLI->getBody());
12305 Builder.CreateBr(CLI->getLatch());
12306 }
12307
12308 // Link After -> ContBB
12309 Builder.SetInsertPoint(CLI->getAfter(), CLI->getAfter()->begin());
12310 if (!CLI->getAfter()->hasTerminator())
12311 Builder.CreateBr(ContBB);
12312
12313 return InsertPointTy{ContBB, ContBB->begin()};
12314}
12315
12316/// Mangle the parameter part of the vector function name according to
12317/// their OpenMP classification. The mangling function is defined in
12318/// section 4.5 of the AAVFABI(2021Q1).
12319static std::string mangleVectorParameters(
12321 SmallString<256> Buffer;
12322 llvm::raw_svector_ostream Out(Buffer);
12323 for (const auto &ParamAttr : ParamAttrs) {
12324 switch (ParamAttr.Kind) {
12326 Out << 'l';
12327 break;
12329 Out << 'R';
12330 break;
12332 Out << 'U';
12333 break;
12335 Out << 'L';
12336 break;
12338 Out << 'u';
12339 break;
12341 Out << 'v';
12342 break;
12343 }
12344 if (ParamAttr.HasVarStride)
12345 Out << "s" << ParamAttr.StrideOrArg;
12346 else if (ParamAttr.Kind ==
12348 ParamAttr.Kind ==
12350 ParamAttr.Kind ==
12352 ParamAttr.Kind ==
12354 // Don't print the step value if it is not present or if it is
12355 // equal to 1.
12356 if (ParamAttr.StrideOrArg < 0)
12357 Out << 'n' << -ParamAttr.StrideOrArg;
12358 else if (ParamAttr.StrideOrArg != 1)
12359 Out << ParamAttr.StrideOrArg;
12360 }
12361
12362 if (!!ParamAttr.Alignment)
12363 Out << 'a' << ParamAttr.Alignment;
12364 }
12365
12366 return std::string(Out.str());
12367}
12368
12370 llvm::Function *Fn, unsigned NumElts, const llvm::APSInt &VLENVal,
12372 struct ISADataTy {
12373 char ISA;
12374 unsigned VecRegSize;
12375 };
12376 ISADataTy ISAData[] = {
12377 {'b', 128}, // SSE
12378 {'c', 256}, // AVX
12379 {'d', 256}, // AVX2
12380 {'e', 512}, // AVX512
12381 };
12383 switch (Branch) {
12385 Masked.push_back('N');
12386 Masked.push_back('M');
12387 break;
12389 Masked.push_back('N');
12390 break;
12392 Masked.push_back('M');
12393 break;
12394 }
12395 for (char Mask : Masked) {
12396 for (const ISADataTy &Data : ISAData) {
12398 llvm::raw_svector_ostream Out(Buffer);
12399 Out << "_ZGV" << Data.ISA << Mask;
12400 if (!VLENVal) {
12401 assert(NumElts && "Non-zero simdlen/cdtsize expected");
12402 Out << llvm::APSInt::getUnsigned(Data.VecRegSize / NumElts);
12403 } else {
12404 Out << VLENVal;
12405 }
12406 Out << mangleVectorParameters(ParamAttrs);
12407 Out << '_' << Fn->getName();
12408 Fn->addFnAttr(Out.str());
12409 }
12410 }
12411}
12412
12413// Function used to add the attribute. The parameter `VLEN` is templated to
12414// allow the use of `x` when targeting scalable functions for SVE.
12415template <typename T>
12416static void addAArch64VectorName(T VLEN, StringRef LMask, StringRef Prefix,
12417 char ISA, StringRef ParSeq,
12418 StringRef MangledName, bool OutputBecomesInput,
12419 llvm::Function *Fn) {
12420 SmallString<256> Buffer;
12421 llvm::raw_svector_ostream Out(Buffer);
12422 Out << Prefix << ISA << LMask << VLEN;
12423 if (OutputBecomesInput)
12424 Out << 'v';
12425 Out << ParSeq << '_' << MangledName;
12426 Fn->addFnAttr(Out.str());
12427}
12428
12429// Helper function to generate the Advanced SIMD names depending on the value
12430// of the NDS when simdlen is not present.
12431static void addAArch64AdvSIMDNDSNames(unsigned NDS, StringRef Mask,
12432 StringRef Prefix, char ISA,
12433 StringRef ParSeq, StringRef MangledName,
12434 bool OutputBecomesInput,
12435 llvm::Function *Fn) {
12436 switch (NDS) {
12437 case 8:
12438 addAArch64VectorName(8, Mask, Prefix, ISA, ParSeq, MangledName,
12439 OutputBecomesInput, Fn);
12440 addAArch64VectorName(16, Mask, Prefix, ISA, ParSeq, MangledName,
12441 OutputBecomesInput, Fn);
12442 break;
12443 case 16:
12444 addAArch64VectorName(4, Mask, Prefix, ISA, ParSeq, MangledName,
12445 OutputBecomesInput, Fn);
12446 addAArch64VectorName(8, Mask, Prefix, ISA, ParSeq, MangledName,
12447 OutputBecomesInput, Fn);
12448 break;
12449 case 32:
12450 addAArch64VectorName(2, Mask, Prefix, ISA, ParSeq, MangledName,
12451 OutputBecomesInput, Fn);
12452 addAArch64VectorName(4, Mask, Prefix, ISA, ParSeq, MangledName,
12453 OutputBecomesInput, Fn);
12454 break;
12455 case 64:
12456 case 128:
12457 addAArch64VectorName(2, Mask, Prefix, ISA, ParSeq, MangledName,
12458 OutputBecomesInput, Fn);
12459 break;
12460 default:
12461 llvm_unreachable("Scalar type is too wide.");
12462 }
12463}
12464
12465/// Emit vector function attributes for AArch64, as defined in the AAVFABI.
12467 llvm::Function *Fn, unsigned UserVLEN,
12469 char ISA, unsigned NarrowestDataSize, bool OutputBecomesInput) {
12470 assert((ISA == 'n' || ISA == 's') && "Expected ISA either 's' or 'n'.");
12471
12472 // Sort out parameter sequence.
12473 const std::string ParSeq = mangleVectorParameters(ParamAttrs);
12474 StringRef Prefix = "_ZGV";
12475 StringRef MangledName = Fn->getName();
12476
12477 // Generate simdlen from user input (if any).
12478 if (UserVLEN) {
12479 if (ISA == 's') {
12480 // SVE generates only a masked function.
12481 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
12482 OutputBecomesInput, Fn);
12483 return;
12484 }
12485
12486 switch (Branch) {
12488 addAArch64VectorName(UserVLEN, "N", Prefix, ISA, ParSeq, MangledName,
12489 OutputBecomesInput, Fn);
12490 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
12491 OutputBecomesInput, Fn);
12492 break;
12494 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
12495 OutputBecomesInput, Fn);
12496 break;
12498 addAArch64VectorName(UserVLEN, "N", Prefix, ISA, ParSeq, MangledName,
12499 OutputBecomesInput, Fn);
12500 break;
12501 }
12502 return;
12503 }
12504
12505 if (ISA == 's') {
12506 // SVE, section 3.4.1, item 1.
12507 addAArch64VectorName("x", "M", Prefix, ISA, ParSeq, MangledName,
12508 OutputBecomesInput, Fn);
12509 return;
12510 }
12511
12512 switch (Branch) {
12514 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "N", Prefix, ISA, ParSeq,
12515 MangledName, OutputBecomesInput, Fn);
12516 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "M", Prefix, ISA, ParSeq,
12517 MangledName, OutputBecomesInput, Fn);
12518 break;
12520 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "M", Prefix, ISA, ParSeq,
12521 MangledName, OutputBecomesInput, Fn);
12522 break;
12524 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "N", Prefix, ISA, ParSeq,
12525 MangledName, OutputBecomesInput, Fn);
12526 break;
12527 }
12528}
12529
12530//===----------------------------------------------------------------------===//
12531// OffloadEntriesInfoManager
12532//===----------------------------------------------------------------------===//
12533
12535 return OffloadEntriesTargetRegion.empty() &&
12536 OffloadEntriesDeviceGlobalVar.empty();
12537}
12538
12539unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
12540 const TargetRegionEntryInfo &EntryInfo) const {
12541 auto It = OffloadEntriesTargetRegionCount.find(
12542 getTargetRegionEntryCountKey(EntryInfo));
12543 if (It == OffloadEntriesTargetRegionCount.end())
12544 return 0;
12545 return It->second;
12546}
12547
12548void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
12549 const TargetRegionEntryInfo &EntryInfo) {
12550 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
12551 EntryInfo.Count + 1;
12552}
12553
12554/// Initialize target region entry.
12556 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
12557 OffloadEntriesTargetRegion[EntryInfo] =
12558 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
12560 ++OffloadingEntriesNum;
12561}
12562
12564 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
12566 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
12567
12568 // Update the EntryInfo with the next available count for this location.
12569 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
12570
12571 // If we are emitting code for a target, the entry is already initialized,
12572 // only has to be registered.
12573 if (OMPBuilder->Config.isTargetDevice()) {
12574 // This could happen if the device compilation is invoked standalone.
12575 if (!hasTargetRegionEntryInfo(EntryInfo)) {
12576 return;
12577 }
12578 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
12579 Entry.setAddress(Addr);
12580 Entry.setID(ID);
12581 Entry.setFlags(Flags);
12582 } else {
12584 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
12585 return;
12586 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
12587 "Target region entry already registered!");
12588 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
12589 OffloadEntriesTargetRegion[EntryInfo] = Entry;
12590 ++OffloadingEntriesNum;
12591 }
12592 incrementTargetRegionEntryInfoCount(EntryInfo);
12593}
12594
12596 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
12597
12598 // Update the EntryInfo with the next available count for this location.
12599 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
12600
12601 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
12602 if (It == OffloadEntriesTargetRegion.end()) {
12603 return false;
12604 }
12605 // Fail if this entry is already registered.
12606 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
12607 return false;
12608 return true;
12609}
12610
12612 const OffloadTargetRegionEntryInfoActTy &Action) {
// NOTE(review): the line naming this function is missing from this listing.
12613 // Scan all target region entries and perform the provided action: Action is
// invoked once per element of OffloadEntriesTargetRegion with the element's
// key and value.
12614 for (const auto &It : OffloadEntriesTargetRegion) {
12615 Action(It.first, It.second);
12616 }
12617}
12618
12620 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
// NOTE(review): the line naming this function is missing from this listing.
// Adds an entry keyed by Name, constructed from Order and Flags, to
// OffloadEntriesDeviceGlobalVar. Presumably try_emplace leaves an existing
// entry for Name untouched (confirm against the container's documentation).
// The global entry counter is incremented unconditionally.
12621 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
12622 ++OffloadingEntriesNum;
12623}
12624
12626 StringRef VarName, Constant *Addr, int64_t VarSize,
// NOTE(review): the line naming this function and the line that declares the
// remaining parameters (Flags and Linkage are used below) are missing from
// this listing. The visible body records address, size and linkage for the
// device global variable VarName.
12628 if (OMPBuilder->Config.isTargetDevice()) {
12629 // This could happen if the device compilation is invoked standalone.
12630 if (!hasDeviceGlobalVarEntryInfo(VarName))
12631 return;
12632 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
// An address is already recorded: only fill in size and linkage if the size
// is still zero, and leave the address alone.
// NOTE(review): hasDeviceGlobalVarEntryInfo(VarName) was already checked a few
// lines up, so the second operand of this condition looks redundant.
12633 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
12634 if (Entry.getVarSize() == 0) {
12635 Entry.setVarSize(VarSize);
12636 Entry.setLinkage(Linkage);
12637 }
12638 return;
12639 }
// No address recorded yet: complete the existing entry.
12640 Entry.setVarSize(VarSize);
12641 Entry.setLinkage(Linkage);
12642 Entry.setAddress(Addr);
12643 } else {
12644 if (hasDeviceGlobalVarEntryInfo(VarName)) {
// The variable is already known: it must be valid and carry the same Flags.
// Size and linkage are only filled in if the size is still zero.
12645 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
12646 assert(Entry.isValid() && Entry.getFlags() == Flags &&
12647 "Entry not initialized!");
12648 if (Entry.getVarSize() == 0) {
12649 Entry.setVarSize(VarSize);
12650 Entry.setLinkage(Linkage);
12651 }
12652 return;
12653 }
// NOTE(review): the opening of this `if` and parts of its condition on Flags
// are missing from this listing. The two branches differ only in the last
// try_emplace argument: VarName.str() in the first, "" in the second.
12655 Flags ==
12657 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
12658 Addr, VarSize, Flags, Linkage,
12659 VarName.str());
12660 else
12661 OffloadEntriesDeviceGlobalVar.try_emplace(
12662 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
// A new entry consumed the current global entry number; advance the counter.
12663 ++OffloadingEntriesNum;
12664 }
12665}
12666
// NOTE(review): the signature lines of this function are missing from this
// listing.
12669 // Scan all device global variable entries and perform the provided action:
// Action is invoked once per element of OffloadEntriesDeviceGlobalVar with the
// element's key and value.
12670 for (const auto &E : OffloadEntriesDeviceGlobalVar)
12671 Action(E.getKey(), E.getValue());
12672}
12673
12674//===----------------------------------------------------------------------===//
12675// CanonicalLoopInfo
12676//===----------------------------------------------------------------------===//
12677
// Appends the loop's control blocks to BBs in the order: preheader, header,
// cond, latch, exit, after.
// NOTE(review): the line declaring the BBs parameter is missing from this
// listing.
12678void CanonicalLoopInfo::collectControlBlocks(
12680 // We only count those BBs as control blocks for which we do not need to
12681 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
12682 // flow. For consistency, this also means we do not add the Body block, which
12683 // is just the entry to the body code.
// Make room up front for the six blocks appended below.
12684 BBs.reserve(BBs.size() + 6);
12685 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
12686}
12687
// NOTE(review): the signature line of this function is missing from this
// listing. The body returns the first predecessor of Header that is not the
// Latch, and treats the absence of such a predecessor as unreachable.
12689 assert(isValid() && "Requires a valid canonical loop");
12690 for (BasicBlock *Pred : predecessors(Header)) {
12691 if (Pred != Latch)
12692 return Pred;
12693 }
12694 llvm_unreachable("Missing preheader");
12695}
12696
12697void CanonicalLoopInfo::setTripCount(Value *TripCount) {
12698 assert(isValid() && "Requires a valid canonical loop");
12699
12700 Instruction *CmpI = &getCond()->front();
12701 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
12702 CmpI->setOperand(1, TripCount);
12703
12704#ifndef NDEBUG
12705 assertOK();
12706#endif
12707}
12708
12709void CanonicalLoopInfo::mapIndVar(
12710 llvm::function_ref<Value *(Instruction *)> Updater) {
12711 assert(isValid() && "Requires a valid canonical loop");
12712
12713 Instruction *OldIV = getIndVar();
12714
12715 // Record all uses excluding those introduced by the updater. Uses by the
12716 // CanonicalLoopInfo itself to keep track of the number of iterations are
12717 // excluded.
12718 SmallVector<Use *> ReplacableUses;
12719 for (Use &U : OldIV->uses()) {
12720 auto *User = dyn_cast<Instruction>(U.getUser());
12721 if (!User)
12722 continue;
12723 if (User->getParent() == getCond())
12724 continue;
12725 if (User->getParent() == getLatch())
12726 continue;
12727 ReplacableUses.push_back(&U);
12728 }
12729
12730 // Run the updater that may introduce new uses
12731 Value *NewIV = Updater(OldIV);
12732
12733 // Replace the old uses with the value returned by the updater.
12734 for (Use *U : ReplacableUses)
12735 U->set(NewIV);
12736
12737#ifndef NDEBUG
12738 assertOK();
12739#endif
12740}
12741
// Consistency self-check of the canonical loop skeleton. Every check below is
// an assert, and the whole body is compiled only when NDEBUG is not defined.
// NOTE(review): the line carrying this function's signature is missing from
// this listing.
12743 #ifndef NDEBUG
12744 // No constraints if this object currently does not describe a loop.
12745 if (!isValid())
12746 return;
12747
12748 BasicBlock *Preheader = getPreheader();
12749 BasicBlock *Body = getBody();
12750 BasicBlock *After = getAfter();
12751
12752 // Verify standard control-flow we use for OpenMP loops.
12753 assert(Preheader);
12754 assert(isa<UncondBrInst>(Preheader->getTerminator()) &&
12755 "Preheader must terminate with unconditional branch");
12756 assert(Preheader->getSingleSuccessor() == Header &&
12757 "Preheader must jump to header");
12758
12759 assert(Header);
12760 assert(isa<UncondBrInst>(Header->getTerminator()) &&
12761 "Header must terminate with unconditional branch");
12762 assert(Header->getSingleSuccessor() == Cond &&
12763 "Header must jump to exiting block");
12764
12765 assert(Cond);
12766 assert(Cond->getSinglePredecessor() == Header &&
12767 "Exiting block only reachable from header");
12768
12769 assert(isa<CondBrInst>(Cond->getTerminator()) &&
12770 "Exiting block must terminate with conditional branch");
12771 assert(cast<CondBrInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
12772 "Exiting block's first successor jump to the body");
12773 assert(cast<CondBrInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
12774 "Exiting block's second successor must exit the loop");
12775
12776 assert(Body);
12777 assert(Body->getSinglePredecessor() == Cond &&
12778 "Body only reachable from exiting block");
12779 assert(!isa<PHINode>(Body->front()));
12780
12781 assert(Latch);
12782 assert(isa<UncondBrInst>(Latch->getTerminator()) &&
12783 "Latch must terminate with unconditional branch");
12784 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
12785 // TODO: To support simple redirecting of the end of the body code that has
12786 // multiple; introduce another auxiliary basic block like preheader and after.
12787 assert(Latch->getSinglePredecessor() != nullptr);
12788 assert(!isa<PHINode>(Latch->front()));
12789
12790 assert(Exit);
12791 assert(isa<UncondBrInst>(Exit->getTerminator()) &&
12792 "Exit block must terminate with unconditional branch");
12793 assert(Exit->getSingleSuccessor() == After &&
12794 "Exit block must jump to after block");
12795
12796 assert(After);
12797 assert(After->getSinglePredecessor() == Exit &&
12798 "After block only reachable from exit block");
12799 assert(After->empty() || !isa<PHINode>(After->front()));
12800
// The induction variable must be an integer PHI in the header that starts at
// zero when entered from the preheader and takes its next value from the latch.
12801 Instruction *IndVar = getIndVar();
12802 assert(IndVar && "Canonical induction variable not found?");
12803 assert(isa<IntegerType>(IndVar->getType()) &&
12804 "Induction variable must be an integer");
12805 assert(cast<PHINode>(IndVar)->getParent() == Header &&
12806 "Induction variable must be a PHI in the loop header");
12807 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
12808 assert(
12809 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
12810 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
12811
// The value coming from the latch must be `IndVar + 1`, computed in the latch.
12812 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
12813 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
12814 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
12815 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
12816 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
12817 ->isOne());
12818
12819 Value *TripCount = getTripCount();
12820 assert(TripCount && "Loop trip count not found?");
12821 assert(IndVar->getType() == TripCount->getType() &&
12822 "Trip count and induction variable must have the same type");
12823
// The first instruction of the condition block must be
// `icmp ult IndVar, TripCount`. ICMP_ULT is the unsigned predicate, so the
// message below says "unsigned".
12824 auto *CmpI = cast<CmpInst>(&Cond->front());
12825 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
12826 "Exit condition must be an unsigned less-than comparison");
12827 assert(CmpI->getOperand(0) == IndVar &&
12828 "Exit condition must compare the induction variable");
12829 assert(CmpI->getOperand(1) == TripCount &&
12830 "Exit condition must compare with the trip count");
12831#endif
12832}
12833
// NOTE(review): the signature line of this function is missing from this
// listing. The body clears the four block pointers held by this object
// (Header, Cond, Latch, Exit).
12835 Header = nullptr;
12836 Cond = nullptr;
12837 Latch = nullptr;
12838 Exit = nullptr;
12839}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
@ ParamAttr
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static Value * removeASCastIfPresent(Value *V)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn, bool NoLoop)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true, bool Is64Bit=false)
static Function * createTargetParallelWrapper(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn)
Create wrapper function used to gather the outlined function's argument structure from a shared buffe...
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI, LLVMContext &Ctx, Loop *Loop, LoopInfo &LoopInfo, SmallVector< Metadata * > &LoopMDList)
static void addAArch64VectorName(T VLEN, StringRef LMask, StringRef Prefix, char ISA, StringRef ParSeq, StringRef MangledName, bool OutputBecomesInput, llvm::Function *Fn)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static std::string mangleVectorParameters(ArrayRef< llvm::OpenMPIRBuilder::DeclareSimdAttrTy > ParamAttrs)
Mangle the parameter part of the vector function name according to their OpenMP classification.
static bool isGenericKernel(Function &Fn)
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType, bool NoLoop)
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasDistScheduleChunks)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause, bool HasDistScheduleChunks)
Determine the schedule type using schedule and ordering clause arguments.
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static std::optional< omp::OMPTgtExecModeFlags > getTargetKernelExecMode(Function &Kernel)
Given a function, if it represents the entry point of a target kernel, this returns the execution mod...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static cl::opt< bool > UseDefaultMaxThreads("openmp-ir-builder-use-default-max-threads", cl::Hidden, cl::desc("Use a default max threads if none is provided."), cl::init(true))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, ArrayRef< BasicBlock * > DeallocBlocks, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const OpenMPIRBuilder::DependenciesInfo &Dependencies, bool HasNoWait, Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback)
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static void hoistNonEntryAllocasToEntryBlock(llvm::BasicBlock &Block)
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static bool hasGridValue(const Triple &T)
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static void addAArch64AdvSIMDNDSNames(unsigned NDS, StringRef Mask, StringRef Prefix, char ISA, StringRef ParSeq, StringRef MangledName, bool OutputBecomesInput, llvm::Function *Fn)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
std::unordered_set< BasicBlock * > BlockSet
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file defines less commonly used SmallVector utilities.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:119
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getUnsigned(uint64_t X)
Definition APSInt.h:349
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
LLVM_ABI bool isArrayAllocation() const
Return true if there is an allocation size parameter to the allocation instruction that is not 1.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
iterator end() const
Definition ArrayRef.h:130
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
Class to represent array types.
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
LLVM_ABI std::pair< LoadInst *, AllocaInst * > EmitAtomicLoadLibcall(AtomicOrdering AO)
Definition Atomic.cpp:109
LLVM_ABI void EmitAtomicStoreLibcall(AtomicOrdering AO, Value *Source)
Definition Atomic.cpp:150
an instruction that atomically reads a memory location, combines it with another value,...
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMaximumNum
*p = maximumnum(old, v) maximumnum matches the behavior of llvm.maximumnum.
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ FMinimumNum
*p = minimumnum(old, v) minimumnum matches the behavior of llvm.minimumnum.
@ Nand
*p = ~(old & v)
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:407
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
iterator end()
Definition BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:461
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:477
bool hasTerminator() const LLVM_READONLY
Returns whether the block has a terminator.
Definition BasicBlock.h:232
bool empty() const
Definition BasicBlock.h:483
const Instruction & back() const
Definition BasicBlock.h:486
LLVM_ABI BasicBlock * splitBasicBlockBefore(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction and insert the new basic blo...
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI void insertDbgRecordBefore(DbgRecord *DR, InstListType::iterator Here)
Insert a DbgRecord into a block at the position given by Here.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:484
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
const Instruction * getTerminatorOrNull() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:248
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:479
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:388
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:659
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
void setDoesNotThrow()
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Class to represent the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
LLVM_ABI void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
void setLastIter(Value *IterVar)
Sets the last iteration variable for this loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
LLVM_ABI void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
LLVM_ABI BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ ICMP_SLT
signed less than
Definition InstrTypes.h:769
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:770
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:746
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:744
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:763
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:767
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:766
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:537
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:868
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true, bool ByteString=false)
This method constructs a CDS and initializes it with a text string.
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI ConstantFP * getZero(Type *Ty, bool Negative=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:579
Record of a variable value-assignment, aka a non instruction representation of the dbg....
A debug info location.
Definition DebugLoc.h:123
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
Represents either an error or a value T.
Definition ErrorOr.h:56
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
reference get()
Returns a reference to the stored T value.
Definition Error.h:582
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:638
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:168
const BasicBlock & getEntryBlock() const
Definition Function.h:809
Argument * arg_iterator
Definition Function.h:73
bool empty() const
Definition Function.h:859
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:445
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:763
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:354
const Function & getFunction() const
Definition Function.h:166
iterator begin()
Definition Function.h:853
arg_iterator arg_begin()
Definition Function.h:868
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:357
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:666
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:755
size_t arg_size() const
Definition Function.h:901
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:216
iterator end()
Definition Function.h:855
void setCallingConv(CallingConv::ID CC)
Definition Function.h:276
Argument * getArg(unsigned i) const
Definition Function.h:886
bool hasMetadata() const
Return true if this GlobalObject has any metadata attached to it.
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
InsertPoint - A saved insertion point.
Definition IRBuilder.h:298
BasicBlock * getBlock() const
Definition IRBuilder.h:313
bool isSet() const
Returns true if this insert point is set.
Definition IRBuilder.h:311
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:314
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
InsertPoint saveIP() const
Returns the current insert point.
Definition IRBuilder.h:318
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition IRBuilder.h:330
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2858
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:587
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:996
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1080
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1580
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1442
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:614
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:38
size_type size() const
Definition MapVector.h:58
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:287
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:280
A tuple of MDNodes.
Definition Metadata.h:1760
iterator_range< op_iterator > operands()
Definition Metadata.h:1856
LLVM_ABI void addOperand(MDNode *M)
Class that manages information about offload code regions and data.
function_ref< void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)> OffloadDeviceGlobalVarEntryInfoActTy
Applies action Action on all registered entries.
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions NOTE: Currently not used as a part o...
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
LLVM_ABI void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
LLVM_ABI void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
LLVM_ABI void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
LLVM_ABI bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
LLVM_ABI void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
LLVM_ABI void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
LLVM_ABI void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
@ OMPTargetGlobalVarEntryLink
Mark the entry as a declare target link.
@ OMPTargetGlobalVarEntryTo
Mark the entry as a 'to' declare target.
@ OMPTargetGlobalVarEntryIndirectVTable
Mark the entry as a declare target indirect vtable.
function_ref< void(const TargetRegionEntryInfo &EntryInfo, const OffloadEntryInfoTargetRegion &)> OffloadTargetRegionEntryInfoActTy
Applies action Action on all registered entries.
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
LLVM_ABI bool empty() const
Return true if there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
std::optional< bool > IsGPU
Flag for specifying if the compilation is done for an accelerator.
LLVM_ABI int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
std::optional< bool > OpenMPOffloadMandatory
Flag for specifying if offloading is mandatory.
LLVM_ABI void setHasRequiresReverseOffload(bool Value)
LLVM_ABI bool hasRequiresUnifiedSharedMemory() const
LLVM_ABI void setHasRequiresUnifiedSharedMemory(bool Value)
unsigned getDefaultTargetAS() const
LLVM_ABI bool hasRequiresDynamicAllocators() const
LLVM_ABI void setHasRequiresUnifiedAddress(bool Value)
LLVM_ABI void setHasRequiresDynamicAllocators(bool Value)
LLVM_ABI bool hasRequiresReverseOffload() const
LLVM_ABI bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
LLVM_ABI InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for 'omp ordered [threads | simd]'.
LLVM_ABI void emitAArch64DeclareSimdFunction(llvm::Function *Fn, unsigned VLENVal, llvm::ArrayRef< DeclareSimdAttrTy > ParamAttrs, DeclareSimdBranch Branch, char ISA, unsigned NarrowestDataSize, bool OutputBecomesInput)
Emit AArch64 vector-function ABI attributes for a declare simd function.
LLVM_ABI Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
LLVM_ABI InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for 'omp cancel'.
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
LLVM_ABI CallInst * createOMPAllocShared(const LocationDescription &Loc, Value *Size, const Twine &Name=Twine(""))
Create a runtime call for kmpc_alloc_shared.
ReductionGenCBKind
Enum class for the ReductionGen CallBack type to be used.
LLVM_ABI CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
LLVM_ABI void createTaskyield(const LocationDescription &Loc)
Generator for 'omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
LLVM_ABI void emitBranch(BasicBlock *Target)
LLVM_ABI Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective)
Generate control flow and cleanup for cancellation.
static LLVM_ABI void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
LLVM_ABI Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attributes of the function. Returns the FunctionID.
LLVM_ABI GlobalVariable * emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode)
Emit the kernel execution mode.
LLVM_ABI void initialize()
Initialize the internal state, this will put structure types and potentially other helpers into the ...
LLVM_ABI void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
LLVM_ABI InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic write for : X = Expr — Only Scalar data types.
LLVM_ABI void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
function_ref< MapInfosTy &(InsertPointTy CodeGenIP)> GenMapInfoCallbackTy
Callback type for creating the map infos for the kernel parameters.
LLVM_ABI Error emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
LLVM_ABI void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
function_ref< Error(InsertPointTy CodeGenIP, Value *IndVar)> LoopBodyGenCallbackTy
Callback type for loop body code generation.
LLVM_ABI InsertPointOrErrorTy emitScanReduction(const LocationDescription &Loc, ArrayRef< llvm::OpenMPIRBuilder::ReductionInfo > ReductionInfos, ScanInfo *ScanRedInfo)
This function performs the scan reduction of the values updated in the input phase.
LLVM_ABI void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
LLVM_ABI InsertPointOrErrorTy createScope(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait)
Generator for 'omp scope'.
static LLVM_ABI std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
Read the bounds on threads for Kernel.
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
LLVM_ABI CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
LLVM_ABI void emitUsed(StringRef Name, ArrayRef< llvm::WeakTrackingVH > List)
Emit the llvm.used metadata.
LLVM_ABI InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for 'omp single'.
LLVM_ABI InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for 'omp teams'.
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be free'd.
LLVM_ABI void createTaskwait(const LocationDescription &Loc)
Generator for 'omp taskwait'.
LLVM_ABI llvm::StructType * getKmpTaskAffinityInfoTy()
Return the LLVM struct type matching runtime kmp_task_affinity_info_t.
LLVM_ABI CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
LLVM_ABI std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform-specific separators.
LLVM_ABI FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static LLVM_ABI void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
LLVM_ABI InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, ArrayRef< BasicBlock * > DeallocBlocks, TargetDataInfo &Info, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, CustomMapperCallbackTy CustomMapperCB, const DependenciesInfo &Dependencies={}, bool HasNowait=false, Value *DynCGroupMem=nullptr, omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback=omp::OMPDynGroupprivateFallbackType::Abort)
Generator for 'omp target'.
LLVM_ABI void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
LLVM_ABI omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
LLVM_ABI void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
LLVM_ABI void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the allocas instruction used in call to mapper functions.
LLVM_ABI Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
LLVM_ABI Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
LLVM_ABI InsertPointOrErrorTy createIteratorLoop(LocationDescription Loc, llvm::Value *TripCount, IteratorBodyGenTy BodyGen, llvm::StringRef Name="iterator")
Create a canonical iterator loop at the current insertion point.
LLVM_ABI Expected< SmallVector< llvm::CanonicalLoopInfo * > > createCanonicalScanLoops(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo)
Generator for the control flow structure of OpenMP canonical loops if the parent directive has an ...
LLVM_ABI FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< BasicBlock * > DeallocBlocks)> TargetBodyGenCallbackTy
LLVM_ABI void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
function_ref< Error(Value *DeviceID, Value *RTLoc, IRBuilderBase::InsertPoint TargetTaskAllocaIP)> TargetTaskBodyCallbackTy
Callback type for generating the bodies of device directives that require outer target tasks (e....
Expected< MapInfosTy & > MapInfosOrErrorTy
bool HandleFPNegZero
Emit atomic compare for constructs: — Only scalar data types cond-expr-stmt: x = x ordop expr ?
LLVM_ABI void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
LLVM_ABI void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
LLVM_ABI InsertPointOrErrorTy createDistribute(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< BasicBlock * > DeallocBlocks, BodyGenCallbackTy BodyGenCB)
Generator for 'omp distribute'.
LLVM_ABI Expected< Function * > emitUserDefinedMapper(function_ref< MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB, bool PreserveMemberOfFlags=false)
Emit the user-defined mapper function.
LLVM_ABI InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< BasicBlock * > DeallocBlocks, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, const DependenciesInfo &Dependencies={}, const AffinityData &Affinities={}, bool Mergeable=false, Value *EventHandle=nullptr, Value *Priority=nullptr)
Generator for 'omp task'.
function_ref< Expected< Function * >(unsigned int)> CustomMapperCallbackTy
LLVM_ABI InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
LLVM_ABI InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for 'omp ordered depend (source | sink)'.
LLVM_ABI InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original, Value &Inner, Value *&ReplVal)> PrivatizeCallbackTy
Callback type for variable privatization (think copy & default constructor).
LLVM_ABI bool isFinalized()
Check whether the finalize function has already run.
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
LLVM_ABI std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
LLVM_ABI CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
LLVM_ABI Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={}, ArrayRef< BasicBlock * > DeallocBlocks={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy. Here is the logic: if (Cond) { Th...
LLVM_ABI void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
LLVM_ABI Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
void addOutlineInfo(std::unique_ptr< OutlineInfo > &&OI)
Add a new region that will be outlined later.
LLVM_ABI InsertPointTy createTargetInit(const LocationDescription &Loc, const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs)
The omp target interface.
LLVM_ABI InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false)
Generator for 'omp reduction'.
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
LLVM_ABI CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
LLVM_ABI FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_alloc.
LLVM_ABI void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
LLVM_ABI InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp section'.
LLVM_ABI InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< BasicBlock * > DeallocBlocks, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
LLVM_ABI InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< BasicBlock * > DeallocBlocks, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for 'omp parallel'.
function_ref< InsertPointOrErrorTy(InsertPointTy)> EmitFallbackCallbackTy
Callback function type for functions emitting the host fallback code that is executed when the kernel...
static LLVM_ABI TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, vfs::FileSystem &VFS, StringRef ParentName="")
Creates a unique info for a target entry from a provided filename and line number.
LLVM_ABI void emitTaskDependency(IRBuilderBase &Builder, Value *Entry, const DependData &Dep)
Store one kmp_depend_info entry at the given Entry pointer.
LLVM_ABI void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
LLVM_ABI Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
LLVM_ABI InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp master'.
LLVM_ABI InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< BasicBlock * > DeallocBlocks, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for 'omp target data'.
CallInst * createRuntimeFunctionCall(FunctionCallee Callee, ArrayRef< Value * > Args, StringRef Name="")
LLVM_ABI InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
LLVM_ABI InsertPointOrErrorTy createCancellationPoint(const LocationDescription &Loc, omp::Directive CanceledDirective)
Generator for 'omp cancellation point'.
LLVM_ABI CallInst * createOMPAlignedAlloc(const LocationDescription &Loc, Value *Align, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_align_alloc.
LLVM_ABI FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI InsertPointOrErrorTy createScan(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< llvm::Value * > ScanVars, ArrayRef< llvm::Type * > ScanVarsType, bool IsInclusive, ScanInfo *ScanRedInfo)
This directive splits and directs the control flow to input phase blocks or scan phase blocks based on...
LLVM_ABI CallInst * createOMPFreeShared(const LocationDescription &Loc, Value *Addr, Value *Size, const Twine &Name=Twine(""))
Create a runtime call for kmpc_free_shared.
LLVM_ABI CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
LLVM_ABI GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, std::optional< unsigned > AddressSpace={})
Gets (if a variable with the given name already exists) or creates an internal global variable with the spe...
LLVM_ABI GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
std::forward_list< ScanInfo > ScanInfos
Collection of owned ScanInfo objects that eventually need to be freed.
static LLVM_ABI void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI Value * calculateCanonicalLoopTripCount(const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, const Twine &Name="loop")
Calculate the trip count of a canonical loop.
LLVM_ABI InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
LLVM_ABI void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
LLVM_ABI Error emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
LLVM_ABI Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
LLVM_ABI InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for 'omp critical'.
LLVM_ABI void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static LLVM_ABI unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
LLVM_ABI unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop, bool NoLoop=false, bool HasDistSchedule=false, Value *DistScheduleChunkSize=nullptr)
Modifies the canonical loop to be a workshare loop.
LLVM_ABI InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: — Only Scalar data types V = X; X = X BinOp Expr ,...
LLVM_ABI void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
LLVM_ABI void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
SmallVector< std::unique_ptr< OutlineInfo >, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
LLVM_ABI InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X For complex Operations: X = ...
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
LLVM_ABI InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
LLVM_ABI GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
LLVM_ABI CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
LLVM_ABI GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
LLVM_ABI void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
LLVM_ABI InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for 'omp masked'.
LLVM_ABI Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
function_ref< Expected< InsertPointTy >( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DestPtr, Value *SrcPtr)> TaskDupCallbackTy
Callback type for task duplication function code generation.
LLVM_ABI Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
llvm::function_ref< llvm::Error( InsertPointTy BodyIP, llvm::Value *LinearIV)> IteratorBodyGenTy
LLVM_ABI InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
LLVM_ABI FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
LLVM_ABI void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
BodyGenTy
Type of BodyGen to use for region codegen.
LLVM_ABI CanonicalLoopInfo * fuseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops)
Fuse a sequence of loops.
LLVM_ABI void emitX86DeclareSimdFunction(llvm::Function *Fn, unsigned NumElements, const llvm::APSInt &VLENVal, llvm::ArrayRef< DeclareSimdAttrTy > ParamAttrs, DeclareSimdBranch Branch)
Emit x86 vector-function ABI attributes for a declare simd function.
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions whose constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static LLVM_ABI std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write bounds on teams for Kernel.
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
LLVM_ABI InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
LLVM_ABI InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for 'omp sections'.
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
function_ref< InsertPointOrErrorTy( Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< InsertPointTy > DeallocIPs)> TargetGenArgAccessorsCallbackTy
LLVM_ABI Expected< ScanInfo * > scanInfoInitialize()
Creates a ScanInfo object, allocates and returns the pointer.
LLVM_ABI InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const DependenciesInfo &Dependencies, const TargetDataRTArgs &RTArgs, bool HasNoWait)
Generate a target-task for the target construct.
LLVM_ABI InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic Read for: V = X — Only Scalar data types.
function_ref< Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< BasicBlock * > DeallocBlocks)> BodyGenCallbackTy
Callback type for body (=inner region) code generation.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
LLVM_ABI void createFlush(const LocationDescription &Loc)
Generator for 'omp flush'.
LLVM_ABI Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
ScanInfo holds the information to assist in lowering of Scan reduction.
llvm::SmallDenseMap< llvm::Value *, llvm::Value * > * ScanBuffPtrs
Maps the private reduction variable to the pointer of the temporary buffer.
llvm::BasicBlock * OMPScanLoopExit
Exit block of loop body.
llvm::Value * IV
Keeps track of value of iteration variable for input/scan loop to be used for Scan directive lowering...
llvm::BasicBlock * OMPAfterScanBlock
Dominates the body of the loop after scan directive.
llvm::BasicBlock * OMPScanInit
Block before loop body where scan initializations are done.
llvm::BasicBlock * OMPBeforeScanBlock
Dominates the body of the loop before scan directive.
llvm::BasicBlock * OMPScanFinish
Block after loop body where scan finalizations are done.
llvm::Value * Span
Stores the span of canonical loop being lowered to be used for temporary buffer allocation or Finaliz...
bool OMPFirstScanLoop
If true, it indicates Input phase is lowered; else it indicates ScanPhase is lowered.
llvm::BasicBlock * OMPScanDispatch
Controls the flow to before or after scan blocks.
A vector that has set insertion semantics.
Definition SetVector.h:57
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:230
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition StringMap.h:260
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
std::string str() const
Get the contents as an std::string.
Definition StringRef.h:222
constexpr bool empty() const
Check if the string is empty.
Definition StringRef.h:141
constexpr size_t size() const
Get the string size.
Definition StringRef.h:144
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:471
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:270
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:636
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:483
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:689
Type * getElementType(unsigned N) const
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1051
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1111
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1125
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
LLVM_ABI unsigned getIntegerBitWidth() const
LLVM_ABI Type * getStructElementType(unsigned N) const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:286
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
Unconditional Branch instruction.
static UncondBrInst * Create(BasicBlock *Target, InsertPosition InsertBefore=nullptr)
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:151
LLVM_ABI bool canUnroll(OptimizationRemarkEmitter *ORE=nullptr, const Loop *L=nullptr) const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:173
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:393
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
User * user_back()
Definition Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:964
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:146
LLVM_ABI User * getUniqueUndroppableUser()
Return the user if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:184
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition Value.cpp:709
bool use_empty() const
Definition Value.h:346
user_iterator user_end()
Definition Value.h:410
LLVM_ABI bool replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition Value.cpp:557
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A raw_ostream that writes to an SmallVector or SmallString.
StringRef str() const
Return a StringRef for the vector contents.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
virtual llvm::ErrorOr< Status > status(const Twine &Path)=0
Get the status of the entry at Path, if one exists.
CallInst * Call
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
Flag
These should be considered private to the implementation of the MCInstrDesc class.
constexpr StringLiteral MaxNTID("nvvm.maxntid")
constexpr StringLiteral MaxClusterRank("nvvm.maxclusterrank")
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
LLVM_ABI GlobalVariable * emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr)
Definition Utility.cpp:105
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
OMPDynGroupprivateFallbackType
The fallback types for the dyn_groupprivate clause.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
@ OMP_TGT_EXEC_MODE_SPMD_NO_LOOP
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
LLVM_ABI BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition DWP.cpp:558
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:830
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:840
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:377
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:328
auto successors(const MachineBasicBlock *BB)
LLVM_ABI std::error_code inconvertibleErrorCode()
The value returned by this function can be returned from convertToErrorCode for Error values where no...
Definition Error.cpp:94
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ABI BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, DebugLoc DL, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
std::string utostr(uint64_t X, bool isNeg=false)
void * PointerTy
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
FunctionAddr VTableAddr uintptr_t uintptr_t Version
Definition InstrProf.h:334
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
Error make_error(ArgTs &&... Args)
Make an Error instance representing failure using the given error info type.
Definition Error.h:340
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
TargetTransformInfo TTI
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
@ Mul
Product of integers.
@ Add
Sum of integers.
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
ArrayRef(const T &OneElt) -> ArrayRef< T >
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1884
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
LLVM_ABI void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch, DebugLoc DL)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto predecessors(const MachineBasicBlock *BB)
auto filter_to_vector(ContainerTy &&C, PredicateFn &&Pred)
Filter a range to a SmallVector with the element types deduced.
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ Continue
Definition DWP.h:26
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0,...
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
LLVM_ABI void computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
A struct to pack the relevant information for an OpenMP affinity clause.
A struct to pack relevant information while generating atomic Ops.
A struct to pack the relevant information for an OpenMP depend clause.
omp::RTLDependenceKindTy DepKind
A struct to pack static and dynamic dependency information for a task.
Error mergeFiniBB(IRBuilderBase &Builder, BasicBlock *ExistingFiniBB)
For cases where there is an unavoidable existing finalization block (e.g.
Expected< BasicBlock * > getFiniBB(IRBuilderBase &Builder)
The basic block to which control should be transferred to implement the FiniCB.
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename,...
This structure contains combined information generated for mappable clauses, including base pointers,...
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
LLVM_ABI void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
virtual LLVM_ABI std::unique_ptr< CodeExtractor > createCodeExtractor(ArrayRef< BasicBlock * > Blocks, bool ArgsInZeroAddressSpace, Twine Suffix=Twine(""))
Create a CodeExtractor instance based on the information stored in this structure,...
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
ReductionGenDataPtrPtrCBTy DataPtrPtrGen
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * MapTypesArrayEnd
The array of map types passed to the runtime library for the end of the region, or nullptr if there a...
Value * BasePointersArray
The array of base pointers passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entir...
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
Value * DynCGroupMem
The size of the dynamic shared memory.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback
The fallback mechanism for the shared memory.
Container to pass the default attributes with which a kernel must be launched, used to set kernel att...
Container to pass LLVM IR runtime values or constants related to the number of teams and threads with...
Value * DeviceID
Device ID value used in the kernel launch.
Value * MaxThreads
'parallel' construct 'num_threads' clause value, if present and it is an SPMD kernel.
Value * LoopTripCount
Total number of iterations of the SPMD or Generic-SPMD kernel or null if it is a generic kernel.
Data structure to contain the information needed to uniquely identify a target entry.
static LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static constexpr const char * KernelNamePrefix
The prefix used for kernel names.
static LLVM_ABI const Target * lookupTarget(const Triple &TheTriple, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...