LLVM 22.0.0git
OMPIRBuilder.h
Go to the documentation of this file.
1//===- IR/OpenMPIRBuilder.h - OpenMP encoding builder for LLVM IR - C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the OpenMPIRBuilder class and helpers used as a convenient
10// way to create LLVM instructions for OpenMP directives.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
15#define LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
16
17#include "llvm/ADT/SetVector.h"
21#include "llvm/IR/CallingConv.h"
22#include "llvm/IR/DebugLoc.h"
23#include "llvm/IR/IRBuilder.h"
24#include "llvm/IR/Module.h"
25#include "llvm/IR/ValueMap.h"
28#include "llvm/Support/Error.h"
30#include <forward_list>
31#include <map>
32#include <optional>
33
34namespace llvm {
35class CanonicalLoopInfo;
36class ScanInfo;
37struct TargetRegionEntryInfo;
38class OffloadEntriesInfoManager;
39class OpenMPIRBuilder;
40class Loop;
41class LoopAnalysis;
42class LoopInfo;
43
44namespace vfs {
45class FileSystem;
46} // namespace vfs
47
48/// Move the instruction after an InsertPoint to the beginning of another
49/// BasicBlock.
50///
51/// The instructions after \p IP are moved to the beginning of \p New which must
52/// not have any PHINodes. If \p CreateBranch is true, a branch instruction to
53/// \p New will be added such that there is no semantic change. Otherwise, the
54/// \p IP insert block remains degenerate and it is up to the caller to insert a
55/// terminator. \p DL is used as the debug location for the branch instruction
56/// if one is created.
58 bool CreateBranch, DebugLoc DL);
59
60/// Splice a BasicBlock at an IRBuilder's current insertion point. Its new
61/// insert location will stick to after the instruction before the insertion
62/// point (instead of moving with the instruction the InsertPoint stores
63/// internally).
64LLVM_ABI void spliceBB(IRBuilder<> &Builder, BasicBlock *New,
65 bool CreateBranch);
66
67/// Split a BasicBlock at an InsertPoint, even if the block is degenerate
68/// (missing the terminator).
69///
70/// llvm::SplitBasicBlock and BasicBlock::splitBasicBlock require a well-formed
71/// BasicBlock. \p Name is used for the new successor block. If \p CreateBranch
/// is true, a branch to the new successor will be created such that
73/// semantically there is no change; otherwise the block of the insertion point
74/// remains degenerate and it is the caller's responsibility to insert a
75/// terminator. \p DL is used as the debug location for the branch instruction
76/// if one is created. Returns the new successor block.
77LLVM_ABI BasicBlock *splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
78 DebugLoc DL, llvm::Twine Name = {});
79
80/// Split a BasicBlock at \p Builder's insertion point, even if the block is
81/// degenerate (missing the terminator). Its new insert location will stick to
82/// after the instruction before the insertion point (instead of moving with the
83/// instruction the InsertPoint stores internally).
84LLVM_ABI BasicBlock *splitBB(IRBuilderBase &Builder, bool CreateBranch,
85 llvm::Twine Name = {});
86
87/// Split a BasicBlock at \p Builder's insertion point, even if the block is
88/// degenerate (missing the terminator). Its new insert location will stick to
89/// after the instruction before the insertion point (instead of moving with the
90/// instruction the InsertPoint stores internally).
91LLVM_ABI BasicBlock *splitBB(IRBuilder<> &Builder, bool CreateBranch,
92 llvm::Twine Name);
93
94/// Like splitBB, but reuses the current block's name for the new name.
95LLVM_ABI BasicBlock *splitBBWithSuffix(IRBuilderBase &Builder,
96 bool CreateBranch,
97 llvm::Twine Suffix = ".split");
98
99/// Captures attributes that affect generating LLVM-IR using the
100/// OpenMPIRBuilder and related classes. Note that not all attributes are
101/// required for all classes or functions. In some use cases the configuration
/// is not necessary at all, because the only functions that are called
103/// are ones that are not dependent on the configuration.
class OpenMPIRBuilderConfig {
public:
  /// Flag to define whether to generate code for the role of the OpenMP host
  /// (if set to false) or device (if set to true) in an offloading context. It
  /// is set when the -fopenmp-is-target-device compiler frontend option is
  /// specified.
  std::optional<bool> IsTargetDevice;

  /// Flag for specifying if the compilation is done for an accelerator. It is
  /// set according to the architecture of the target triple and currently only
  /// true when targeting AMDGPU or NVPTX. Today, these targets can only perform
  /// the role of an OpenMP target device, so `IsTargetDevice` must also be true
  /// if `IsGPU` is true. This restriction might be lifted if an accelerator-
  /// like target with the ability to work as the OpenMP host is added, or if
  /// the capabilities of the currently supported GPU architectures are
  /// expanded.
  std::optional<bool> IsGPU;

  /// Flag for specifying if LLVMUsed information should be emitted.
  std::optional<bool> EmitLLVMUsedMetaInfo;

  /// Flag for specifying if offloading is mandatory.
  std::optional<bool> OpenMPOffloadMandatory;

  /// First separator used between the initial two parts of a name.
  std::optional<StringRef> FirstSeparator;
  /// Separator used between all remaining consecutive parts of a name.
  std::optional<StringRef> Separator;

  /// Grid Value for the GPU target.
  std::optional<omp::GV> GridValue;

  /// When compilation is being done for the OpenMP host (i.e. `IsTargetDevice =
  /// false`), this contains the list of offloading triples associated, if any.
  SmallVector<Triple> TargetTriples;

  /// Default address space for the target.
  unsigned DefaultTargetAS = 0;

  /// Calling convention used for calls to the OpenMP runtime.
  CallingConv::ID RuntimeCC = llvm::CallingConv::C;

  LLVM_ABI OpenMPIRBuilderConfig();
  LLVM_ABI OpenMPIRBuilderConfig(bool IsTargetDevice, bool IsGPU,
                                 bool OpenMPOffloadMandatory,
                                 bool HasRequiresReverseOffload,
                                 bool HasRequiresUnifiedAddress,
                                 bool HasRequiresUnifiedSharedMemory,
                                 bool HasRequiresDynamicAllocators);

  // Getter functions that assert if the required values are not present.
  bool isTargetDevice() const {
    assert(IsTargetDevice.has_value() && "IsTargetDevice is not set");
    return *IsTargetDevice;
  }

  bool isGPU() const {
    assert(IsGPU.has_value() && "IsGPU is not set");
    return *IsGPU;
  }

  bool openMPOffloadMandatory() const {
    assert(OpenMPOffloadMandatory.has_value() &&
           "OpenMPOffloadMandatory is not set");
    return *OpenMPOffloadMandatory;
  }

  omp::GV getGridValue() const {
    assert(GridValue.has_value() && "GridValue is not set");
    return *GridValue;
  }

  unsigned getDefaultTargetAS() const { return DefaultTargetAS; }

  CallingConv::ID getRuntimeCC() const { return RuntimeCC; }

  /// Returns true if any "requires" directive clause flag is set.
  bool hasRequiresFlags() const { return RequiresFlags; }
  LLVM_ABI bool hasRequiresReverseOffload() const;
  LLVM_ABI bool hasRequiresUnifiedAddress() const;
  LLVM_ABI bool hasRequiresUnifiedSharedMemory() const;
  LLVM_ABI bool hasRequiresDynamicAllocators() const;

  /// Returns requires directive clauses as flags compatible with those expected
  /// by libomptarget.
  LLVM_ABI int64_t getRequiresFlags() const;

  // Returns the FirstSeparator if set, otherwise use the default separator
  // depending on isGPU.
  StringRef firstSeparator() const {
    if (FirstSeparator.has_value())
      return *FirstSeparator;
    if (isGPU())
      return "_";
    return ".";
  }

  // Returns the Separator if set, otherwise use the default separator depending
  // on isGPU.
  StringRef separator() const {
    if (Separator.has_value())
      return *Separator;
    if (isGPU())
      return "$";
    return ".";
  }

  void setIsTargetDevice(bool Value) { IsTargetDevice = Value; }
  void setIsGPU(bool Value) { IsGPU = Value; }
  void setEmitLLVMUsed(bool Value = true) { EmitLLVMUsedMetaInfo = Value; }
  void setOpenMPOffloadMandatory(bool Value) { OpenMPOffloadMandatory = Value; }
  void setFirstSeparator(StringRef FS) { FirstSeparator = FS; }
  void setSeparator(StringRef S) { Separator = S; }
  void setGridValue(omp::GV G) { GridValue = G; }
  void setDefaultTargetAS(unsigned AS) { DefaultTargetAS = AS; }
  void setRuntimeCC(CallingConv::ID CC) { RuntimeCC = CC; }

  LLVM_ABI void setHasRequiresReverseOffload(bool Value);
  LLVM_ABI void setHasRequiresUnifiedAddress(bool Value);
  LLVM_ABI void setHasRequiresUnifiedSharedMemory(bool Value);
  LLVM_ABI void setHasRequiresDynamicAllocators(bool Value);

private:
  /// Flags for specifying which requires directive clauses are present.
  int64_t RequiresFlags;
};
228
229/// Data structure to contain the information needed to uniquely identify
230/// a target entry.
231struct TargetRegionEntryInfo {
232 /// The prefix used for kernel names.
233 static constexpr const char *KernelNamePrefix = "__omp_offloading_";
234
235 std::string ParentName;
236 unsigned DeviceID;
237 unsigned FileID;
238 unsigned Line;
239 unsigned Count;
240
241 TargetRegionEntryInfo() : DeviceID(0), FileID(0), Line(0), Count(0) {}
242 TargetRegionEntryInfo(StringRef ParentName, unsigned DeviceID,
243 unsigned FileID, unsigned Line, unsigned Count = 0)
244 : ParentName(ParentName), DeviceID(DeviceID), FileID(FileID), Line(Line),
245 Count(Count) {}
246
247 LLVM_ABI static void
248 getTargetRegionEntryFnName(SmallVectorImpl<char> &Name, StringRef ParentName,
249 unsigned DeviceID, unsigned FileID, unsigned Line,
250 unsigned Count);
251
252 bool operator<(const TargetRegionEntryInfo &RHS) const {
253 return std::make_tuple(ParentName, DeviceID, FileID, Line, Count) <
254 std::make_tuple(RHS.ParentName, RHS.DeviceID, RHS.FileID, RHS.Line,
255 RHS.Count);
256 }
257};
258
/// Class that manages information about offload code regions and data
class OffloadEntriesInfoManager {
  /// The builder this manager is associated with.
  OpenMPIRBuilder *OMPBuilder;
  /// Number of entries registered so far.
  unsigned OffloadingEntriesNum = 0;

public:
  /// Base class of the entries info.
  class OffloadEntryInfo {
  public:
    /// Kind of a given entry.
    enum OffloadingEntryInfoKinds : unsigned {
      /// Entry is a target region.
      OffloadingEntryInfoTargetRegion = 0,
      /// Entry is a declare target variable.
      OffloadingEntryInfoDeviceGlobalVar = 1,
      /// Invalid entry info.
      OffloadingEntryInfoInvalid = ~0u
    };

  protected:
    OffloadEntryInfo() = delete;
    explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind) : Kind(Kind) {}
    explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind, unsigned Order,
                              uint32_t Flags)
        : Flags(Flags), Order(Order), Kind(Kind) {}
    ~OffloadEntryInfo() = default;

  public:
    /// An entry is valid once an emission order has been assigned to it.
    bool isValid() const { return Order != ~0u; }
    unsigned getOrder() const { return Order; }
    OffloadingEntryInfoKinds getKind() const { return Kind; }
    uint32_t getFlags() const { return Flags; }
    void setFlags(uint32_t NewFlags) { Flags = NewFlags; }
    Constant *getAddress() const { return cast_or_null<Constant>(Addr); }
    void setAddress(Constant *V) {
      assert(!Addr.pointsToAliveValue() && "Address has been set before!");
      Addr = V;
    }
    static bool classof(const OffloadEntryInfo *Info) { return true; }

  private:
    /// Address of the entity that has to be mapped for offloading.
    WeakTrackingVH Addr;

    /// Flags associated with the device global.
    uint32_t Flags = 0u;

    /// Order this entry was emitted.
    unsigned Order = ~0u;

    /// Kind of this entry; invalid until set by a derived-class constructor.
    OffloadingEntryInfoKinds Kind = OffloadingEntryInfoInvalid;
  };

  /// Return true if there are no entries defined.
  LLVM_ABI bool empty() const;
  /// Return number of entries defined so far.
  unsigned size() const { return OffloadingEntriesNum; }

  OffloadEntriesInfoManager(OpenMPIRBuilder *builder) : OMPBuilder(builder) {}

  //
  // Target region entries related.
  //

  /// Kind of the target registry entry.
  enum OMPTargetRegionEntryKind : uint32_t {
    /// Mark the entry as target region.
    OMPTargetRegionEntryTargetRegion = 0x0,
  };

  /// Target region entries info.
  class OffloadEntryInfoTargetRegion final : public OffloadEntryInfo {
    /// Address that can be used as the ID of the entry.
    Constant *ID = nullptr;

  public:
    OffloadEntryInfoTargetRegion()
        : OffloadEntryInfo(OffloadingEntryInfoTargetRegion) {}
    explicit OffloadEntryInfoTargetRegion(unsigned Order, Constant *Addr,
                                          Constant *ID,
                                          OMPTargetRegionEntryKind Flags)
        : OffloadEntryInfo(OffloadingEntryInfoTargetRegion, Order, Flags),
          ID(ID) {
      setAddress(Addr);
    }

    Constant *getID() const { return ID; }
    void setID(Constant *V) {
      assert(!ID && "ID has been set before!");
      ID = V;
    }
    static bool classof(const OffloadEntryInfo *Info) {
      return Info->getKind() == OffloadingEntryInfoTargetRegion;
    }
  };

  /// Initialize target region entry.
  /// This is ONLY needed for DEVICE compilation.
  LLVM_ABI void
  initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo,
                                  unsigned Order);
  /// Register target region entry.
  LLVM_ABI void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo,
                                              Constant *Addr, Constant *ID,
                                              OMPTargetRegionEntryKind Flags);
  /// Return true if a target region entry with the provided information
  /// exists.
  LLVM_ABI bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo,
                                         bool IgnoreAddressId = false) const;

  /// Return the Name based on \a EntryInfo using the next available Count.
  LLVM_ABI void
  getTargetRegionEntryFnName(SmallVectorImpl<char> &Name,
                             const TargetRegionEntryInfo &EntryInfo);

  /// \brief Applies action \a Action on all registered target region entries.
  typedef function_ref<void(const TargetRegionEntryInfo &EntryInfo,
                            const OffloadEntryInfoTargetRegion &)>
      OffloadTargetRegionEntryInfoActTy;
  LLVM_ABI void
  actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action);

  //
  // Device global variable entries related.
  //

  /// Kind of the global variable entry.
  enum OMPTargetGlobalVarEntryKind : uint32_t {
    /// Mark the entry as a to declare target.
    OMPTargetGlobalVarEntryTo = 0x0,
    /// Mark the entry as a to declare target link.
    OMPTargetGlobalVarEntryLink = 0x1,
    /// Mark the entry as a declare target enter.
    OMPTargetGlobalVarEntryEnter = 0x2,
    /// Mark the entry as having no declare target entry kind.
    OMPTargetGlobalVarEntryNone = 0x3,
    /// Mark the entry as a declare target indirect global.
    OMPTargetGlobalVarEntryIndirect = 0x8,
    /// Mark the entry as a register requires global.
    OMPTargetGlobalRegisterRequires = 0x10,
    /// Mark the entry as a declare target indirect vtable.
    OMPTargetGlobalVarEntryIndirectVTable = 0x20,
  };

  /// Kind of device clause for declare target variables
  /// and functions.
  /// NOTE: Currently not used as a part of a variable entry;
  /// used for Flang and Clang to interface with the variable
  /// related registration functions.
  enum OMPTargetDeviceClauseKind : uint32_t {
    /// The target is marked for all devices.
    OMPTargetDeviceClauseAny = 0x0,
    /// The target is marked for non-host devices.
    OMPTargetDeviceClauseNoHost = 0x1,
    /// The target is marked for host devices.
    OMPTargetDeviceClauseHost = 0x2,
    /// The target is marked as having no clause.
    OMPTargetDeviceClauseNone = 0x3
  };

  /// Device global variable entries info.
  class OffloadEntryInfoDeviceGlobalVar final : public OffloadEntryInfo {
    /// Size of the global variable.
    int64_t VarSize;
    /// Linkage the variable is emitted with.
    GlobalValue::LinkageTypes Linkage;
    /// Mangled name of the variable.
    const std::string VarName;

  public:
    OffloadEntryInfoDeviceGlobalVar()
        : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar) {}
    explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order,
                                             OMPTargetGlobalVarEntryKind Flags)
        : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags) {}
    explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order, Constant *Addr,
                                             int64_t VarSize,
                                             OMPTargetGlobalVarEntryKind Flags,
                                             GlobalValue::LinkageTypes Linkage,
                                             const std::string &VarName)
        : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags),
          VarSize(VarSize), Linkage(Linkage), VarName(VarName) {
      setAddress(Addr);
    }

    int64_t getVarSize() const { return VarSize; }
    StringRef getVarName() const { return VarName; }
    void setVarSize(int64_t Size) { VarSize = Size; }
    GlobalValue::LinkageTypes getLinkage() const { return Linkage; }
    void setLinkage(GlobalValue::LinkageTypes LT) { Linkage = LT; }
    static bool classof(const OffloadEntryInfo *Info) {
      return Info->getKind() == OffloadingEntryInfoDeviceGlobalVar;
    }
  };

  /// Initialize device global variable entry.
  /// This is ONLY used for DEVICE compilation.
  LLVM_ABI void initializeDeviceGlobalVarEntryInfo(
      StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order);

  /// Register device global variable entry.
  LLVM_ABI void registerDeviceGlobalVarEntryInfo(
      StringRef VarName, Constant *Addr, int64_t VarSize,
      OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage);
  /// Checks if the variable with the given name has been registered already.
  bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const {
    return OffloadEntriesDeviceGlobalVar.count(VarName) > 0;
  }
  /// Applies action \a Action on all registered entries.
  typedef function_ref<void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)>
      OffloadDeviceGlobalVarEntryInfoActTy;
  LLVM_ABI void actOnDeviceGlobalVarEntriesInfo(
      const OffloadDeviceGlobalVarEntryInfoActTy &Action);

private:
  /// Return the count of entries at a particular source location.
  unsigned
  getTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo) const;

  /// Update the count of entries at a particular source location.
  void
  incrementTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo);

  /// Key used to count entries originating from the same source location:
  /// the same entry info with the Count field zeroed.
  static TargetRegionEntryInfo
  getTargetRegionEntryCountKey(const TargetRegionEntryInfo &EntryInfo) {
    return TargetRegionEntryInfo(EntryInfo.ParentName, EntryInfo.DeviceID,
                                 EntryInfo.FileID, EntryInfo.Line, 0);
  }

  // Count of entries at a location.
  std::map<TargetRegionEntryInfo, unsigned> OffloadEntriesTargetRegionCount;

  // Storage for target region entries kind.
  typedef std::map<TargetRegionEntryInfo, OffloadEntryInfoTargetRegion>
      OffloadEntriesTargetRegionTy;
  OffloadEntriesTargetRegionTy OffloadEntriesTargetRegion;
  /// Storage for device global variable entries kind. The storage is to be
  /// indexed by mangled name.
  typedef StringMap<OffloadEntryInfoDeviceGlobalVar>
      OffloadEntriesDeviceGlobalVarTy;
  OffloadEntriesDeviceGlobalVarTy OffloadEntriesDeviceGlobalVar;
};
500
501/// An interface to create LLVM-IR for OpenMP directives.
502///
503/// Each OpenMP directive has a corresponding public generator method.
504class OpenMPIRBuilder {
505public:
  /// Create a new OpenMPIRBuilder operating on the given module \p M. This
  /// will not have an effect on \p M (see initialize).
  OpenMPIRBuilder(Module &M)
      : M(M), Builder(M.getContext()), OffloadInfoManager(this),
        T(M.getTargetTriple()), IsFinalized(false) {}
  LLVM_ABI ~OpenMPIRBuilder();
512
513 class AtomicInfo : public llvm::AtomicInfo {
514 llvm::Value *AtomicVar;
515
516 public:
517 AtomicInfo(IRBuilder<> *Builder, llvm::Type *Ty, uint64_t AtomicSizeInBits,
518 uint64_t ValueSizeInBits, llvm::Align AtomicAlign,
519 llvm::Align ValueAlign, bool UseLibcall,
520 IRBuilderBase::InsertPoint AllocaIP, llvm::Value *AtomicVar)
521 : llvm::AtomicInfo(Builder, Ty, AtomicSizeInBits, ValueSizeInBits,
522 AtomicAlign, ValueAlign, UseLibcall, AllocaIP),
523 AtomicVar(AtomicVar) {}
524
525 llvm::Value *getAtomicPointer() const override { return AtomicVar; }
526 void decorateWithTBAA(llvm::Instruction *I) override {}
527 llvm::AllocaInst *CreateAlloca(llvm::Type *Ty,
528 const llvm::Twine &Name) const override {
529 llvm::AllocaInst *allocaInst = Builder->CreateAlloca(Ty);
530 allocaInst->setName(Name);
531 return allocaInst;
532 }
533 };
534 /// Initialize the internal state, this will put structures types and
535 /// potentially other helpers into the underlying module. Must be called
536 /// before any other method and only once! This internal state includes types
537 /// used in the OpenMPIRBuilder generated from OMPKinds.def.
538 LLVM_ABI void initialize();
539
540 void setConfig(OpenMPIRBuilderConfig C) { Config = C; }
541
542 /// Finalize the underlying module, e.g., by outlining regions.
543 /// \param Fn The function to be finalized. If not used,
544 /// all functions are finalized.
545 LLVM_ABI void finalize(Function *Fn = nullptr);
546
547 /// Check whether the finalize function has already run
548 /// \return true if the finalize function has already run
549 LLVM_ABI bool isFinalized();
550
551 /// Add attributes known for \p FnID to \p Fn.
552 LLVM_ABI void addAttributes(omp::RuntimeFunction FnID, Function &Fn);
553
554 /// Type used throughout for insertion points.
555 using InsertPointTy = IRBuilder<>::InsertPoint;
556
557 /// Type used to represent an insertion point or an error value.
558 using InsertPointOrErrorTy = Expected<InsertPointTy>;
559
  /// Create a name using the platform specific separators.
561 /// \param Parts parts of the final name that needs separation
562 /// The created name has a first separator between the first and second part
563 /// and a second separator between all other parts.
564 /// E.g. with FirstSeparator "$" and Separator "." and
565 /// parts: "p1", "p2", "p3", "p4"
566 /// The resulting name is "p1$p2.p3.p4"
567 /// The separators are retrieved from the OpenMPIRBuilderConfig.
568 LLVM_ABI std::string
569 createPlatformSpecificName(ArrayRef<StringRef> Parts) const;
570
571 /// Callback type for variable finalization (think destructors).
572 ///
573 /// \param CodeGenIP is the insertion point at which the finalization code
574 /// should be placed.
575 ///
576 /// A finalize callback knows about all objects that need finalization, e.g.
577 /// destruction, when the scope of the currently generated construct is left
578 /// at the time, and location, the callback is invoked.
579 using FinalizeCallbackTy = std::function<Error(InsertPointTy CodeGenIP)>;
580
  /// Bookkeeping for one entry of the finalization stack: the innermost
  /// directive whose region may need finalization code when it is left.
  struct FinalizationInfo {
    FinalizationInfo(FinalizeCallbackTy FiniCB, omp::Directive DK,
                     bool IsCancellable)
        : DK(DK), IsCancellable(IsCancellable), FiniCB(std::move(FiniCB)) {}
    /// The directive kind of the innermost directive that has an associated
    /// region which might require finalization when it is left.
    const omp::Directive DK;

    /// Flag to indicate if the directive is cancellable.
    const bool IsCancellable;

    /// The basic block to which control should be transferred to
    /// implement the FiniCB. Memoized to avoid generating finalization
    /// multiple times.
    Expected<BasicBlock *> getFiniBB(IRBuilderBase &Builder);

    /// For cases where there is an unavoidable existing finalization block
    /// (e.g. loop finalization after omp sections). The existing finalization
    /// block must not contain any non-finalization code.
    Error mergeFiniBB(IRBuilderBase &Builder, BasicBlock *ExistingFiniBB);

  private:
    /// Memoized finalization block; access via getFiniBB.
    BasicBlock *FiniBB = nullptr;

    /// The finalization callback provided by the last in-flight invocation of
    /// createXXXX for the directive of kind DK.
    FinalizeCallbackTy FiniCB;
  };
610
  /// Push a finalization callback on the finalization stack.
  ///
  /// Each push must be matched by a popFinalizationCB once the corresponding
  /// region has been emitted.
  ///
  /// NOTE: Temporary solution until Clang CG is gone.
  void pushFinalizationCB(const FinalizationInfo &FI) {
    FinalizationStack.push_back(FI);
  }

  /// Pop the last finalization callback from the finalization stack.
  ///
  /// NOTE: Temporary solution until Clang CG is gone.
  void popFinalizationCB() { FinalizationStack.pop_back(); }
622
623 /// Callback type for body (=inner region) code generation
624 ///
625 /// The callback takes code locations as arguments, each describing a
626 /// location where additional instructions can be inserted.
627 ///
628 /// The CodeGenIP may be in the middle of a basic block or point to the end of
629 /// it. The basic block may have a terminator or be degenerate. The callback
630 /// function may just insert instructions at that position, but also split the
631 /// block (without the Before argument of BasicBlock::splitBasicBlock such
  /// that the identity of the split predecessor block is preserved) and insert
633 /// additional control flow, including branches that do not lead back to what
634 /// follows the CodeGenIP. Note that since the callback is allowed to split
635 /// the block, callers must assume that InsertPoints to positions in the
636 /// BasicBlock after CodeGenIP including CodeGenIP itself are invalidated. If
637 /// such InsertPoints need to be preserved, it can split the block itself
638 /// before calling the callback.
639 ///
640 /// AllocaIP and CodeGenIP must not point to the same position.
641 ///
642 /// \param AllocaIP is the insertion point at which new alloca instructions
643 /// should be placed. The BasicBlock it is pointing to must
644 /// not be split.
645 /// \param CodeGenIP is the insertion point at which the body code should be
646 /// placed.
647 ///
648 /// \return an error, if any were triggered during execution.
649 using BodyGenCallbackTy =
650 function_ref<Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
651
652 /// Callback type for task duplication function code generation. This is the
653 /// task duplication function passed to __kmpc_taskloop. It is expected that
654 /// this function will set up (first)private variables in the duplicated task
655 /// which have non-trivial (copy-)constructors. Insertion points are handled
656 /// the same way as for BodyGenCallbackTy.
657 ///
658 /// \ref createTaskloop lays out the task's auxiliary data structure as:
659 /// `{ lower bound, upper bound, step, data... }`. DestPtr and SrcPtr point
660 /// to this data.
661 ///
662 /// It is acceptable for the callback to be set to nullptr. In that case no
663 /// function will be generated and nullptr will be passed as the task
664 /// duplication function to __kmpc_taskloop.
665 ///
666 /// \param AllocaIP is the insertion point at which new alloca instructions
667 /// should be placed. The BasicBlock it is pointing to must
668 /// not be split.
669 /// \param CodeGenIP is the insertion point at which the body code should be
670 /// placed.
671 /// \param DestPtr This is a pointer to data inside the newly duplicated
672 /// task's auxiliary data structure (allocated after the task
673 /// descriptor.)
674 /// \param SrcPtr This is a pointer to data inside the original task's
675 /// auxiliary data structure (allocated after the task
676 /// descriptor.)
677 ///
678 /// \return The insertion point immediately after the generated code, or an
  /// error if any occurred.
680 using TaskDupCallbackTy = function_ref<Expected<InsertPointTy>(
681 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DestPtr,
682 Value *SrcPtr)>;
683
684 // This is created primarily for sections construct as llvm::function_ref
685 // (BodyGenCallbackTy) is not storable (as described in the comments of
686 // function_ref class - function_ref contains non-ownable reference
687 // to the callable.
688 ///
689 /// \return an error, if any were triggered during execution.
690 using StorableBodyGenCallbackTy =
691 std::function<Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
692
693 /// Callback type for loop body code generation.
694 ///
695 /// \param CodeGenIP is the insertion point where the loop's body code must be
696 /// placed. This will be a dedicated BasicBlock with a
697 /// conditional branch from the loop condition check and
698 /// terminated with an unconditional branch to the loop
699 /// latch.
700 /// \param IndVar is the induction variable usable at the insertion point.
701 ///
702 /// \return an error, if any were triggered during execution.
703 using LoopBodyGenCallbackTy =
704 function_ref<Error(InsertPointTy CodeGenIP, Value *IndVar)>;
705
706 /// Callback type for variable privatization (think copy & default
707 /// constructor).
708 ///
709 /// \param AllocaIP is the insertion point at which new alloca instructions
710 /// should be placed.
711 /// \param CodeGenIP is the insertion point at which the privatization code
712 /// should be placed.
713 /// \param Original The value being copied/created, should not be used in the
714 /// generated IR.
715 /// \param Inner The equivalent of \p Original that should be used in the
716 /// generated IR; this is equal to \p Original if the value is
717 /// a pointer and can thus be passed directly, otherwise it is
718 /// an equivalent but different value.
719 /// \param ReplVal The replacement value, thus a copy or new created version
720 /// of \p Inner.
721 ///
722 /// \returns The new insertion point where code generation continues and
723 /// \p ReplVal the replacement value.
724 using PrivatizeCallbackTy = function_ref<InsertPointOrErrorTy(
725 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original,
726 Value &Inner, Value *&ReplVal)>;
727
728 /// Description of a LLVM-IR insertion point (IP) and a debug/source location
729 /// (filename, line, column, ...).
730 struct LocationDescription {
731 LocationDescription(const IRBuilderBase &IRB)
732 : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {}
733 LocationDescription(const InsertPointTy &IP) : IP(IP) {}
734 LocationDescription(const InsertPointTy &IP, const DebugLoc &DL)
735 : IP(IP), DL(DL) {}
736 InsertPointTy IP;
737 DebugLoc DL;
738 };
739
740 /// Emitter methods for OpenMP directives.
741 ///
742 ///{
743
744 /// Generator for '#omp barrier'
745 ///
746 /// \param Loc The location where the barrier directive was encountered.
747 /// \param Kind The kind of directive that caused the barrier.
748 /// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier.
749 /// \param CheckCancelFlag Flag to indicate a cancel barrier return value
750 /// should be checked and acted upon.
751 /// \param ThreadID Optional parameter to pass in any existing ThreadID value.
752 ///
753 /// \returns The insertion point after the barrier.
754 LLVM_ABI InsertPointOrErrorTy createBarrier(const LocationDescription &Loc,
755 omp::Directive Kind,
756 bool ForceSimpleCall = false,
757 bool CheckCancelFlag = true);
758
759 /// Generator for '#omp cancel'
760 ///
761 /// \param Loc The location where the directive was encountered.
762 /// \param IfCondition The evaluated 'if' clause expression, if any.
  /// \param CanceledDirective The kind of directive that is canceled.
764 ///
765 /// \returns The insertion point after the barrier.
766 LLVM_ABI InsertPointOrErrorTy createCancel(const LocationDescription &Loc,
767 Value *IfCondition,
768 omp::Directive CanceledDirective);
769
770 /// Generator for '#omp cancellation point'
771 ///
772 /// \param Loc The location where the directive was encountered.
773 /// \param CanceledDirective The kind of directive that is canceled.
774 ///
775 /// \returns The insertion point after the cancellation point.
776 LLVM_ABI InsertPointOrErrorTy createCancellationPoint(
777 const LocationDescription &Loc, omp::Directive CanceledDirective);
778
779 /// Creates a ScanInfo object, allocates and returns the pointer.
780 LLVM_ABI Expected<ScanInfo *> scanInfoInitialize();
781
782 /// Generator for '#omp parallel'
783 ///
784 /// \param Loc The insert and source location description.
785 /// \param AllocaIP The insertion points to be used for alloca instructions.
786 /// \param BodyGenCB Callback that will generate the region code.
787 /// \param PrivCB Callback to copy a given variable (think copy constructor).
788 /// \param FiniCB Callback to finalize variable copies.
789 /// \param IfCondition The evaluated 'if' clause expression, if any.
790 /// \param NumThreads The evaluated 'num_threads' clause expression, if any.
791 /// \param ProcBind The value of the 'proc_bind' clause (see ProcBindKind).
792 /// \param IsCancellable Flag to indicate a cancellable parallel region.
793 ///
794 /// \returns The insertion position *after* the parallel.
795 LLVM_ABI InsertPointOrErrorTy createParallel(
796 const LocationDescription &Loc, InsertPointTy AllocaIP,
797 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
798 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
799 omp::ProcBindKind ProcBind, bool IsCancellable);
800
801 /// Generator for the control flow structure of an OpenMP canonical loop.
802 ///
803 /// This generator operates on the logical iteration space of the loop, i.e.
804 /// the caller only has to provide a loop trip count of the loop as defined by
805 /// base language semantics. The trip count is interpreted as an unsigned
806 /// integer. The induction variable passed to \p BodyGenCB will be of the same
807 /// type and run from 0 to \p TripCount - 1. It is up to the callback to
808 /// convert the logical iteration variable to the loop counter variable in the
809 /// loop body.
810 ///
811 /// \param Loc The insert and source location description. The insert
812 /// location can be between two instructions or the end of a
813 /// degenerate block (e.g. a BB under construction).
814 /// \param BodyGenCB Callback that will generate the loop body code.
815 /// \param TripCount Number of iterations the loop body is executed.
816 /// \param Name Base name used to derive BB and instruction names.
817 ///
818 /// \returns An object representing the created control flow structure which
819 /// can be used for loop-associated directives.
820 LLVM_ABI Expected<CanonicalLoopInfo *>
821 createCanonicalLoop(const LocationDescription &Loc,
822 LoopBodyGenCallbackTy BodyGenCB, Value *TripCount,
823 const Twine &Name = "loop");
824
825 /// Generator for the control flow structure of an OpenMP canonical loops if
826 /// the parent directive has an `inscan` modifier specified.
827 /// If the `inscan` modifier is specified, the region of the parent is
828 /// expected to have a `scan` directive. Based on the clauses in
829 /// scan directive, the body of the loop is split into two loops: Input loop
830 /// and Scan Loop. Input loop contains the code generated for input phase of
831 /// scan and Scan loop contains the code generated for scan phase of scan.
832 /// From the bodyGen callback of these loops, `createScan` would be called
833 /// when a scan directive is encountered from the loop body. `createScan`
834 /// based on whether 1. inclusive or exclusive scan is specified and, 2. input
835 /// loop or scan loop is generated, lowers the body of the for loop
836 /// accordingly.
837 ///
838 /// \param Loc The insert and source location description.
839 /// \param BodyGenCB Callback that will generate the loop body code.
840 /// \param Start Value of the loop counter for the first iteration.
841 /// \param Stop Loop counter values past this will stop the loop.
842 /// \param Step Loop counter increment after each iteration; negative
843 /// means counting down.
844 /// \param IsSigned Whether Start, Stop and Step are signed integers.
845 /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
846 /// counter.
847 /// \param ComputeIP Insertion point for instructions computing the trip
848 /// count. Can be used to ensure the trip count is available
849 /// at the outermost loop of a loop nest. If not set,
850 /// defaults to the preheader of the generated loop.
851 /// \param Name Base name used to derive BB and instruction names.
852 /// \param ScanRedInfo Pointer to the ScanInfo object created using
853 /// `scanInfoInitialize`.
854 ///
855 /// \returns A vector containing Loop Info of Input Loop and Scan Loop.
856 LLVM_ABI Expected<SmallVector<llvm::CanonicalLoopInfo *>>
857 createCanonicalScanLoops(const LocationDescription &Loc,
858 LoopBodyGenCallbackTy BodyGenCB, Value *Start,
859 Value *Stop, Value *Step, bool IsSigned,
860 bool InclusiveStop, InsertPointTy ComputeIP,
861 const Twine &Name, ScanInfo *ScanRedInfo);
862
863 /// Calculate the trip count of a canonical loop.
864 ///
865 /// This allows specifying user-defined loop counter values using increment,
866 /// upper- and lower bounds. To disambiguate the terminology when counting
867 /// downwards, instead of lower bounds we use \p Start for the loop counter
868 /// value in the first body iteration.
869 ///
870 /// Consider the following limitations:
871 ///
872 /// * A loop counter space over all integer values of its bit-width cannot be
873 /// represented. E.g. using uint8_t, a loop trip count of 256 cannot be
874 /// stored into an 8-bit integer:
875 ///
876 /// DO I = 0, 255, 1
877 ///
878 /// * Unsigned wrapping is only supported when wrapping only "once"; E.g.
879 /// effectively counting downwards:
880 ///
881 /// for (uint8_t i = 100u; i > 0; i += 127u)
882 ///
883 ///
884 /// TODO: May need to add additional parameters to represent:
885 ///
886 /// * Allow representing downcounting with unsigned integers.
887 ///
888 /// * Sign of the step and the comparison operator might disagree:
889 ///
890 /// for (int i = 0; i < 42; i -= 1u)
891 ///
892 /// \param Loc The insert and source location description.
893 /// \param Start Value of the loop counter for the first iteration.
894 /// \param Stop Loop counter values past this will stop the loop.
895 /// \param Step Loop counter increment after each iteration; negative
896 /// means counting down.
897 /// \param IsSigned Whether Start, Stop and Step are signed integers.
898 /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
899 /// counter.
900 /// \param Name Base name used to derive instruction names.
901 ///
902 /// \returns The value holding the calculated trip count.
903 LLVM_ABI Value *calculateCanonicalLoopTripCount(
904 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
905 bool IsSigned, bool InclusiveStop, const Twine &Name = "loop");
906
907 /// Generator for the control flow structure of an OpenMP canonical loop.
908 ///
909 /// Instead of a logical iteration space, this allows specifying user-defined
910 /// loop counter values using increment, upper- and lower bounds. To
911 /// disambiguate the terminology when counting downwards, instead of lower
912 /// bounds we use \p Start for the loop counter value in the first body
913 /// iteration.
913 ///
914 /// It calls \see calculateCanonicalLoopTripCount for trip count calculations,
915 /// so limitations of that method apply here as well.
916 ///
917 /// \param Loc The insert and source location description.
918 /// \param BodyGenCB Callback that will generate the loop body code.
919 /// \param Start Value of the loop counter for the first iteration.
920 /// \param Stop Loop counter values past this will stop the loop.
921 /// \param Step Loop counter increment after each iteration; negative
922 /// means counting down.
923 /// \param IsSigned Whether Start, Stop and Step are signed integers.
924 /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
925 /// counter.
926 /// \param ComputeIP Insertion point for instructions computing the trip
927 /// count. Can be used to ensure the trip count is available
928 /// at the outermost loop of a loop nest. If not set,
929 /// defaults to the preheader of the generated loop.
930 /// \param Name Base name used to derive BB and instruction names.
931 /// \param InScan Whether loop has a scan reduction specified.
932 /// \param ScanRedInfo Pointer to the ScanInfo object created using
933 /// `scanInfoInitialize`.
934 ///
935 /// \returns An object representing the created control flow structure which
936 /// can be used for loop-associated directives.
937 LLVM_ABI Expected<CanonicalLoopInfo *> createCanonicalLoop(
938 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
939 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
940 InsertPointTy ComputeIP = {}, const Twine &Name = "loop",
941 bool InScan = false, ScanInfo *ScanRedInfo = nullptr);
942
943 /// Collapse a loop nest into a single loop.
944 ///
945 /// Merges loops of a loop nest into a single CanonicalLoopNest representation
946 /// that has the same number of innermost loop iterations as the origin loop
947 /// nest. The induction variables of the input loops are derived from the
948 /// collapsed loop's induction variable. This is intended to be used to
949 /// implement OpenMP's collapse clause. Before applying a directive,
950 /// collapseLoops normalizes a loop nest to contain only a single loop and the
951 /// directive's implementation does not need to handle multiple loops itself.
952 /// This does not remove the need to handle all loop nest handling by
953 /// directives, such as the ordered(<n>) clause or the simd schedule-clause
954 /// modifier of the worksharing-loop directive.
955 ///
956 /// Example:
957 /// \code
958 /// for (int i = 0; i < 7; ++i) // Canonical loop "i"
959 /// for (int j = 0; j < 9; ++j) // Canonical loop "j"
960 /// body(i, j);
961 /// \endcode
962 ///
963 /// After collapsing with Loops={i,j}, the loop is changed to
964 /// \code
965 /// for (int ij = 0; ij < 63; ++ij) {
966 /// int i = ij / 9;
967 /// int j = ij % 9;
968 /// body(i, j);
969 /// }
970 /// \endcode
971 ///
972 /// In the current implementation, the following limitations apply:
973 ///
974 /// * All input loops have an induction variable of the same type.
975 ///
976 /// * The collapsed loop will have the same trip count integer type as the
977 /// input loops. Therefore it is possible that the collapsed loop cannot
978 /// represent all iterations of the input loops. For instance, assuming a
979 /// 32 bit integer type, and two input loops both iterating 2^16 times, the
980 /// theoretical trip count of the collapsed loop would be 2^32 iterations,
981 /// which cannot be represented in a 32-bit integer. Behavior is undefined
982 /// in this case.
983 ///
984 /// * The trip counts of every input loop must be available at \p ComputeIP.
985 /// Non-rectangular loops are not yet supported.
986 ///
987 /// * At each nest level, code between a surrounding loop and its nested loop
988 /// is hoisted into the loop body, and such code will be executed more
989 /// often than before collapsing (or not at all if any inner loop iteration
990 /// has a trip count of 0). This is permitted by the OpenMP specification.
991 ///
992 /// \param DL Debug location for instructions added for collapsing,
993 /// such as instructions to compute/derive the input loop's
994 /// induction variables.
995 /// \param Loops Loops in the loop nest to collapse. Loops are specified
996 /// from outermost-to-innermost and every control flow of a
997 /// loop's body must pass through its directly nested loop.
998 /// \param ComputeIP Where additional instructions that compute the
999 /// collapsed trip count are inserted. If not set, defaults
1000 /// to before the generated loop.
1001 ///
1002 /// \returns The CanonicalLoopInfo object representing the collapsed loop.
1003 LLVM_ABI CanonicalLoopInfo *collapseLoops(DebugLoc DL,
1004 ArrayRef<CanonicalLoopInfo *> Loops,
1005 InsertPointTy ComputeIP);
1006
1007 /// Get the default alignment value for given target
1008 ///
1009 /// \param TargetTriple Target triple
1010 /// \param Features StringMap which describes extra CPU features
1011 LLVM_ABI static unsigned
1012 getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
1013 const StringMap<bool> &Features);
1014
1015 /// Retrieve (or create if non-existent) the address of a declare
1016 /// target variable, used in conjunction with registerTargetGlobalVariable
1017 /// to create declare target global variables.
1018 ///
1019 /// \param CaptureClause - enumerator corresponding to the OpenMP capture
1020 /// clause used in conjunction with the variable being registered (link,
1021 /// to, enter).
1022 /// \param DeviceClause - enumerator corresponding to the OpenMP capture
1023 /// clause used in conjunction with the variable being registered (nohost,
1024 /// host, any)
1025 /// \param IsDeclaration - boolean stating if the variable being registered
1026 /// is a declaration-only and not a definition
1027 /// \param IsExternallyVisible - boolean stating if the variable is externally
1028 /// visible
1029 /// \param EntryInfo - Unique entry information for the value generated
1030 /// using getTargetEntryUniqueInfo, used to name generated pointer references
1031 /// to the declare target variable
1032 /// \param MangledName - the mangled name of the variable being registered
1033 /// \param GeneratedRefs - references generated by invocations of
1034 /// registerTargetGlobalVariable invoked from getAddrOfDeclareTargetVar,
1035 /// these are required by Clang for book keeping.
1036 /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled
1037 /// \param TargetTriple - The OpenMP device target triple we are compiling
1038 /// for
1039 /// \param LlvmPtrTy - The type of the variable we are generating or
1040 /// retrieving an address for
1041 /// \param GlobalInitializer - a lambda function which creates a constant
1042 /// used for initializing a pointer reference to the variable in certain
1043 /// cases. If a nullptr is passed, it will default to utilising the original
1044 /// variable to initialize the pointer reference.
1045 /// \param VariableLinkage - a lambda function which returns the variables
1046 /// linkage type, if unspecified and a nullptr is given, it will instead
1047 /// utilise the linkage stored on the existing global variable in the
1048 /// LLVMModule.
1049 LLVM_ABI Constant *getAddrOfDeclareTargetVar(
1050 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
1051 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
1052 bool IsDeclaration, bool IsExternallyVisible,
1053 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
1054 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
1055 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
1056 std::function<Constant *()> GlobalInitializer,
1057 std::function<GlobalValue::LinkageTypes()> VariableLinkage);
1058
1059 /// Registers a target variable for device or host.
1060 ///
1061 /// \param CaptureClause - enumerator corresponding to the OpenMP capture
1062 /// clause used in conjunction with the variable being registered (link,
1063 /// to, enter).
1064 /// \param DeviceClause - enumerator corresponding to the OpenMP capture
1065 /// clause used in conjunction with the variable being registered (nohost,
1066 /// host, any)
1067 /// \param IsDeclaration - boolean stating if the variable being registered
1068 /// is a declaration-only and not a definition
1069 /// \param IsExternallyVisible - boolean stating if the variable is externally
1070 /// visible
1071 /// \param EntryInfo - Unique entry information for the value generated
1072 /// using getTargetEntryUniqueInfo, used to name generated pointer references
1073 /// to the declare target variable
1074 /// \param MangledName - the mangled name of the variable being registered
1075 /// \param GeneratedRefs - references generated by invocations of
1076 /// registerTargetGlobalVariable these are required by Clang for book
1077 /// keeping.
1078 /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled
1079 /// \param TargetTriple - The OpenMP device target triple we are compiling
1080 /// for
1081 /// \param GlobalInitializer - a lambda function which creates a constant
1082 /// used for initializing a pointer reference to the variable in certain
1083 /// cases. If a nullptr is passed, it will default to utilising the original
1084 /// variable to initialize the pointer reference.
1085 /// \param VariableLinkage - a lambda function which returns the variables
1086 /// linkage type, if unspecified and a nullptr is given, it will instead
1087 /// utilise the linkage stored on the existing global variable in the
1088 /// LLVMModule.
1089 /// \param LlvmPtrTy - The type of the variable we are generating or
1090 /// retrieving an address for
1091 /// \param Addr - the original llvm value (addr) of the variable to be
1092 /// registered
1093 LLVM_ABI void registerTargetGlobalVariable(
1094 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
1095 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
1096 bool IsDeclaration, bool IsExternallyVisible,
1097 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
1098 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
1099 std::vector<Triple> TargetTriple,
1100 std::function<Constant *()> GlobalInitializer,
1101 std::function<GlobalValue::LinkageTypes()> VariableLinkage,
1102 Type *LlvmPtrTy, Constant *Addr);
1103
1104 /// Get the offset of the OMP_MAP_MEMBER_OF field.
1105 LLVM_ABI unsigned getFlagMemberOffset();
1106
1107 /// Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on
1108 /// the position given.
1109 /// \param Position - A value indicating the position of the parent
1110 /// of the member in the kernel argument structure, often retrieved
1111 /// by the parents position in the combined information vectors used
1112 /// to generate the structure itself. Multiple children (member's of)
1113 /// with the same parent will use the same returned member flag.
1114 LLVM_ABI omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position);
1115
1116 /// Given an initial flag set, this function modifies it to contain
1117 /// the passed in MemberOfFlag generated from the getMemberOfFlag
1118 /// function. The results are dependent on the existing flag bits
1119 /// set in the original flag set.
1120 /// \param Flags - The original set of flags to be modified with the
1121 /// passed in MemberOfFlag.
1122 /// \param MemberOfFlag - A modified OMP_MAP_MEMBER_OF flag, adjusted
1123 /// slightly based on the getMemberOfFlag which adjusts the flag bits
1124 /// based on the members position in its parent.
1125 LLVM_ABI void
1126 setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags,
1127 omp::OpenMPOffloadMappingFlags MemberOfFlag);
1128
1129private:
1130 /// Modifies the canonical loop to be a statically-scheduled workshare loop
1131 /// which is executed on the device
1132 ///
1133 /// This takes a \p CLI representing a canonical loop, such as the one
1134 /// created by \see createCanonicalLoop and emits additional instructions to
1135 /// turn it into a workshare loop. In particular, it calls to an OpenMP
1136 /// runtime function in the preheader to call OpenMP device rtl function
1137 /// which handles worksharing of loop body iterations.
1138 ///
1139 /// \param DL Debug location for instructions added for the
1140 /// workshare-loop construct itself.
1141 /// \param CLI A descriptor of the canonical loop to workshare.
1142 /// \param AllocaIP An insertion point for Alloca instructions usable in the
1143 /// preheader of the loop.
1144 /// \param LoopType Information about type of loop worksharing.
1145 /// It corresponds to type of loop workshare OpenMP pragma.
1146 /// \param NoLoop If true, no-loop code is generated.
1147 ///
1148 /// \returns Point where to insert code after the workshare construct.
1149 InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
1150 InsertPointTy AllocaIP,
1151 omp::WorksharingLoopType LoopType,
1152 bool NoLoop);
1153
1154 /// Modifies the canonical loop to be a statically-scheduled workshare loop.
1155 ///
1156 /// This takes a \p LoopInfo representing a canonical loop, such as the one
1157 /// created by \p createCanonicalLoop and emits additional instructions to
1158 /// turn it into a workshare loop. In particular, it calls to an OpenMP
1159 /// runtime function in the preheader to obtain the loop bounds to be used in
1160 /// the current thread, updates the relevant instructions in the canonical
1161 /// loop and calls to an OpenMP runtime finalization function after the loop.
1162 ///
1163 /// \param DL Debug location for instructions added for the
1164 /// workshare-loop construct itself.
1165 /// \param CLI A descriptor of the canonical loop to workshare.
1166 /// \param AllocaIP An insertion point for Alloca instructions usable in the
1167 /// preheader of the loop.
1168 /// \param NeedsBarrier Indicates whether a barrier must be inserted after
1169 /// the loop.
1170 /// \param LoopType Type of workshare loop.
1171 /// \param HasDistSchedule Defines if the clause being lowered is
1172 /// dist_schedule as this is handled slightly differently
1173 /// \param DistScheduleSchedType Defines the Schedule Type for the Distribute
1174 /// loop. Defaults to None if no Distribute loop is present.
1175 ///
1176 /// \returns Point where to insert code after the workshare construct.
1177 InsertPointOrErrorTy applyStaticWorkshareLoop(
1178 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
1179 omp::WorksharingLoopType LoopType, bool NeedsBarrier,
1180 bool HasDistSchedule = false,
1181 omp::OMPScheduleType DistScheduleSchedType = omp::OMPScheduleType::None);
1182
1183 /// Modifies the canonical loop to be a statically-scheduled workshare loop
1184 /// with a user-specified chunk size.
1184 /// user-specified chunk size.
1185 ///
1186 /// \param DL Debug location for instructions added for the
1187 /// workshare-loop construct itself.
1188 /// \param CLI A descriptor of the canonical loop to workshare.
1189 /// \param AllocaIP An insertion point for Alloca instructions usable in
1190 /// the preheader of the loop.
1191 /// \param NeedsBarrier Indicates whether a barrier must be inserted after the
1192 /// loop.
1193 /// \param ChunkSize The user-specified chunk size.
1194 /// \param SchedType Optional type of scheduling to be passed to the init
1195 /// function.
1196 /// \param DistScheduleChunkSize The size of dist_schedule chunk considered
1197 /// as a unit when
1198 /// scheduling. If \p nullptr, defaults to 1.
1199 /// \param DistScheduleSchedType Defines the Schedule Type for the Distribute
1200 /// loop. Defaults to None if no Distribute loop is present.
1201 ///
1202 /// \returns Point where to insert code after the workshare construct.
1203 InsertPointOrErrorTy applyStaticChunkedWorkshareLoop(
1204 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
1205 bool NeedsBarrier, Value *ChunkSize,
1206 omp::OMPScheduleType SchedType =
1207 omp::OMPScheduleType::UnorderedStaticChunked,
1208 Value *DistScheduleChunkSize = nullptr,
1209 omp::OMPScheduleType DistScheduleSchedType = omp::OMPScheduleType::None);
1210
1211 /// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
1212 ///
1213 /// This takes a \p LoopInfo representing a canonical loop, such as the one
1214 /// created by \p createCanonicalLoop and emits additional instructions to
1215 /// turn it into a workshare loop. In particular, it calls to an OpenMP
1216 /// runtime function in the preheader to obtain, and then in each iteration
1217 /// to update the loop counter.
1218 ///
1219 /// \param DL Debug location for instructions added for the
1220 /// workshare-loop construct itself.
1221 /// \param CLI A descriptor of the canonical loop to workshare.
1222 /// \param AllocaIP An insertion point for Alloca instructions usable in the
1223 /// preheader of the loop.
1224 /// \param SchedType Type of scheduling to be passed to the init function.
1225 /// \param NeedsBarrier Indicates whether a barrier must be inserted after
1226 /// the loop.
1227 /// \param Chunk The size of loop chunk considered as a unit when
1228 /// scheduling. If \p nullptr, defaults to 1.
1229 ///
1230 /// \returns Point where to insert code after the workshare construct.
1231 InsertPointOrErrorTy applyDynamicWorkshareLoop(DebugLoc DL,
1232 CanonicalLoopInfo *CLI,
1233 InsertPointTy AllocaIP,
1234 omp::OMPScheduleType SchedType,
1235 bool NeedsBarrier,
1236 Value *Chunk = nullptr);
1237
1238 /// Create alternative version of the loop to support if clause
1239 ///
1240 /// OpenMP if clause can require to generate second loop. This loop
1241 /// will be executed when if clause condition is not met. createIfVersion
1242 /// adds branch instruction to the copied loop if \p ifCond is not met.
1243 ///
1244 /// \param Loop Original loop which should be versioned.
1245 /// \param IfCond Value which corresponds to if clause condition
1246 /// \param VMap Value to value map to define relation between
1247 /// original and copied loop values and loop blocks.
1248 /// \param NamePrefix Optional name prefix for if.then if.else blocks.
1249 void createIfVersion(CanonicalLoopInfo *Loop, Value *IfCond,
1250 ValueMap<const Value *, WeakTrackingVH> &VMap,
1251 LoopAnalysis &LIA, LoopInfo &LI, llvm::Loop *L,
1252 const Twine &NamePrefix = "");
1253
1254 /// Creates a task duplication function to be passed to kmpc_taskloop.
1255 ///
1256 /// The OpenMP runtime defines this function as taking the destination
1257 /// kmp_task_t, source kmp_task_t, and a lastprivate flag. This function is
1258 /// called on the source and destination tasks after the source task has been
1259 /// duplicated to create the destination task. At this point the destination
1260 /// task has been otherwise set up from the runtime's perspective, but this
1261 /// function is needed to fix up any data for the duplicated task e.g. private
1262 /// variables with non-trivial constructors.
1263 ///
1264 /// \param PrivatesTy The type of the privates structure for the task.
1265 /// \param PrivatesIndex The index inside the privates structure containing
1266 /// the data for the callback.
1267 /// \param DupCB The callback to generate the duplication code. See
1268 /// documentation for \ref TaskDupCallbackTy. This can be
1269 /// nullptr.
1270 Expected<Value *> createTaskDuplicationFunction(Type *PrivatesTy,
1271 int32_t PrivatesIndex,
1272 TaskDupCallbackTy DupCB);
1273
1274public:
1275 /// Modifies the canonical loop to be a workshare loop.
1276 ///
1277 /// This takes a \p LoopInfo representing a canonical loop, such as the one
1278 /// created by \p createCanonicalLoop and emits additional instructions to
1279 /// turn it into a workshare loop. In particular, it calls to an OpenMP
1280 /// runtime function in the preheader to obtain the loop bounds to be used in
1281 /// the current thread, updates the relevant instructions in the canonical
1282 /// loop and calls to an OpenMP runtime finalization function after the loop.
1283 ///
1284 /// The concrete transformation is done by applyStaticWorkshareLoop,
1285 /// applyStaticChunkedWorkshareLoop, or applyDynamicWorkshareLoop, depending
1286 /// on the value of \p SchedKind and \p ChunkSize.
1287 ///
1288 /// \param DL Debug location for instructions added for the
1289 /// workshare-loop construct itself.
1290 /// \param CLI A descriptor of the canonical loop to workshare.
1291 /// \param AllocaIP An insertion point for Alloca instructions usable in the
1292 /// preheader of the loop.
1293 /// \param NeedsBarrier Indicates whether a barrier must be inserted after
1294 /// the loop.
1295 /// \param SchedKind Scheduling algorithm to use.
1296 /// \param ChunkSize The chunk size for the inner loop.
1297 /// \param HasSimdModifier Whether the simd modifier is present in the
1298 /// schedule clause.
1299 /// \param HasMonotonicModifier Whether the monotonic modifier is present in
1300 /// the schedule clause.
1301 /// \param HasNonmonotonicModifier Whether the nonmonotonic modifier is
1302 /// present in the schedule clause.
1303 /// \param HasOrderedClause Whether the (parameterless) ordered clause is
1304 /// present.
1305 /// \param LoopType Information about type of loop worksharing.
1306 /// It corresponds to type of loop workshare OpenMP pragma.
1307 /// \param NoLoop If true, no-loop code is generated.
1308 /// \param HasDistSchedule Defines if the clause being lowered is
1309 /// dist_schedule as this is handled slightly differently
1310 ///
1311 /// \param DistScheduleChunkSize The chunk size for dist_schedule loop
1312 ///
1313 /// \returns Point where to insert code after the workshare construct.
1314 LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(
1315 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
1316 bool NeedsBarrier,
1317 llvm::omp::ScheduleKind SchedKind = llvm::omp::OMP_SCHEDULE_Default,
1318 Value *ChunkSize = nullptr, bool HasSimdModifier = false,
1319 bool HasMonotonicModifier = false, bool HasNonmonotonicModifier = false,
1320 bool HasOrderedClause = false,
1321 omp::WorksharingLoopType LoopType =
1322 omp::WorksharingLoopType::ForStaticLoop,
1323 bool NoLoop = false, bool HasDistSchedule = false,
1324 Value *DistScheduleChunkSize = nullptr);
1325
1326 /// Tile a loop nest.
1327 ///
1328 /// Tiles the loops of \p Loops by the tile sizes in \p TileSizes. Loops in
1329 /// \p Loops must be perfectly nested, from outermost to innermost loop
1330 /// (i.e. Loops.front() is the outermost loop). The trip count llvm::Value
1331 /// of every loop and every tile sizes must be usable in the outermost
1332 /// loop's preheader. This implies that the loop nest is rectangular.
1333 ///
1334 /// Example:
1335 /// \code
1336 /// for (int i = 0; i < 15; ++i) // Canonical loop "i"
1337 /// for (int j = 0; j < 14; ++j) // Canonical loop "j"
1338 /// body(i, j);
1339 /// \endcode
1340 ///
1341 /// After tiling with Loops={i,j} and TileSizes={5,7}, the loop is changed to
1342 /// \code
1343 /// for (int i1 = 0; i1 < 3; ++i1)
1344 /// for (int j1 = 0; j1 < 2; ++j1)
1345 /// for (int i2 = 0; i2 < 5; ++i2)
1346 /// for (int j2 = 0; j2 < 7; ++j2)
1347 /// body(i1*5+i2, j1*7+j2);
1348 /// \endcode
1349 ///
1350 /// The returned vector contains the loops {i1,j1,i2,j2}. The loops i1 and j1
1351 /// are referred to as the floor loops, and the loops i2 and j2 are the tiles.
1352 /// Tiling also
1352 /// handles non-constant trip counts, non-constant tile sizes and trip counts
1353 /// that are not multiples of the tile size. In the latter case the tile loop
1354 /// of the last floor-loop iteration will have fewer iterations than specified
1355 /// as its tile size.
1356 ///
1357 ///
1358 /// @param DL Debug location for instructions added by tiling, for
1359 /// instance the floor- and tile trip count computation.
1360 /// @param Loops Loops to tile. The CanonicalLoopInfo objects are
1361 /// invalidated by this method, i.e. should not used after
1362 /// tiling.
1363 /// @param TileSizes For each loop in \p Loops, the tile size for that
1364 /// dimensions.
1365 ///
1366 /// \returns A list of generated loops. Contains twice as many loops as the
1367 /// input loop nest; the first half are the floor loops and the
1368 /// second half are the tile loops.
1369 LLVM_ABI std::vector<CanonicalLoopInfo *>
1370 tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
1371 ArrayRef<Value *> TileSizes);
1372
1373 /// Fully unroll a loop.
1374 ///
1375 /// Instead of unrolling the loop immediately (and duplicating its body
1376 /// instructions), it is deferred to LLVM's LoopUnrollPass by adding loop
1377 /// metadata.
1378 ///
1379 /// \param DL Debug location for instructions added by unrolling.
1380 /// \param Loop The loop to unroll. The loop will be invalidated.
1381 LLVM_ABI void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop);
1382
1383 /// Fully or partially unroll a loop. How the loop is unrolled is determined
1384 /// using LLVM's LoopUnrollPass.
1385 ///
1386 /// \param DL Debug location for instructions added by unrolling.
1387 /// \param Loop The loop to unroll. The loop will be invalidated.
1388 LLVM_ABI void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop);
1389
1390 /// Partially unroll a loop.
1391 ///
1392 /// The CanonicalLoopInfo of the unrolled loop for use with chained
1393 /// loop-associated directive can be requested using \p UnrolledCLI. Not
1394 /// needing the CanonicalLoopInfo allows more efficient code generation by
1395 /// deferring the actual unrolling to the LoopUnrollPass using loop metadata.
1396 /// A loop-associated directive applied to the unrolled loop needs to know the
1397 /// new trip count which means that if using a heuristically determined unroll
1398 /// factor (\p Factor == 0), that factor must be computed immediately. We are
1399 /// using the same logic as the LoopUnrollPass to derive the unroll factor,
1400 /// but which assumes that some canonicalization has taken place (e.g.
1401 /// Mem2Reg, LICM, GVN, Inlining, etc.). That is, the heuristic will perform
1402 /// better when the unrolled loop's CanonicalLoopInfo is not needed.
1403 ///
1404 /// \param DL Debug location for instructions added by unrolling.
1405 /// \param Loop The loop to unroll. The loop will be invalidated.
1406 /// \param Factor The factor to unroll the loop by. A factor of 0
1407 /// indicates that a heuristic should be used to determine
1408 /// the unroll-factor.
1409 /// \param UnrolledCLI If non-null, receives the CanonicalLoopInfo of the
1410 /// partially unrolled loop. Otherwise, uses loop metadata
1411 /// to defer unrolling to the LoopUnrollPass.
1412 LLVM_ABI void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
1413 int32_t Factor,
1414 CanonicalLoopInfo **UnrolledCLI);
1415
1416 /// Add metadata to simd-ize a loop. If IfCond is not nullptr, the loop
1417 /// is cloned. The metadata which prevents vectorization is added to
1418 /// the cloned loop. The cloned loop is executed when IfCond evaluates
1419 /// to false.
1420 ///
1421 /// \param Loop The loop to simd-ize.
1422 /// \param AlignedVars   The map which contains pairs of the pointer
1423 /// and its corresponding alignment.
1424 /// \param IfCond The value which corresponds to the if clause
1425 /// condition.
1426 /// \param Order The enum to map order clause.
1427 /// \param Simdlen The Simdlen length to apply to the simd loop.
1428 /// \param Safelen The Safelen length to apply to the simd loop.
1429 LLVM_ABI void applySimd(CanonicalLoopInfo *Loop,
1430 MapVector<Value *, Value *> AlignedVars,
1431 Value *IfCond, omp::OrderKind Order,
1432 ConstantInt *Simdlen, ConstantInt *Safelen);
1433
1434 /// Generator for '#omp flush'
1435 ///
1436 /// \param Loc The location where the flush directive was encountered
1437 LLVM_ABI void createFlush(const LocationDescription &Loc);
1438
1439 /// Generator for '#omp taskwait'
1440 ///
1441 /// \param Loc The location where the taskwait directive was encountered.
1442 LLVM_ABI void createTaskwait(const LocationDescription &Loc);
1443
1444 /// Generator for '#omp taskyield'
1445 ///
1446 /// \param Loc The location where the taskyield directive was encountered.
1447 LLVM_ABI void createTaskyield(const LocationDescription &Loc);
1448
1449 /// A struct to pack the relevant information for an OpenMP depend clause.
1450 struct DependData {
1451 omp::RTLDependenceKindTy DepKind = omp::RTLDependenceKindTy::DepUnknown;
1452 Type *DepValueType;
1453 Value *DepVal;
1454 explicit DependData() = default;
1455 DependData(omp::RTLDependenceKindTy DepKind, Type *DepValueType,
1456 Value *DepVal)
1457 : DepKind(DepKind), DepValueType(DepValueType), DepVal(DepVal) {}
1458 };
1459
1460 /// Generator for `#omp taskloop`
1461 ///
1462 /// \param Loc The location where the taskloop construct was encountered.
1463 /// \param AllocaIP The insertion point to be used for alloca instructions.
1464 /// \param BodyGenCB Callback that will generate the region code.
1465 /// \param LoopInfo Callback that returns the CLI
1466 /// \param LBVal Lowerbound value of loop
1467 /// \param UBVal Upperbound value of loop
1468 /// \param StepVal Step value of loop
1469 /// \param Tied True if the task is tied, false if the task is untied.
1470 /// \param DupCB The callback to generate the duplication code. See
1471 /// documentation for \ref TaskDupCallbackTy. This can be nullptr.
1472 /// \param TaskContextStructPtrVal If non-null, a pointer to be placed
1473 /// immediately after the {lower bound, upper
1474 /// bound, step} values in the task data.
1475 LLVM_ABI InsertPointOrErrorTy createTaskloop(
1476 const LocationDescription &Loc, InsertPointTy AllocaIP,
1477 BodyGenCallbackTy BodyGenCB,
1478 llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
1479 Value *LBVal, Value *UBVal, Value *StepVal, bool Tied = true,
1480 TaskDupCallbackTy DupCB = nullptr,
1481 Value *TaskContextStructPtrVal = nullptr);
1482
1483 /// Generator for `#omp task`
1484 ///
1485 /// \param Loc The location where the task construct was encountered.
1486 /// \param AllocaIP The insertion point to be used for alloca instructions.
1487 /// \param BodyGenCB Callback that will generate the region code.
1488 /// \param Tied True if the task is tied, false if the task is untied.
1489 /// \param Final i1 value which is `true` if the task is final, `false` if the
1490 /// task is not final.
1491 /// \param IfCondition i1 value. If it evaluates to `false`, an undeferred
1492 /// task is generated, and the encountering thread must
1493 /// suspend the current task region, for which execution
1494 /// cannot be resumed until execution of the structured
1495 /// block that is associated with the generated task is
1496 /// completed.
1497 /// \param EventHandle If present, signifies the event handle as part of
1498 /// the detach clause
1499 /// \param Mergeable If the given task is `mergeable`
1500 /// \param Priority `priority-value` specifies the execution order of the
1501 ///        tasks that are generated by the construct.
1502 LLVM_ABI InsertPointOrErrorTy
1503 createTask(const LocationDescription &Loc, InsertPointTy AllocaIP,
1504 BodyGenCallbackTy BodyGenCB, bool Tied = true,
1505 Value *Final = nullptr, Value *IfCondition = nullptr,
1506 SmallVector<DependData> Dependencies = {}, bool Mergeable = false,
1507 Value *EventHandle = nullptr, Value *Priority = nullptr);
1508
1509 /// Generator for the taskgroup construct
1510 ///
1511 /// \param Loc The location where the taskgroup construct was encountered.
1512 /// \param AllocaIP The insertion point to be used for alloca instructions.
1513 /// \param BodyGenCB Callback that will generate the region code.
1514 LLVM_ABI InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc,
1515 InsertPointTy AllocaIP,
1516 BodyGenCallbackTy BodyGenCB);
1517
1518 using FileIdentifierInfoCallbackTy =
1519 std::function<std::tuple<std::string, uint64_t>()>;
1520
1521 /// Creates a unique info for a target entry when provided a filename and
1522 /// line number.
1523 ///
1524 /// \param CallBack A callback function which should return filename the entry
1525 /// resides in as well as the line number for the target entry
1526 /// \param ParentName The name of the parent the target entry resides in, if
1527 /// any.
1528 LLVM_ABI static TargetRegionEntryInfo
1529 getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
1530 vfs::FileSystem &VFS, StringRef ParentName = "");
1531
1532 /// Enum class for the ReductionGen CallBack type to be used.
1533 enum class ReductionGenCBKind { Clang, MLIR };
1534
1535 /// ReductionGen CallBack for Clang
1536 ///
1537 /// \param CodeGenIP InsertPoint for CodeGen.
1538 /// \param Index Index of the ReductionInfo to generate code for.
1539 /// \param LHS Optionally used by Clang to return the LHS pointer it used for
1540 /// codegen, used for fixup later.
1541 /// \param RHS Optionally used by Clang to
1542 /// return the RHS pointer it used for codegen, used for fixup later.
1543 /// \param CurFn Optionally used by Clang to pass in the Current Function as
1544 /// Clang context may be old.
1545 using ReductionGenClangCBTy =
1546 std::function<InsertPointTy(InsertPointTy CodeGenIP, unsigned Index,
1547 Value **LHS, Value **RHS, Function *CurFn)>;
1548
1549 /// ReductionGen CallBack for MLIR
1550 ///
1551 /// \param CodeGenIP InsertPoint for CodeGen.
1552 /// \param LHS Pass in the LHS Value to be used for CodeGen.
1553 /// \param RHS Pass in the RHS Value to be used for CodeGen.
1554 using ReductionGenCBTy = std::function<InsertPointOrErrorTy(
1555 InsertPointTy CodeGenIP, Value *LHS, Value *RHS, Value *&Res)>;
1556
1557 /// Functions used to generate atomic reductions. Such functions take two
1558 /// Values representing pointers to LHS and RHS of the reduction, as well as
1559 /// the element type of these pointers. They are expected to atomically
1560 /// update the LHS to the reduced value.
1561 using ReductionGenAtomicCBTy = std::function<InsertPointOrErrorTy(
1562 InsertPointTy, Type *, Value *, Value *)>;
1563
1564 using ReductionGenDataPtrPtrCBTy = std::function<InsertPointOrErrorTy(
1565 InsertPointTy, Value *ByRefVal, Value *&Res)>;
1566
1567 /// Enum class for reduction evaluation types scalar, complex and aggregate.
1568 enum class EvalKind { Scalar, Complex, Aggregate };
1569
1570 /// Information about an OpenMP reduction.
1571 struct ReductionInfo {
1572 ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable,
1573 EvalKind EvaluationKind, ReductionGenCBTy ReductionGen,
1574 ReductionGenClangCBTy ReductionGenClang,
1575 ReductionGenAtomicCBTy AtomicReductionGen,
1576 ReductionGenDataPtrPtrCBTy DataPtrPtrGen,
1577 Type *ByRefAllocatedType = nullptr,
1578 Type *ByRefElementType = nullptr)
1580 PrivateVariable(PrivateVariable), EvaluationKind(EvaluationKind),
1581 ReductionGen(ReductionGen), ReductionGenClang(ReductionGenClang),
1582 AtomicReductionGen(AtomicReductionGen), DataPtrPtrGen(DataPtrPtrGen),
1583 ByRefAllocatedType(ByRefAllocatedType),
1584 ByRefElementType(ByRefElementType) {}
1585
1586 ReductionInfo(Value *PrivateVariable)
1587 : ElementType(nullptr), Variable(nullptr),
1588 PrivateVariable(PrivateVariable), EvaluationKind(EvalKind::Scalar),
1589 ReductionGen(), ReductionGenClang(), AtomicReductionGen(),
1590 DataPtrPtrGen() {}
1591
1592 /// Reduction element type, must match pointee type of variable. For by-ref
1593 /// reductions, this would be just an opaque `ptr`.
1595
1596 /// Reduction variable of pointer type.
1597 Value *Variable;
1598
1599 /// Thread-private partial reduction variable.
1600 Value *PrivateVariable;
1601
1602 /// Reduction evaluation kind - scalar, complex or aggregate.
1603 EvalKind EvaluationKind;
1604
1605 /// Callback for generating the reduction body. The IR produced by this will
1606 /// be used to combine two values in a thread-safe context, e.g., under
1607 /// lock or within the same thread, and therefore need not be atomic.
1608 ReductionGenCBTy ReductionGen;
1609
1610 /// Clang callback for generating the reduction body. The IR produced by
1611 /// this will be used to combine two values in a thread-safe context, e.g.,
1612 /// under lock or within the same thread, and therefore need not be atomic.
1613 ReductionGenClangCBTy ReductionGenClang;
1614
1615 /// Callback for generating the atomic reduction body, may be null. The IR
1616 /// produced by this will be used to atomically combine two values during
1617 /// reduction. If null, the implementation will use the non-atomic version
1618 /// along with the appropriate synchronization mechanisms.
1619 ReductionGenAtomicCBTy AtomicReductionGen;
1620
1621 ReductionGenDataPtrPtrCBTy DataPtrPtrGen;
1622
1623 /// For by-ref reductions, we need to keep track of 2 extra types that are
1624 /// potentially different:
1625 /// * The allocated type is the type of the storage allocated by the
1626 /// reduction op's `alloc` region. For example, for allocatables and arrays,
1627 /// this type would be the descriptor/box struct.
1628 Type *ByRefAllocatedType;
1629
1630 /// * The by-ref element type is the type of the actual storage needed for
1631 /// the data of the allocatable or array. For example, an float allocatable
1632 /// of would need some float storage to store intermediate reduction
1633 /// results.
1634 Type *ByRefElementType;
1635 };
1636
  /// Direction/strategy of a reduce-list copy (see emitReductionListCopy).
  enum class CopyAction : unsigned {
    // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
    // the warp using shuffle instructions.
    RemoteLaneToThread,
    // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
    ThreadCopy,
  };

  /// Optional operands for a reduce-list copy; a field is null when it is not
  /// needed by the chosen CopyAction.
  struct CopyOptionsTy {
    // Lane offset to fetch from — presumably only meaningful for
    // CopyAction::RemoteLaneToThread (shuffle-based copies); confirm at uses.
    Value *RemoteLaneOffset = nullptr;
    // Index into a scratchpad buffer — NOTE(review): usage not visible in this
    // chunk; verify against emitReductionListCopy's implementation.
    Value *ScratchpadIndex = nullptr;
    // Width (stride) of the scratchpad buffer — same caveat as above.
    Value *ScratchpadWidth = nullptr;
  };
1650
1651 /// Supporting functions for Reductions CodeGen.
1652private:
1653 /// Get the id of the current thread on the GPU.
1654 Value *getGPUThreadID();
1655
1656 /// Get the GPU warp size.
1657 Value *getGPUWarpSize();
1658
1659 /// Get the id of the warp in the block.
1660 /// We assume that the warp size is 32, which is always the case
1661 /// on the NVPTX device, to generate more efficient code.
1662 Value *getNVPTXWarpID();
1663
1664 /// Get the id of the current lane in the Warp.
1665 /// We assume that the warp size is 32, which is always the case
1666 /// on the NVPTX device, to generate more efficient code.
1667 Value *getNVPTXLaneID();
1668
1669 /// Cast value to the specified type.
1670 Value *castValueToType(InsertPointTy AllocaIP, Value *From, Type *ToType);
1671
1672 /// This function creates calls to one of two shuffle functions to copy
1673 /// variables between lanes in a warp.
1674 Value *createRuntimeShuffleFunction(InsertPointTy AllocaIP, Value *Element,
1675 Type *ElementType, Value *Offset);
1676
1677 /// Function to shuffle over the value from the remote lane.
1678 void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr,
1679 Type *ElementType, Value *Offset, Type *ReductionArrayTy,
1680 bool IsByRefElem);
1681
1682 /// Emit instructions to copy a Reduce list, which contains partially
1683 /// aggregated values, in the specified direction.
1684 Error emitReductionListCopy(
1685 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
1686 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
1687 ArrayRef<bool> IsByRef,
1688 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr});
1689
1690 /// Emit a helper that reduces data across two OpenMP threads (lanes)
1691 /// in the same warp. It uses shuffle instructions to copy over data from
1692 /// a remote lane's stack. The reduction algorithm performed is specified
1693 /// by the fourth parameter.
1694 ///
1695 /// Algorithm Versions.
1696 /// Full Warp Reduce (argument value 0):
1697 /// This algorithm assumes that all 32 lanes are active and gathers
1698 /// data from these 32 lanes, producing a single resultant value.
1699 /// Contiguous Partial Warp Reduce (argument value 1):
1700 /// This algorithm assumes that only a *contiguous* subset of lanes
1701 /// are active. This happens for the last warp in a parallel region
1702 /// when the user specified num_threads is not an integer multiple of
1703 /// 32. This contiguous subset always starts with the zeroth lane.
1704 /// Partial Warp Reduce (argument value 2):
1705 /// This algorithm gathers data from any number of lanes at any position.
1706 /// All reduced values are stored in the lowest possible lane. The set
1707 /// of problems every algorithm addresses is a super set of those
1708 /// addressable by algorithms with a lower version number. Overhead
1709 /// increases as algorithm version increases.
1710 ///
1711 /// Terminology
1712 /// Reduce element:
1713 /// Reduce element refers to the individual data field with primitive
1714 /// data types to be combined and reduced across threads.
1715 /// Reduce list:
1716 /// Reduce list refers to a collection of local, thread-private
1717 /// reduce elements.
1718 /// Remote Reduce list:
1719 /// Remote Reduce list refers to a collection of remote (relative to
1720 /// the current thread) reduce elements.
1721 ///
1722 /// We distinguish between three states of threads that are important to
1723 /// the implementation of this function.
1724 /// Alive threads:
1725 /// Threads in a warp executing the SIMT instruction, as distinguished from
1726 /// threads that are inactive due to divergent control flow.
1727 /// Active threads:
1728 /// The minimal set of threads that has to be alive upon entry to this
1729 /// function. The computation is correct iff active threads are alive.
1730 /// Some threads are alive but they are not active because they do not
1731 /// contribute to the computation in any useful manner. Turning them off
1732 /// may introduce control flow overheads without any tangible benefits.
1733 /// Effective threads:
1734 /// In order to comply with the argument requirements of the shuffle
1735 /// function, we must keep all lanes holding data alive. But at most
1736 /// half of them perform value aggregation; we refer to this half of
1737 /// threads as effective. The other half is simply handing off their
1738 /// data.
1739 ///
1740 /// Procedure
1741 /// Value shuffle:
1742 /// In this step active threads transfer data from higher lane positions
1743 /// in the warp to lower lane positions, creating Remote Reduce list.
1744 /// Value aggregation:
1745 /// In this step, effective threads combine their thread local Reduce list
1746 /// with Remote Reduce list and store the result in the thread local
1747 /// Reduce list.
1748 /// Value copy:
1749 /// In this step, we deal with the assumption made by algorithm 2
1750 /// (i.e. contiguity assumption). When we have an odd number of lanes
1751 /// active, say 2k+1, only k threads will be effective and therefore k
1752 /// new values will be produced. However, the Reduce list owned by the
1753 /// (2k+1)th thread is ignored in the value aggregation. Therefore
1754 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
1755 /// that the contiguity assumption still holds.
1756 ///
1757 /// \param ReductionInfos Array type containing the ReductionOps.
1758 /// \param ReduceFn The reduction function.
1759 /// \param FuncAttrs Optional param to specify any function attributes that
1760 /// need to be copied to the new function.
1761 /// \param IsByRef For each reduction clause, whether the reduction is by-ref
1762 /// or not.
1763 ///
1764 /// \return The ShuffleAndReduce function.
1765 Expected<Function *> emitShuffleAndReduceFunction(
1766 ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
1767 Function *ReduceFn, AttributeList FuncAttrs, ArrayRef<bool> IsByRef);
1768
1769 /// Helper function for CreateCanonicalScanLoops to create InputLoop
1770 /// in the firstGen and Scan Loop in the SecondGen
1771 /// \param InputLoopGen Callback for generating the loop for input phase
1772 /// \param ScanLoopGen Callback for generating the loop for scan phase
1773 /// \param ScanRedInfo Pointer to the ScanInfo objected created using
1774 /// `ScanInfoInitialize`.
1775 ///
1776 /// \return error if any produced, else return success.
1777 Error emitScanBasedDirectiveIR(
1778 llvm::function_ref<Error()> InputLoopGen,
1779 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
1780 ScanInfo *ScanRedInfo);
1781
1782 /// Creates the basic blocks required for scan reduction.
1783 /// \param ScanRedInfo Pointer to the ScanInfo object created using
1784 /// `ScanInfoInitialize`.
1785 void createScanBBs(ScanInfo *ScanRedInfo);
1786
1787 /// Dynamically allocates the buffer needed for scan reduction.
1788 /// \param AllocaIP The IP where possibly-shared pointer of buffer needs to
1789 /// be declared.
1790 /// \param ScanVars Scan Variables.
1791 /// \param ScanRedInfo Pointer to the ScanInfo object created using
1792 /// `ScanInfoInitialize`.
1793 ///
1794 /// \return error if any produced, else return success.
1795 Error emitScanBasedDirectiveDeclsIR(InsertPointTy AllocaIP,
1796 ArrayRef<llvm::Value *> ScanVars,
1797 ArrayRef<llvm::Type *> ScanVarsType,
1798 ScanInfo *ScanRedInfo);
1799
1800 /// Copies the result back to the reduction variable.
1801 /// \param ReductionInfos Array type containing the ReductionOps.
1802 /// \param ScanRedInfo Pointer to the ScanInfo object created using
1803 /// `ScanInfoInitialize`.
1804 ///
1805 /// \return error if any produced, else return success.
1806 Error emitScanBasedDirectiveFinalsIR(
1807 ArrayRef<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos,
1808 ScanInfo *ScanInfo);
1809
1810 /// This function emits a helper that gathers Reduce lists from the first
1811 /// lane of every active warp to lanes in the first warp.
1812 ///
1813 /// void inter_warp_copy_func(void* reduce_data, num_warps)
1814 /// shared smem[warp_size];
1815 /// For all data entries D in reduce_data:
1816 /// sync
1817 /// If (I am the first lane in each warp)
1818 /// Copy my local D to smem[warp_id]
1819 /// sync
1820 /// if (I am the first warp)
1821 /// Copy smem[thread_id] to my local D
1822 ///
1823 /// \param Loc The insert and source location description.
1824 /// \param ReductionInfos Array type containing the ReductionOps.
1825 /// \param FuncAttrs Optional param to specify any function attributes that
1826 /// need to be copied to the new function.
1827 /// \param IsByRef For each reduction clause, whether the reduction is by-ref
1828 /// or not.
1829 ///
1830 /// \return The InterWarpCopy function.
1831 Expected<Function *>
1832 emitInterWarpCopyFunction(const LocationDescription &Loc,
1833 ArrayRef<ReductionInfo> ReductionInfos,
1834 AttributeList FuncAttrs, ArrayRef<bool> IsByRef);
1835
1836 /// This function emits a helper that copies all the reduction variables from
1837 /// the team into the provided global buffer for the reduction variables.
1838 ///
1839 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
1840 /// For all data entries D in reduce_data:
1841 /// Copy local D to buffer.D[Idx]
1842 ///
1843 /// \param ReductionInfos Array type containing the ReductionOps.
1844 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1845 /// \param FuncAttrs Optional param to specify any function attributes that
1846 /// need to be copied to the new function.
1847 ///
1848 /// \return The ListToGlobalCopy function.
1849 Expected<Function *>
1850 emitListToGlobalCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
1851 Type *ReductionsBufferTy,
1852 AttributeList FuncAttrs, ArrayRef<bool> IsByRef);
1853
1854 /// This function emits a helper that copies all the reduction variables from
1855 /// the team into the provided global buffer for the reduction variables.
1856 ///
1857 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
1858 /// For all data entries D in reduce_data:
1859 /// Copy buffer.D[Idx] to local D;
1860 ///
1861 /// \param ReductionInfos Array type containing the ReductionOps.
1862 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1863 /// \param FuncAttrs Optional param to specify any function attributes that
1864 /// need to be copied to the new function.
1865 ///
1866 /// \return The GlobalToList function.
1867 Expected<Function *>
1868 emitGlobalToListCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
1869 Type *ReductionsBufferTy,
1870 AttributeList FuncAttrs, ArrayRef<bool> IsByRef);
1871
1872 /// This function emits a helper that reduces all the reduction variables from
1873 /// the team into the provided global buffer for the reduction variables.
1874 ///
1875 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
1876 /// void *GlobPtrs[];
1877 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
1878 /// ...
1879 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
1880 /// reduce_function(GlobPtrs, reduce_data);
1881 ///
1882 /// \param ReductionInfos Array type containing the ReductionOps.
1883 /// \param ReduceFn The reduction function.
1884 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1885 /// \param FuncAttrs Optional param to specify any function attributes that
1886 /// need to be copied to the new function.
1887 ///
1888 /// \return The ListToGlobalReduce function.
1889 Expected<Function *>
1890 emitListToGlobalReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
1891 Function *ReduceFn, Type *ReductionsBufferTy,
1892 AttributeList FuncAttrs,
1893 ArrayRef<bool> IsByRef);
1894
1895 /// This function emits a helper that reduces all the reduction variables from
1896 /// the team into the provided global buffer for the reduction variables.
1897 ///
1898 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
1899 /// void *GlobPtrs[];
1900 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
1901 /// ...
1902 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
1903 /// reduce_function(reduce_data, GlobPtrs);
1904 ///
1905 /// \param ReductionInfos Array type containing the ReductionOps.
1906 /// \param ReduceFn The reduction function.
1907 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1908 /// \param FuncAttrs Optional param to specify any function attributes that
1909 /// need to be copied to the new function.
1910 ///
1911 /// \return The GlobalToListReduce function.
1912 Expected<Function *>
1913 emitGlobalToListReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
1914 Function *ReduceFn, Type *ReductionsBufferTy,
1915 AttributeList FuncAttrs,
1916 ArrayRef<bool> IsByRef);
1917
1918 /// Get the function name of a reduction function.
1919 std::string getReductionFuncName(StringRef Name) const;
1920
1921 /// Emits reduction function.
1922 /// \param ReducerName Name of the function calling the reduction.
1923 /// \param ReductionInfos Array type containing the ReductionOps.
1924 /// \param ReductionGenCBKind Optional param to specify Clang or MLIR
1925 /// CodeGenCB kind.
1926 /// \param FuncAttrs Optional param to specify any function attributes that
1927 /// need to be copied to the new function.
1928 ///
1929 /// \return The reduction function.
1930 Expected<Function *> createReductionFunction(
1931 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
1932 ArrayRef<bool> IsByRef,
1933 ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
1934 AttributeList FuncAttrs = {});
1935
1936public:
1937 ///
1938 /// Design of OpenMP reductions on the GPU
1939 ///
1940 /// Consider a typical OpenMP program with one or more reduction
1941 /// clauses:
1942 ///
1943 /// float foo;
1944 /// double bar;
1945 /// #pragma omp target teams distribute parallel for \
1946 /// reduction(+:foo) reduction(*:bar)
1947 /// for (int i = 0; i < N; i++) {
1948 /// foo += A[i]; bar *= B[i];
1949 /// }
1950 ///
1951 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
1952 /// all teams. In our OpenMP implementation on the NVPTX device an
1953 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1954 /// within a team are mapped to CUDA threads within a threadblock.
1955 /// Our goal is to efficiently aggregate values across all OpenMP
1956 /// threads such that:
1957 ///
1958 /// - the compiler and runtime are logically concise, and
1959 /// - the reduction is performed efficiently in a hierarchical
1960 /// manner as follows: within OpenMP threads in the same warp,
1961 /// across warps in a threadblock, and finally across teams on
1962 /// the NVPTX device.
1963 ///
1964 /// Introduction to Decoupling
1965 ///
1966 /// We would like to decouple the compiler and the runtime so that the
1967 /// latter is ignorant of the reduction variables (number, data types)
1968 /// and the reduction operators. This allows a simpler interface
1969 /// and implementation while still attaining good performance.
1970 ///
1971 /// Pseudocode for the aforementioned OpenMP program generated by the
1972 /// compiler is as follows:
1973 ///
1974 /// 1. Create private copies of reduction variables on each OpenMP
1975 /// thread: 'foo_private', 'bar_private'
1976 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1977 /// to it and writes the result in 'foo_private' and 'bar_private'
1978 /// respectively.
1979 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
1980 /// and store the result on the team master:
1981 ///
1982 /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
1983 /// reduceData, shuffleReduceFn, interWarpCpyFn)
1984 ///
1985 /// where:
1986 /// struct ReduceData {
1987 /// double *foo;
1988 /// double *bar;
1989 /// } reduceData
1990 /// reduceData.foo = &foo_private
1991 /// reduceData.bar = &bar_private
1992 ///
1993 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1994 /// auxiliary functions generated by the compiler that operate on
1995 /// variables of type 'ReduceData'. They aid the runtime perform
1996 /// algorithmic steps in a data agnostic manner.
1997 ///
1998 /// 'shuffleReduceFn' is a pointer to a function that reduces data
1999 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
2000 /// same warp. It takes the following arguments as input:
2001 ///
2002 /// a. variable of type 'ReduceData' on the calling lane,
2003 /// b. its lane_id,
2004 /// c. an offset relative to the current lane_id to generate a
2005 /// remote_lane_id. The remote lane contains the second
2006 /// variable of type 'ReduceData' that is to be reduced.
2007 /// d. an algorithm version parameter determining which reduction
2008 /// algorithm to use.
2009 ///
2010 /// 'shuffleReduceFn' retrieves data from the remote lane using
2011 /// efficient GPU shuffle intrinsics and reduces, using the
2012 /// algorithm specified by the 4th parameter, the two operands
2013 /// element-wise. The result is written to the first operand.
2014 ///
2015 /// Different reduction algorithms are implemented in different
2016 /// runtime functions, all calling 'shuffleReduceFn' to perform
2017 /// the essential reduction step. Therefore, based on the 4th
2018 /// parameter, this function behaves slightly differently to
2019 /// cooperate with the runtime to ensure correctness under
2020 /// different circumstances.
2021 ///
2022 /// 'InterWarpCpyFn' is a pointer to a function that transfers
2023 /// reduced variables across warps. It tunnels, through CUDA
2024 /// shared memory, the thread-private data of type 'ReduceData'
2025 /// from lane 0 of each warp to a lane in the first warp.
2026 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
2027 /// The last team writes the global reduced value to memory.
2028 ///
2029 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
2030 /// reduceData, shuffleReduceFn, interWarpCpyFn,
2031 /// scratchpadCopyFn, loadAndReduceFn)
2032 ///
2033 /// 'scratchpadCopyFn' is a helper that stores reduced
2034 /// data from the team master to a scratchpad array in
2035 /// global memory.
2036 ///
2037 /// 'loadAndReduceFn' is a helper that loads data from
2038 /// the scratchpad array and reduces it with the input
2039 /// operand.
2040 ///
2041 /// These compiler generated functions hide address
2042 /// calculation and alignment information from the runtime.
2043 /// 5. if ret == 1:
2044 /// The team master of the last team stores the reduced
2045 /// result to the globals in memory.
2046 /// foo += reduceData.foo; bar *= reduceData.bar
2047 ///
2048 ///
2049 /// Warp Reduction Algorithms
2050 ///
2051 /// On the warp level, we have three algorithms implemented in the
2052 /// OpenMP runtime depending on the number of active lanes:
2053 ///
2054 /// Full Warp Reduction
2055 ///
2056 /// The reduce algorithm within a warp where all lanes are active
2057 /// is implemented in the runtime as follows:
2058 ///
2059 /// full_warp_reduce(void *reduce_data,
2060 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2061 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
2062 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
2063 /// }
2064 ///
2065 /// The algorithm completes in log(2, WARPSIZE) steps.
2066 ///
2067 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
2068 /// not used therefore we save instructions by not retrieving lane_id
2069 /// from the corresponding special registers. The 4th parameter, which
2070 /// represents the version of the algorithm being used, is set to 0 to
2071 /// signify full warp reduction.
2072 ///
2073 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2074 ///
2075 /// #reduce_elem refers to an element in the local lane's data structure
2076 /// #remote_elem is retrieved from a remote lane
2077 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2078 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
2079 ///
2080 /// Contiguous Partial Warp Reduction
2081 ///
2082 /// This reduce algorithm is used within a warp where only the first
2083 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
2084 /// number of OpenMP threads in a parallel region is not a multiple of
2085 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
2086 ///
2087 /// void
2088 /// contiguous_partial_reduce(void *reduce_data,
2089 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
2090 /// int size, int lane_id) {
2091 /// int curr_size;
2092 /// int offset;
2093 /// curr_size = size;
2094 ///             offset = curr_size/2;
2095 /// while (offset>0) {
2096 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
2097 /// curr_size = (curr_size+1)/2;
2098 /// offset = curr_size/2;
2099 /// }
2100 /// }
2101 ///
2102 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2103 ///
2104 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2105 /// if (lane_id < offset)
2106 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
2107 /// else
2108 /// reduce_elem = remote_elem
2109 ///
2110 /// This algorithm assumes that the data to be reduced are located in a
2111 /// contiguous subset of lanes starting from the first. When there is
2112 /// an odd number of active lanes, the data in the last lane is not
2113 /// aggregated with any other lane's data but is instead copied over.
2114 ///
2115 /// Dispersed Partial Warp Reduction
2116 ///
2117 /// This algorithm is used within a warp when any discontiguous subset of
2118 /// lanes are active. It is used to implement the reduction operation
2119 /// across lanes in an OpenMP simd region or in a nested parallel region.
2120 ///
2121 /// void
2122 /// dispersed_partial_reduce(void *reduce_data,
2123 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2124 /// int size, remote_id;
2125 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
2126 /// do {
2127 /// remote_id = next_active_lane_id_right_after_me();
2128 ///      # the above function returns 0 if no active lane
2129 /// # is present right after the current lane.
2130 /// size = number_of_active_lanes_in_this_warp();
2131 /// logical_lane_id /= 2;
2132 /// ShuffleReduceFn(reduce_data, logical_lane_id,
2133 /// remote_id-1-threadIdx.x, 2);
2134 /// } while (logical_lane_id % 2 == 0 && size > 1);
2135 /// }
2136 ///
2137 /// There is no assumption made about the initial state of the reduction.
2138 /// Any number of lanes (>=1) could be active at any position. The reduction
2139 /// result is returned in the first active lane.
2140 ///
2141 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2142 ///
2143 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2144 /// if (lane_id % 2 == 0 && offset > 0)
2145 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
2146 /// else
2147 /// reduce_elem = remote_elem
2148 ///
2149 ///
2150 /// Intra-Team Reduction
2151 ///
2152 /// This function, as implemented in the runtime call
2153 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
2154 /// threads in a team. It first reduces within a warp using the
2155 /// aforementioned algorithms. We then proceed to gather all such
2156 /// reduced values at the first warp.
2157 ///
2158 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
2159 /// data from each of the "warp master" (zeroth lane of each warp, where
2160 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
2161 /// a mathematical sense) the problem of reduction across warp masters in
2162 /// a block to the problem of warp reduction.
2163 ///
2164 ///
2165 /// Inter-Team Reduction
2166 ///
2167 /// Once a team has reduced its data to a single value, it is stored in
2168 /// a global scratchpad array. Since each team has a distinct slot, this
2169 /// can be done without locking.
2170 ///
2171 /// The last team to write to the scratchpad array proceeds to reduce the
2172 /// scratchpad array. One or more workers in the last team use the helper
2173 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
2174 /// the k'th worker reduces every k'th element.
2175 ///
2176 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
2177 /// reduce across workers and compute a globally reduced value.
2178 ///
2179 /// \param Loc The location where the reduction was
2180 /// encountered. Must be within the associated
2181 /// directive and after the last local access to the
2182 /// reduction variables.
2183 /// \param AllocaIP An insertion point suitable for allocas usable
2184 /// in reductions.
2185 /// \param CodeGenIP An insertion point suitable for code
2186 /// generation.
2187 /// \param ReductionInfos A list of info on each reduction
2188 /// variable.
2189 /// \param IsNoWait Optional flag set if the reduction is
2190 /// marked as nowait.
2191 /// \param IsByRef For each reduction clause, whether the reduction is by-ref.
2192 /// \param IsTeamsReduction Optional flag set if it is a teams
2193 /// reduction.
2194 /// \param GridValue Optional GPU grid value.
2195 /// \param ReductionBufNum Optional OpenMPCUDAReductionBufNumValue to be
2196 /// used for teams reduction.
2197 /// \param SrcLocInfo Source location information global.
2198 LLVM_ABI InsertPointOrErrorTy createReductionsGPU(
2199 const LocationDescription &Loc, InsertPointTy AllocaIP,
2200 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
2201 ArrayRef<bool> IsByRef, bool IsNoWait = false,
2202 bool IsTeamsReduction = false,
2203 ReductionGenCBKind ReductionGenCBKind = ReductionGenCBKind::MLIR,
2204 std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024,
2205 Value *SrcLocInfo = nullptr);
2206
2207 // TODO: provide atomic and non-atomic reduction generators for reduction
2208 // operators defined by the OpenMP specification.
2209
2210 /// Generator for '#omp reduction'.
2211 ///
2212 /// Emits the IR instructing the runtime to perform the specific kind of
2213 /// reductions. Expects reduction variables to have been privatized and
2214 /// initialized to reduction-neutral values separately. Emits the calls to
2215 /// runtime functions as well as the reduction function and the basic blocks
2216 /// performing the reduction atomically and non-atomically.
2217 ///
2218 /// The code emitted for the following:
2219 ///
2220 /// \code
2221 /// type var_1;
2222 /// type var_2;
2223 /// #pragma omp <directive> reduction(reduction-op:var_1,var_2)
2224 /// /* body */;
2225 /// \endcode
2226 ///
2227 /// corresponds to the following sketch.
2228 ///
2229 /// \code
2230 /// void _outlined_par() {
2231 /// // N is the number of different reductions.
2232 /// void *red_array[] = {privatized_var_1, privatized_var_2, ...};
2233 /// switch(__kmpc_reduce(..., N, /*size of data in red array*/, red_array,
2234 /// _omp_reduction_func,
2235 /// _gomp_critical_user.reduction.var)) {
2236 /// case 1: {
2237 /// var_1 = var_1 <reduction-op> privatized_var_1;
2238 /// var_2 = var_2 <reduction-op> privatized_var_2;
2239 /// // ...
2240 /// __kmpc_end_reduce(...);
2241 /// break;
2242 /// }
2243 /// case 2: {
2244 /// _Atomic<ReductionOp>(var_1, privatized_var_1);
2245 /// _Atomic<ReductionOp>(var_2, privatized_var_2);
2246 /// // ...
2247 /// break;
2248 /// }
2249 /// default: break;
2250 /// }
2251 /// }
2252 ///
2253 /// void _omp_reduction_func(void **lhs, void **rhs) {
2254 /// *(type *)lhs[0] = *(type *)lhs[0] <reduction-op> *(type *)rhs[0];
2255 /// *(type *)lhs[1] = *(type *)lhs[1] <reduction-op> *(type *)rhs[1];
2256 /// // ...
2257 /// }
2258 /// \endcode
2259 ///
2260 /// \param Loc The location where the reduction was
2261 /// encountered. Must be within the associated
2262 /// directive and after the last local access to the
2263 /// reduction variables.
2264 /// \param AllocaIP An insertion point suitable for allocas usable
2265 /// in reductions.
2266 /// \param ReductionInfos A list of info on each reduction variable.
2267 /// \param IsNoWait A flag set if the reduction is marked as nowait.
2268 /// \param IsByRef A flag set if the reduction is using reference
2269 /// or direct value.
2270 /// \param IsTeamsReduction Optional flag set if it is a teams
2271 /// reduction.
2272 LLVM_ABI InsertPointOrErrorTy createReductions(
2273 const LocationDescription &Loc, InsertPointTy AllocaIP,
2274 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
2275 bool IsNoWait = false, bool IsTeamsReduction = false);
2276
2277 ///}
2278
2279 /// Return the insertion point used by the underlying IRBuilder.
2280 InsertPointTy getInsertionPoint() { return Builder.saveIP(); }
2281
2282 /// Update the internal location to \p Loc.
2283 bool updateToLocation(const LocationDescription &Loc) {
2284 Builder.restoreIP(Loc.IP);
2285 Builder.SetCurrentDebugLocation(Loc.DL);
2286 return Loc.IP.getBlock() != nullptr;
2287 }
2288
2289 /// Return the function declaration for the runtime function with \p FnID.
2290 LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M,
2291 omp::RuntimeFunction FnID);
2292
2293 LLVM_ABI Function *getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID);
2294
2295 CallInst *createRuntimeFunctionCall(FunctionCallee Callee,
2296 ArrayRef<Value *> Args,
2297 StringRef Name = "");
2298
2299 /// Return the (LLVM-IR) string describing the source location \p LocStr.
2300 LLVM_ABI Constant *getOrCreateSrcLocStr(StringRef LocStr,
2301 uint32_t &SrcLocStrSize);
2302
2303 /// Return the (LLVM-IR) string describing the default source location.
2304 LLVM_ABI Constant *getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize);
2305
2306 /// Return the (LLVM-IR) string describing the source location identified by
2307 /// the arguments.
2308 LLVM_ABI Constant *getOrCreateSrcLocStr(StringRef FunctionName,
2309 StringRef FileName, unsigned Line,
2310 unsigned Column,
2311 uint32_t &SrcLocStrSize);
2312
2313 /// Return the (LLVM-IR) string describing the DebugLoc \p DL. Use \p F as
2314 /// fallback if \p DL does not specify the function name.
2315 LLVM_ABI Constant *getOrCreateSrcLocStr(DebugLoc DL, uint32_t &SrcLocStrSize,
2316 Function *F = nullptr);
2317
2318 /// Return the (LLVM-IR) string describing the source location \p Loc.
2319 LLVM_ABI Constant *getOrCreateSrcLocStr(const LocationDescription &Loc,
2320 uint32_t &SrcLocStrSize);
2321
2322 /// Return an ident_t* encoding the source location \p SrcLocStr and \p Flags.
2323 /// TODO: Create a enum class for the Reserve2Flags
2324 LLVM_ABI Constant *getOrCreateIdent(Constant *SrcLocStr,
2325 uint32_t SrcLocStrSize,
2326 omp::IdentFlag Flags = omp::IdentFlag(0),
2327 unsigned Reserve2Flags = 0);
2328
2329 /// Create a hidden global flag \p Name in the module with initial value \p
2330 /// Value.
2331 LLVM_ABI GlobalValue *createGlobalFlag(unsigned Value, StringRef Name);
2332
2333 /// Emit the llvm.used metadata.
2334 LLVM_ABI void emitUsed(StringRef Name, ArrayRef<llvm::WeakTrackingVH> List);
2335
2336 /// Emit the kernel execution mode.
2337 LLVM_ABI GlobalVariable *
2338 emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode);
2339
2340 /// Generate control flow and cleanup for cancellation.
2341 ///
2342 /// \param CancelFlag Flag indicating if the cancellation is performed.
2343 /// \param CanceledDirective The kind of directive that is canceled.
2344 /// \param ExitCB Extra code to be generated in the exit block.
2345 ///
2346 /// \return an error, if any were triggered during execution.
2347 LLVM_ABI Error emitCancelationCheckImpl(Value *CancelFlag,
2348 omp::Directive CanceledDirective);
2349
2350 /// Generate a target region entry call.
2351 ///
2352 /// \param Loc The location at which the request originated and is fulfilled.
2353 /// \param AllocaIP The insertion point to be used for alloca instructions.
2354 /// \param Return Return value of the created function returned by reference.
2355 /// \param DeviceID Identifier for the device via the 'device' clause.
2356 /// \param NumTeams Number of teams for the region via the 'num_teams' clause
2357 /// or 0 if unspecified and -1 if there is no 'teams' clause.
2358 /// \param NumThreads Number of threads via the 'thread_limit' clause.
2359 /// \param HostPtr Pointer to the host-side pointer of the target kernel.
2360 /// \param KernelArgs Array of arguments to the kernel.
2361 LLVM_ABI InsertPointTy emitTargetKernel(const LocationDescription &Loc,
2362 InsertPointTy AllocaIP,
2363 Value *&Return, Value *Ident,
2364 Value *DeviceID, Value *NumTeams,
2365 Value *NumThreads, Value *HostPtr,
2366 ArrayRef<Value *> KernelArgs);
2367
2368 /// Generate a flush runtime call.
2369 ///
2370 /// \param Loc The location at which the request originated and is fulfilled.
2371 LLVM_ABI void emitFlush(const LocationDescription &Loc);
2372
2373 /// The finalization stack made up of finalize callbacks currently in-flight,
2374 /// wrapped into FinalizationInfo objects that reference also the finalization
2375 /// target block and the kind of cancellable directive.
2376 SmallVector<FinalizationInfo, 8> FinalizationStack;
2377
2378 /// Return true if the last entry in the finalization stack is of kind \p DK
2379 /// and cancellable.
2380 bool isLastFinalizationInfoCancellable(omp::Directive DK) {
2381 return !FinalizationStack.empty() &&
2382 FinalizationStack.back().IsCancellable &&
2383 FinalizationStack.back().DK == DK;
2384 }
2385
2386 /// Generate a taskwait runtime call.
2387 ///
2388 /// \param Loc The location at which the request originated and is fulfilled.
2389 LLVM_ABI void emitTaskwaitImpl(const LocationDescription &Loc);
2390
2391 /// Generate a taskyield runtime call.
2392 ///
2393 /// \param Loc The location at which the request originated and is fulfilled.
2394 LLVM_ABI void emitTaskyieldImpl(const LocationDescription &Loc);
2395
2396 /// Return the current thread ID.
2397 ///
2398 /// \param Ident The ident (ident_t*) describing the query origin.
2399 LLVM_ABI Value *getOrCreateThreadID(Value *Ident);
2400
2401 /// The OpenMPIRBuilder Configuration
2402 OpenMPIRBuilderConfig Config;
2403
2404 /// The underlying LLVM-IR module
2405 Module &M;
2406
2407 /// The LLVM-IR Builder used to create IR.
2408 IRBuilder<> Builder;
2409
2410 /// Map to remember source location strings
2411 StringMap<Constant *> SrcLocStrMap;
2412
2413 /// Map to remember existing ident_t*.
2414 DenseMap<std::pair<Constant *, uint64_t>, Constant *> IdentMap;
2415
2416 /// Info manager to keep track of target regions.
2417 OffloadEntriesInfoManager OffloadInfoManager;
2418
2419 /// The target triple of the underlying module.
2420 const Triple T;
2421
/// Helper that contains information about regions we need to outline
/// during finalization.
struct OutlineInfo {
  /// Callback type invoked on the outlined function once outlining is done.
  using PostOutlineCBTy = std::function<void(Function &)>;
  /// Callback invoked with the outlined function after it has been created;
  /// may be empty.
  PostOutlineCBTy PostOutlineCB;
  /// Boundary blocks of the region to outline (EntryBB/ExitBB) and the block
  /// holding the allocas of the surrounding ("outer") function.
  BasicBlock *EntryBB, *ExitBB, *OuterAllocaBB;
  /// Values that should not be passed to the outlined function via the
  /// argument aggregate.
  SmallVector<Value *, 2> ExcludeArgsFromAggregate;
  /// Values live into (Inputs) and out of (Outputs) the region.
  SetVector<Value *> Inputs, Outputs;
  // TODO: this should be safe to enable by default
  // NOTE(review): presumably makes the outliner fix up allocas that are not
  // in the entry block -- confirm against the outlining implementation.
  bool FixUpNonEntryAllocas = false;

  /// Collect all blocks in between EntryBB and ExitBB in both the given
  /// vector and set.
  LLVM_ABI void collectBlocks(SmallPtrSetImpl<BasicBlock *> &BlockSet,
                              SmallVectorImpl<BasicBlock *> &BlockVector);

  /// Return the function that contains the region to be outlined.
  Function *getFunction() const { return EntryBB->getParent(); }
};
2441
2442 /// Collection of regions that need to be outlined during finalization.
2443 SmallVector<OutlineInfo, 16> OutlineInfos;
2444
2445 /// A collection of candidate target functions whose constant allocas will
2446 /// attempt to be raised on a call of finalize after all currently enqueued
2447 /// outline info's have been processed.
2448 SmallVector<llvm::Function *, 16> ConstantAllocaRaiseCandidates;
2449
2450 /// Collection of owned canonical loop objects that eventually need to be
2451 /// free'd.
2452 std::forward_list<CanonicalLoopInfo> LoopInfos;
2453
2454 /// Collection of owned ScanInfo objects that eventually need to be free'd.
2455 std::forward_list<ScanInfo> ScanInfos;
2456
2457 /// Add a new region that will be outlined later.
2458 void addOutlineInfo(OutlineInfo &&OI) { OutlineInfos.emplace_back(OI); }
2459
2460 /// An ordered map of auto-generated variables to their unique names.
2461 /// It stores variables with the following names: 1) ".gomp_critical_user_" +
2462 /// <critical_section_name> + ".var" for "omp critical" directives; 2)
2463 /// <mangled_name_for_global_var> + ".cache." for cache for threadprivate
2464 /// variables.
2465 StringMap<GlobalVariable *, BumpPtrAllocator> InternalVars;
2466
2467 /// Computes the size of type in bytes.
2469   LLVM_ABI Value *getSizeInBytes(Value *BasePtr);
2470 // Emit a branch from the current block to the Target block only if
2471 // the current block has a terminator.
2472 LLVM_ABI void emitBranch(BasicBlock *Target);
2473
2474 // If BB has no use then delete it and return. Else place BB after the current
2475 // block, if possible, or else at the end of the function. Also add a branch
2476 // from current block to BB if current block does not have a terminator.
2477 LLVM_ABI void emitBlock(BasicBlock *BB, Function *CurFn,
2478 bool IsFinished = false);
2479
2480 /// Emits code for OpenMP 'if' clause using specified \a BodyGenCallbackTy
2481 /// Here is the logic:
2482 /// if (Cond) {
2483 /// ThenGen();
2484 /// } else {
2485 /// ElseGen();
2486 /// }
2487 ///
2488 /// \return an error, if any were triggered during execution.
2489 LLVM_ABI Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
2490 BodyGenCallbackTy ElseGen,
2491 InsertPointTy AllocaIP = {});
2492
2493 /// Create the global variable holding the offload mappings information.
2494 LLVM_ABI GlobalVariable *
2495 createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
2496 std::string VarName);
2497
2498 /// Create the global variable holding the offload names information.
2499 LLVM_ABI GlobalVariable *
2500 createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
2501 std::string VarName);
2502
/// Allocas used when calling the OpenMP mapper runtime functions; filled in
/// by createMapperAllocas and consumed by emitMapperCall.
struct MapperAllocas {
  /// Alloca backing the base-pointers argument of the mapper call.
  AllocaInst *ArgsBase = nullptr;
  /// Alloca backing the (section) pointers argument of the mapper call.
  AllocaInst *Args = nullptr;
  /// Alloca backing the argument-sizes argument of the mapper call.
  AllocaInst *ArgSizes = nullptr;
};
2508
2509 /// Create the allocas instruction used in call to mapper functions.
2510 LLVM_ABI void createMapperAllocas(const LocationDescription &Loc,
2511 InsertPointTy AllocaIP,
2512 unsigned NumOperands,
2513 struct MapperAllocas &MapperAllocas);
2514
2515 /// Create the call for the target mapper function.
2516 /// \param Loc The source location description.
2517 /// \param MapperFunc Function to be called.
2518 /// \param SrcLocInfo Source location information global.
2519 /// \param MaptypesArg The argument types.
2520 /// \param MapnamesArg The argument names.
2521 /// \param MapperAllocas The AllocaInst used for the call.
2522 /// \param DeviceID Device ID for the call.
2523 /// \param NumOperands Number of operands in the call.
2524 LLVM_ABI void emitMapperCall(const LocationDescription &Loc,
2525 Function *MapperFunc, Value *SrcLocInfo,
2526 Value *MaptypesArg, Value *MapnamesArg,
2527 struct MapperAllocas &MapperAllocas,
2528 int64_t DeviceID, unsigned NumOperands);
2529
/// Container for the arguments used to pass data to the runtime library.
struct TargetDataRTArgs {
  /// The array of base pointer passed to the runtime library.
  Value *BasePointersArray = nullptr;
  /// The array of section pointers passed to the runtime library.
  Value *PointersArray = nullptr;
  /// The array of sizes passed to the runtime library.
  Value *SizesArray = nullptr;
  /// The array of map types passed to the runtime library for the beginning
  /// of the region or for the entire region if there are no separate map
  /// types for the region end.
  Value *MapTypesArray = nullptr;
  /// The array of map types passed to the runtime library for the end of the
  /// region, or nullptr if there are no separate map types for the region
  /// end.
  Value *MapTypesArrayEnd = nullptr;
  /// The array of user-defined mappers passed to the runtime library.
  Value *MappersArray = nullptr;
  /// The array of original declaration names of mapped pointers sent to the
  /// runtime library for debugging.
  Value *MapNamesArray = nullptr;

  /// Construct an empty argument set (all arrays null).
  explicit TargetDataRTArgs() = default;
  /// Construct from already-emitted argument arrays.
  explicit TargetDataRTArgs(Value *BasePointersArray, Value *PointersArray,
                            Value *SizesArray, Value *MapTypesArray,
                            Value *MapTypesArrayEnd, Value *MappersArray,
                            Value *MapNamesArray)
      : BasePointersArray(BasePointersArray), PointersArray(PointersArray),
        SizesArray(SizesArray), MapTypesArray(MapTypesArray),
        MapTypesArrayEnd(MapTypesArrayEnd), MappersArray(MappersArray),
        MapNamesArray(MapNamesArray) {}
};
2562
/// Container to pass the default attributes with which a kernel must be
/// launched, used to set kernel attributes and populate associated static
/// structures.
///
/// For max values, < 0 means unset, == 0 means set but unknown at compile
/// time. The number of max values will be 1 except for the case where
/// ompx_bare is set.
struct TargetKernelDefaultAttrs {
  /// Execution mode of the kernel; generic unless overridden.
  omp::OMPTgtExecModeFlags ExecFlags =
      omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
  /// Maximum number of teams; -1 means unset (see struct comment).
  SmallVector<int32_t, 3> MaxTeams = {-1};
  /// Minimum number of teams.
  int32_t MinTeams = 1;
  /// Maximum number of threads; -1 means unset (see struct comment).
  SmallVector<int32_t, 3> MaxThreads = {-1};
  /// Minimum number of threads.
  int32_t MinThreads = 1;
  /// Size of the reduction data (presumably in bytes -- confirm at the use
  /// sites); 0 if unset.
  int32_t ReductionDataSize = 0;
  /// Length of the buffer used for reductions; 0 if unset.
  int32_t ReductionBufferLength = 0;
};
2580
/// Container to pass LLVM IR runtime values or constants related to the
/// number of teams and threads with which the kernel must be launched, as
/// well as the trip count of the loop, if it is an SPMD or Generic-SPMD
/// kernel. These must be defined in the host prior to the call to the kernel
/// launch OpenMP RTL function.
struct TargetKernelRuntimeAttrs {
  /// Runtime values for the maximum/minimum number of teams.
  SmallVector<Value *, 3> MaxTeams = {nullptr};
  Value *MinTeams = nullptr;
  /// Runtime 'thread_limit' values (names suggest the 'target' and 'teams'
  /// variants respectively -- confirm against the frontends).
  SmallVector<Value *, 3> TargetThreadLimit = {nullptr};
  SmallVector<Value *, 3> TeamsThreadLimit = {nullptr};

  /// 'parallel' construct 'num_threads' clause value, if present and it is an
  /// SPMD kernel.
  Value *MaxThreads = nullptr;

  /// Total number of iterations of the SPMD or Generic-SPMD kernel or null if
  /// it is a generic kernel.
  Value *LoopTripCount = nullptr;

  /// Device ID value used in the kernel launch.
  Value *DeviceID = nullptr;
};
2603
/// Data structure that contains the needed information to construct the
/// kernel args vector.
struct TargetKernelArgs {
  /// Number of arguments passed to the runtime library.
  unsigned NumTargetItems = 0;
  /// Arguments passed to the runtime library.
  TargetDataRTArgs RTArgs;
  /// The number of iterations.
  Value *NumIterations = nullptr;
  /// The number of teams. Non-owning view; the referenced storage must
  /// outlive this struct.
  ArrayRef<Value *> NumTeams;
  /// The number of threads. Non-owning view, as above.
  ArrayRef<Value *> NumThreads;
  /// The size of the dynamic shared memory.
  Value *DynCGroupMem = nullptr;
  /// True if the kernel has 'no wait' clause.
  bool HasNoWait = false;
  /// The fallback mechanism for the shared memory.
  omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback =
      omp::OMPDynGroupprivateFallbackType::Abort;

  // Constructors for TargetKernelArgs.
  TargetKernelArgs() = default;
  TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs,
                   Value *NumIterations, ArrayRef<Value *> NumTeams,
                   ArrayRef<Value *> NumThreads, Value *DynCGroupMem,
                   bool HasNoWait,
                   omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback)
      : NumTargetItems(NumTargetItems), RTArgs(RTArgs),
        NumIterations(NumIterations), NumTeams(NumTeams),
        NumThreads(NumThreads), DynCGroupMem(DynCGroupMem),
        HasNoWait(HasNoWait), DynCGroupMemFallback(DynCGroupMemFallback) {}
};
2637
2638 /// Create the kernel args vector used by emitTargetKernel. This function
2639 /// creates various constant values that are used in the resulting args
2640 /// vector.
2641 LLVM_ABI static void getKernelArgsVector(TargetKernelArgs &KernelArgs,
2642 IRBuilderBase &Builder,
2643 SmallVector<Value *> &ArgsVector);
2644
2645 /// Struct that keeps the information that should be kept throughout
2646 /// a 'target data' region.
2647 class TargetDataInfo {
2648 /// Set to true if device pointer information have to be obtained.
2649 bool RequiresDevicePointerInfo = false;
2650 /// Set to true if Clang emits separate runtime calls for the beginning and
2651 /// end of the region. These calls might have separate map type arrays.
2652 bool SeparateBeginEndCalls = false;
2653
2654 public:
2655 TargetDataRTArgs RTArgs;
2656
2657 SmallMapVector<const Value *, std::pair<Value *, Value *>, 4>
2658 DevicePtrInfoMap;
2659
2660 /// Indicate whether any user-defined mapper exists.
2661 bool HasMapper = false;
2662 /// The total number of pointers passed to the runtime library.
2663 unsigned NumberOfPtrs = 0u;
2664
2665 bool EmitDebug = false;
2666
2667 /// Whether the `target ... data` directive has a `nowait` clause.
2668 bool HasNoWait = false;
2669
2670 explicit TargetDataInfo() = default;
2671 explicit TargetDataInfo(bool RequiresDevicePointerInfo,
2672 bool SeparateBeginEndCalls)
2673 : RequiresDevicePointerInfo(RequiresDevicePointerInfo),
2674 SeparateBeginEndCalls(SeparateBeginEndCalls) {}
2675 /// Clear information about the data arrays.
2676 void clearArrayInfo() {
2677 RTArgs = TargetDataRTArgs();
2678 HasMapper = false;
2679 NumberOfPtrs = 0u;
2680 }
2681 /// Return true if the current target data information has valid arrays.
2682 bool isValid() {
2683 return RTArgs.BasePointersArray && RTArgs.PointersArray &&
2684 RTArgs.SizesArray && RTArgs.MapTypesArray &&
2685 (!HasMapper || RTArgs.MappersArray) && NumberOfPtrs;
2686 }
2687 bool requiresDevicePointerInfo() { return RequiresDevicePointerInfo; }
2688 bool separateBeginEndCalls() { return SeparateBeginEndCalls; }
2689 };
2690
2691 enum class DeviceInfoTy { None, Pointer, Address };
2692 using MapValuesArrayTy = SmallVector<Value *, 4>;
2693 using MapDeviceInfoArrayTy = SmallVector<DeviceInfoTy, 4>;
2694 using MapFlagsArrayTy = SmallVector<omp::OpenMPOffloadMappingFlags, 4>;
2695 using MapNamesArrayTy = SmallVector<Constant *, 4>;
2696 using MapDimArrayTy = SmallVector<uint64_t, 4>;
2697 using MapNonContiguousArrayTy = SmallVector<MapValuesArrayTy, 4>;
2698
2699 /// This structure contains combined information generated for mappable
2700 /// clauses, including base pointers, pointers, sizes, map types, user-defined
2701 /// mappers, and non-contiguous information.
2702 struct MapInfosTy {
2703 struct StructNonContiguousInfo {
2704 bool IsNonContiguous = false;
2705 MapDimArrayTy Dims;
2706 MapNonContiguousArrayTy Offsets;
2707 MapNonContiguousArrayTy Counts;
2708 MapNonContiguousArrayTy Strides;
2709 };
2710 MapValuesArrayTy BasePointers;
2711 MapValuesArrayTy Pointers;
2712 MapDeviceInfoArrayTy DevicePointers;
2713 MapValuesArrayTy Sizes;
2714 MapFlagsArrayTy Types;
2715 MapNamesArrayTy Names;
2716 StructNonContiguousInfo NonContigInfo;
2717
2718 /// Append arrays in \a CurInfo.
2719 void append(MapInfosTy &CurInfo) {
2720 BasePointers.append(CurInfo.BasePointers.begin(),
2721 CurInfo.BasePointers.end());
2722 Pointers.append(CurInfo.Pointers.begin(), CurInfo.Pointers.end());
2723 DevicePointers.append(CurInfo.DevicePointers.begin(),
2724 CurInfo.DevicePointers.end());
2725 Sizes.append(CurInfo.Sizes.begin(), CurInfo.Sizes.end());
2726 Types.append(CurInfo.Types.begin(), CurInfo.Types.end());
2727 Names.append(CurInfo.Names.begin(), CurInfo.Names.end());
2728 NonContigInfo.Dims.append(CurInfo.NonContigInfo.Dims.begin(),
2729 CurInfo.NonContigInfo.Dims.end());
2730 NonContigInfo.Offsets.append(CurInfo.NonContigInfo.Offsets.begin(),
2731 CurInfo.NonContigInfo.Offsets.end());
2732 NonContigInfo.Counts.append(CurInfo.NonContigInfo.Counts.begin(),
2733 CurInfo.NonContigInfo.Counts.end());
2734 NonContigInfo.Strides.append(CurInfo.NonContigInfo.Strides.begin(),
2735 CurInfo.NonContigInfo.Strides.end());
2736 }
2737 };
2738 using MapInfosOrErrorTy = Expected<MapInfosTy &>;
2739
2740 /// Callback function type for functions emitting the host fallback code that
2741 /// is executed when the kernel launch fails. It takes an insertion point as
2742 /// parameter where the code should be emitted. It returns an insertion point
2743 /// that points right after the emitted code.
2744 using EmitFallbackCallbackTy =
2745 function_ref<InsertPointOrErrorTy(InsertPointTy)>;
2746
2747 // Callback function type for emitting and fetching user defined custom
2748 // mappers.
2749 using CustomMapperCallbackTy =
2750 function_ref<Expected<Function *>(unsigned int)>;
2751
2752 /// Generate a target region entry call and host fallback call.
2753 ///
2754 /// \param Loc The location at which the request originated and is fulfilled.
2755 /// \param OutlinedFnID The outlined function ID.
2756 /// \param EmitTargetCallFallbackCB Call back function to generate host
2757 /// fallback code.
2758 /// \param Args Data structure holding information about the kernel arguments.
2759 /// \param DeviceID Identifier for the device via the 'device' clause.
2760 /// \param RTLoc Source location identifier
2761 /// \param AllocaIP The insertion point to be used for alloca instructions.
2762 LLVM_ABI InsertPointOrErrorTy emitKernelLaunch(
2763 const LocationDescription &Loc, Value *OutlinedFnID,
2764 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
2765 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP);
2766
2767 /// Callback type for generating the bodies of device directives that require
2768 /// outer target tasks (e.g. in case of having `nowait` or `depend` clauses).
2769 ///
2770 /// \param DeviceID The ID of the device on which the target region will
2771 /// execute.
2772 /// \param RTLoc Source location identifier
2773 /// \Param TargetTaskAllocaIP Insertion point for the alloca block of the
2774 /// generated task.
2775 ///
2776 /// \return an error, if any were triggered during execution.
2777 using TargetTaskBodyCallbackTy =
2778 function_ref<Error(Value *DeviceID, Value *RTLoc,
2779 IRBuilderBase::InsertPoint TargetTaskAllocaIP)>;
2780
2781 /// Generate a target-task for the target construct
2782 ///
2783 /// \param TaskBodyCB Callback to generate the actual body of the target task.
2784 /// \param DeviceID Identifier for the device via the 'device' clause.
2785 /// \param RTLoc Source location identifier
2786 /// \param AllocaIP The insertion point to be used for alloca instructions.
2787 /// \param Dependencies Vector of DependData objects holding information of
2788 /// dependencies as specified by the 'depend' clause.
2789 /// \param HasNoWait True if the target construct had 'nowait' on it, false
2790 /// otherwise
2791 LLVM_ABI InsertPointOrErrorTy emitTargetTask(
2792 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
2793 OpenMPIRBuilder::InsertPointTy AllocaIP,
2794 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
2795 const TargetDataRTArgs &RTArgs, bool HasNoWait);
2796
2797 /// Emit the arguments to be passed to the runtime library based on the
2798 /// arrays of base pointers, pointers, sizes, map types, and mappers. If
2799 /// ForEndCall, emit map types to be passed for the end of the region instead
2800 /// of the beginning.
2801 LLVM_ABI void emitOffloadingArraysArgument(
2802 IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs,
2803 OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall = false);
2804
2805 /// Emit an array of struct descriptors to be assigned to the offload args.
2806 LLVM_ABI void emitNonContiguousDescriptor(InsertPointTy AllocaIP,
2807 InsertPointTy CodeGenIP,
2808 MapInfosTy &CombinedInfo,
2809 TargetDataInfo &Info);
2810
2811 /// Emit the arrays used to pass the captures and map information to the
2812 /// offloading runtime library. If there is no map or capture information,
2813 /// return nullptr by reference. Accepts a reference to a MapInfosTy object
2814 /// that contains information generated for mappable clauses,
2815 /// including base pointers, pointers, sizes, map types, user-defined mappers.
2816 LLVM_ABI Error emitOffloadingArrays(
2817 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
2818 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
2819 bool IsNonContiguous = false,
2820 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr);
2821
2822 /// Allocates memory for and populates the arrays required for offloading
2823 /// (offload_{baseptrs|ptrs|mappers|sizes|maptypes|mapnames}). Then, it
2824 /// emits their base addresses as arguments to be passed to the runtime
2825 /// library. In essence, this function is a combination of
2826 /// emitOffloadingArrays and emitOffloadingArraysArgument and should arguably
2827 /// be preferred by clients of OpenMPIRBuilder.
2828 LLVM_ABI Error emitOffloadingArraysAndArgs(
2829 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
2830 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
2831 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous = false,
2832 bool ForEndCall = false,
2833 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr);
2834
2835 /// Creates offloading entry for the provided entry ID \a ID, address \a
2836 /// Addr, size \a Size, and flags \a Flags.
2837 LLVM_ABI void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size,
2838 int32_t Flags, GlobalValue::LinkageTypes,
2839 StringRef Name = "");
2840
2841 /// The kind of errors that can occur when emitting the offload entries and
2842 /// metadata.
2843 enum EmitMetadataErrorKind {
2844 EMIT_MD_TARGET_REGION_ERROR,
2845 EMIT_MD_DECLARE_TARGET_ERROR,
2846 EMIT_MD_GLOBAL_VAR_LINK_ERROR,
2847 EMIT_MD_GLOBAL_VAR_INDIRECT_ERROR
2848 };
2849
2850 /// Callback function type
2851 using EmitMetadataErrorReportFunctionTy =
2852 std::function<void(EmitMetadataErrorKind, TargetRegionEntryInfo)>;
2853
2854 // Emit the offloading entries and metadata so that the device codegen side
2855 // can easily figure out what to emit. The produced metadata looks like
2856 // this:
2857 //
2858 // !omp_offload.info = !{!1, ...}
2859 //
2860 // We only generate metadata for functions that contain target regions.
2861 LLVM_ABI void createOffloadEntriesAndInfoMetadata(
2862 EmitMetadataErrorReportFunctionTy &ErrorReportFunction);
2863
2864public:
2865 /// Generator for __kmpc_copyprivate
2866 ///
2867 /// \param Loc The source location description.
2868 /// \param BufSize Number of elements in the buffer.
2869 /// \param CpyBuf List of pointers to data to be copied.
2870 /// \param CpyFn function to call for copying data.
2871 /// \param DidIt flag variable; 1 for 'single' thread, 0 otherwise.
2872 ///
2873 /// \return The insertion position *after* the CopyPrivate call.
2874
2875 LLVM_ABI InsertPointTy createCopyPrivate(const LocationDescription &Loc,
2876 llvm::Value *BufSize,
2877 llvm::Value *CpyBuf,
2878 llvm::Value *CpyFn,
2879 llvm::Value *DidIt);
2880
2881 /// Generator for '#omp single'
2882 ///
2883 /// \param Loc The source location description.
2884 /// \param BodyGenCB Callback that will generate the region code.
2885 /// \param FiniCB Callback to finalize variable copies.
2886 /// \param IsNowait If false, a barrier is emitted.
2887 /// \param CPVars copyprivate variables.
2888 /// \param CPFuncs copy functions to use for each copyprivate variable.
2889 ///
2890 /// \returns The insertion position *after* the single call.
2891 LLVM_ABI InsertPointOrErrorTy
2892 createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
2893 FinalizeCallbackTy FiniCB, bool IsNowait,
2894 ArrayRef<llvm::Value *> CPVars = {},
2895 ArrayRef<llvm::Function *> CPFuncs = {});
2896
2897 /// Generator for '#omp master'
2898 ///
2899 /// \param Loc The insert and source location description.
2900 /// \param BodyGenCB Callback that will generate the region code.
2901 /// \param FiniCB Callback to finalize variable copies.
2902 ///
2903 /// \returns The insertion position *after* the master.
2904 LLVM_ABI InsertPointOrErrorTy createMaster(const LocationDescription &Loc,
2905 BodyGenCallbackTy BodyGenCB,
2906 FinalizeCallbackTy FiniCB);
2907
2908 /// Generator for '#omp masked'
2909 ///
2910 /// \param Loc The insert and source location description.
2911 /// \param BodyGenCB Callback that will generate the region code.
2912 /// \param FiniCB Callback to finalize variable copies.
2913 ///
2914 /// \returns The insertion position *after* the masked.
2915 LLVM_ABI InsertPointOrErrorTy createMasked(const LocationDescription &Loc,
2916 BodyGenCallbackTy BodyGenCB,
2917 FinalizeCallbackTy FiniCB,
2918 Value *Filter);
2919
2920 /// This function performs the scan reduction of the values updated in
2921 /// the input phase. The reduction logic needs to be emitted between input
2922 /// and scan loop returned by `CreateCanonicalScanLoops`. The following
2923 /// is the code that is generated, `buffer` and `span` are expected to be
2924 /// populated before executing the generated code.
2925 /// \code{c}
2926 /// for (int k = 0; k != ceil(log2(span)); ++k) {
2927 /// i=pow(2,k)
2928 /// for (size cnt = last_iter; cnt >= i; --cnt)
2929 /// buffer[cnt] op= buffer[cnt-i];
2930 /// }
2931 /// \endcode
2932 /// \param Loc The insert and source location description.
2933 /// \param ReductionInfos Array type containing the ReductionOps.
2934 /// \param ScanRedInfo Pointer to the ScanInfo object created using
2935 /// `ScanInfoInitialize`.
2936 ///
2937 /// \returns The insertion position *after* the scan reduction.
2938 LLVM_ABI InsertPointOrErrorTy emitScanReduction(
2939 const LocationDescription &Loc,
2940 ArrayRef<llvm::OpenMPIRBuilder::ReductionInfo> ReductionInfos,
2941 ScanInfo *ScanRedInfo);
2942
2943 /// This directive split and directs the control flow to input phase
2944 /// blocks or scan phase blocks based on 1. whether input loop or scan loop
2945 /// is executed, 2. whether exclusive or inclusive scan is used.
2946 ///
2947 /// \param Loc The insert and source location description.
2948 /// \param AllocaIP The IP where the temporary buffer for scan reduction
2949 /// needs to be allocated.
2950 /// \param ScanVars Scan Variables.
2951 /// \param IsInclusive Whether it is an inclusive or exclusive scan.
2952 /// \param ScanRedInfo Pointer to the ScanInfo object created using
2953 /// `ScanInfoInitialize`.
2954 ///
2955 /// \returns The insertion position *after* the scan.
2956 LLVM_ABI InsertPointOrErrorTy createScan(const LocationDescription &Loc,
2957 InsertPointTy AllocaIP,
2958 ArrayRef<llvm::Value *> ScanVars,
2959 ArrayRef<llvm::Type *> ScanVarsType,
2960 bool IsInclusive,
2961 ScanInfo *ScanRedInfo);
2962
2963 /// Generator for '#omp critical'
2964 ///
2965 /// \param Loc The insert and source location description.
2966 /// \param BodyGenCB Callback that will generate the region body code.
2967 /// \param FiniCB Callback to finalize variable copies.
2968 /// \param CriticalName name of the lock used by the critical directive
2969 /// \param HintInst Hint Instruction for hint clause associated with critical
2970 ///
2971 /// \returns The insertion position *after* the critical.
2972 LLVM_ABI InsertPointOrErrorTy createCritical(const LocationDescription &Loc,
2973 BodyGenCallbackTy BodyGenCB,
2974 FinalizeCallbackTy FiniCB,
2975 StringRef CriticalName,
2976 Value *HintInst);
2977
2978 /// Generator for '#omp ordered depend (source | sink)'
2979 ///
2980 /// \param Loc The insert and source location description.
2981 /// \param AllocaIP The insertion point to be used for alloca instructions.
2982 /// \param NumLoops The number of loops in depend clause.
2983 /// \param StoreValues The value will be stored in vector address.
2984 /// \param Name The name of alloca instruction.
2985 /// \param IsDependSource If true, depend source; otherwise, depend sink.
2986 ///
2987 /// \return The insertion position *after* the ordered.
2988 LLVM_ABI InsertPointTy
2989 createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP,
2990 unsigned NumLoops, ArrayRef<llvm::Value *> StoreValues,
2991 const Twine &Name, bool IsDependSource);
2992
2993 /// Generator for '#omp ordered [threads | simd]'
2994 ///
2995 /// \param Loc The insert and source location description.
2996 /// \param BodyGenCB Callback that will generate the region code.
2997 /// \param FiniCB Callback to finalize variable copies.
2998 /// \param IsThreads If true, with threads clause or without clause;
2999 /// otherwise, with simd clause;
3000 ///
3001 /// \returns The insertion position *after* the ordered.
3002 LLVM_ABI InsertPointOrErrorTy createOrderedThreadsSimd(
3003 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
3004 FinalizeCallbackTy FiniCB, bool IsThreads);
3005
3006 /// Generator for '#omp sections'
3007 ///
3008 /// \param Loc The insert and source location description.
3009 /// \param AllocaIP The insertion points to be used for alloca instructions.
3010 /// \param SectionCBs Callbacks that will generate body of each section.
3011 /// \param PrivCB Callback to copy a given variable (think copy constructor).
3012 /// \param FiniCB Callback to finalize variable copies.
3013 /// \param IsCancellable Flag to indicate a cancellable parallel region.
3014 /// \param IsNowait If true, barrier - to ensure all sections are executed
3015 /// before moving forward will not be generated.
3016 /// \returns The insertion position *after* the sections.
3017 LLVM_ABI InsertPointOrErrorTy
3018 createSections(const LocationDescription &Loc, InsertPointTy AllocaIP,
3019 ArrayRef<StorableBodyGenCallbackTy> SectionCBs,
3020 PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB,
3021 bool IsCancellable, bool IsNowait);
3022
3023 /// Generator for '#omp section'
3024 ///
3025 /// \param Loc The insert and source location description.
3026 /// \param BodyGenCB Callback that will generate the region body code.
3027 /// \param FiniCB Callback to finalize variable copies.
3028 /// \returns The insertion position *after* the section.
3029 LLVM_ABI InsertPointOrErrorTy createSection(const LocationDescription &Loc,
3030 BodyGenCallbackTy BodyGenCB,
3031 FinalizeCallbackTy FiniCB);
3032
3033 /// Generator for `#omp teams`
3034 ///
3035 /// \param Loc The location where the teams construct was encountered.
3036 /// \param BodyGenCB Callback that will generate the region code.
3037 /// \param NumTeamsLower Lower bound on number of teams. If this is nullptr,
3038 /// it is as if lower bound is specified as equal to upperbound. If
3039 /// this is non-null, then upperbound must also be non-null.
3040 /// \param NumTeamsUpper Upper bound on the number of teams.
3041 /// \param ThreadLimit Bound on the number of threads that may participate in a
3042 /// contention group created by each team.
3043 /// \param IfExpr is the integer argument value of the if condition on the
3044 /// teams clause.
3045 LLVM_ABI InsertPointOrErrorTy createTeams(const LocationDescription &Loc,
3046 BodyGenCallbackTy BodyGenCB,
3047 Value *NumTeamsLower = nullptr,
3048 Value *NumTeamsUpper = nullptr,
3049 Value *ThreadLimit = nullptr,
3050 Value *IfExpr = nullptr);
3051
3052 /// Generator for `#omp distribute`
3053 ///
3054 /// \param Loc The location where the distribute construct was encountered.
3055 /// \param AllocaIP The insertion points to be used for alloca instructions.
3056 /// \param BodyGenCB Callback that will generate the region code.
3057 LLVM_ABI InsertPointOrErrorTy createDistribute(const LocationDescription &Loc,
3058 InsertPointTy AllocaIP,
3059 BodyGenCallbackTy BodyGenCB);
3060
3061 /// Generate conditional branch and relevant BasicBlocks through which private
3062 /// threads copy the 'copyin' variables from Master copy to threadprivate
3063 /// copies.
3064 ///
3065 /// \param IP insertion block for copyin conditional
3066 /// \param MasterAddr a pointer to the master variable
3067 /// \param PrivateAddr a pointer to the threadprivate variable
3068 /// \param IntPtrTy Pointer size type
3069 /// \param BranchtoEnd Create a branch between the copyin.not.master blocks
3070 /// and copy.in.end block
3071 ///
3072 /// \returns The insertion point where copying operation to be emitted.
3073 LLVM_ABI InsertPointTy createCopyinClauseBlocks(InsertPointTy IP,
3074 Value *MasterAddr,
3075 Value *PrivateAddr,
3076 llvm::IntegerType *IntPtrTy,
3077 bool BranchtoEnd = true);
3078
3079 /// Create a runtime call for kmpc_Alloc
3080 ///
3081 /// \param Loc The insert and source location description.
3082 /// \param Size Size of allocated memory space
3083 /// \param Allocator Allocator information instruction
3084 /// \param Name Name of call Instruction for OMP_alloc
3085 ///
3086 /// \returns CallInst to the OMP_Alloc call
3087 LLVM_ABI CallInst *createOMPAlloc(const LocationDescription &Loc, Value *Size,
3088 Value *Allocator, std::string Name = "");
3089
3090 /// Create a runtime call for kmpc_free
3091 ///
3092 /// \param Loc The insert and source location description.
3093 /// \param Addr Address of memory space to be freed
3094 /// \param Allocator Allocator information instruction
3095 /// \param Name Name of call Instruction for OMP_Free
3096 ///
3097 /// \returns CallInst to the OMP_Free call
3098 LLVM_ABI CallInst *createOMPFree(const LocationDescription &Loc, Value *Addr,
3099 Value *Allocator, std::string Name = "");
3100
3101 /// Create a runtime call for kmpc_threadprivate_cached
3102 ///
3103 /// \param Loc The insert and source location description.
3104 /// \param Pointer pointer to data to be cached
3105 /// \param Size size of data to be cached
3106 /// \param Name Name of call Instruction for callinst
3107 ///
3108 /// \returns CallInst to the thread private cache call.
3109 LLVM_ABI CallInst *
3110 createCachedThreadPrivate(const LocationDescription &Loc,
3111 llvm::Value *Pointer, llvm::ConstantInt *Size,
3112 const llvm::Twine &Name = Twine(""));
3113
3114 /// Create a runtime call for __tgt_interop_init
3115 ///
3116 /// \param Loc The insert and source location description.
3117 /// \param InteropVar variable to be allocated
3118 /// \param InteropType type of interop operation
3119 /// \param Device device to which offloading will occur
3120 /// \param NumDependences number of dependence variables
3121 /// \param DependenceAddress pointer to dependence variables
3122 /// \param HaveNowaitClause does nowait clause exist
3123 ///
3124 /// \returns CallInst to the __tgt_interop_init call
3125 LLVM_ABI CallInst *createOMPInteropInit(const LocationDescription &Loc,
3126 Value *InteropVar,
3127 omp::OMPInteropType InteropType,
3128 Value *Device, Value *NumDependences,
3129 Value *DependenceAddress,
3130 bool HaveNowaitClause);
3131
3132 /// Create a runtime call for __tgt_interop_destroy
3133 ///
3134 /// \param Loc The insert and source location description.
3135 /// \param InteropVar variable to be allocated
3136 /// \param Device device to which offloading will occur
3137 /// \param NumDependences number of dependence variables
3138 /// \param DependenceAddress pointer to dependence variables
3139 /// \param HaveNowaitClause does nowait clause exist
3140 ///
3141 /// \returns CallInst to the __tgt_interop_destroy call
3142 LLVM_ABI CallInst *createOMPInteropDestroy(const LocationDescription &Loc,
3143 Value *InteropVar, Value *Device,
3144 Value *NumDependences,
3145 Value *DependenceAddress,
3146 bool HaveNowaitClause);
3147
3148 /// Create a runtime call for __tgt_interop_use
3149 ///
3150 /// \param Loc The insert and source location description.
3151 /// \param InteropVar variable to be allocated
3152 /// \param Device device to which offloading will occur
3153 /// \param NumDependences number of dependence variables
3154 /// \param DependenceAddress pointer to dependence variables
3155 /// \param HaveNowaitClause does nowait clause exist
3156 ///
3157 /// \returns CallInst to the __tgt_interop_use call
3158 LLVM_ABI CallInst *createOMPInteropUse(const LocationDescription &Loc,
3159 Value *InteropVar, Value *Device,
3160 Value *NumDependences,
3161 Value *DependenceAddress,
3162 bool HaveNowaitClause);
3163
3164 /// The `omp target` interface
3165 ///
3166 /// For more information about the usage of this interface,
3167 /// \see openmp/libomptarget/deviceRTLs/common/include/target.h
3168 ///
3169 ///{
3170
3171 /// Create a runtime call for kmpc_target_init
3172 ///
3173 /// \param Loc The insert and source location description.
3174 /// \param Attrs Structure containing the default attributes, including
3175 /// numbers of threads and teams to launch the kernel with.
3176 LLVM_ABI InsertPointTy createTargetInit(
3177 const LocationDescription &Loc,
3178 const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs);
3179
3180 /// Create a runtime call for kmpc_target_deinit
3181 ///
3182 /// \param Loc The insert and source location description.
3183 /// \param TeamsReductionDataSize The maximal size of all the reduction data
3184 /// for teams reduction.
3185 /// \param TeamsReductionBufferLength The number of elements (each of up to
3186 /// \p TeamsReductionDataSize size), in the teams reduction buffer.
3187 LLVM_ABI void createTargetDeinit(const LocationDescription &Loc,
3188 int32_t TeamsReductionDataSize = 0,
3189 int32_t TeamsReductionBufferLength = 1024);
3190
3191 ///}
3192
3193 /// Helpers to read/write kernel annotations from the IR.
3194 ///
3195 ///{
3196
3197 /// Read/write a bounds on threads for \p Kernel. Read will return 0 if none
3198 /// is set.
3199 LLVM_ABI static std::pair<int32_t, int32_t>
3200 readThreadBoundsForKernel(const Triple &T, Function &Kernel);
3201 LLVM_ABI static void writeThreadBoundsForKernel(const Triple &T,
3202 Function &Kernel, int32_t LB,
3203 int32_t UB);
3204
3205 /// Read/write a bounds on teams for \p Kernel. Read will return 0 if none
3206 /// is set.
3207 LLVM_ABI static std::pair<int32_t, int32_t>
3208 readTeamBoundsForKernel(const Triple &T, Function &Kernel);
3209 LLVM_ABI static void writeTeamsForKernel(const Triple &T, Function &Kernel,
3210 int32_t LB, int32_t UB);
3211 ///}
3212
3213private:
3214 // Sets the function attributes expected for the outlined function
3215 void setOutlinedTargetRegionFunctionAttributes(Function *OutlinedFn);
3216
3217 // Creates the function ID/Address for the given outlined function.
3218 // In the case of an embedded device function the address of the function is
3219 // used, in the case of a non-offload function a constant is created.
3220 Constant *createOutlinedFunctionID(Function *OutlinedFn,
3221 StringRef EntryFnIDName);
3222
3223 // Creates the region entry address for the outlined function
3224 Constant *createTargetRegionEntryAddr(Function *OutlinedFunction,
3225 StringRef EntryFnName);
3226
3227public:
3228 /// Functions used to generate a function with the given name.
3229 using FunctionGenCallback =
3230 std::function<Expected<Function *>(StringRef FunctionName)>;
3231
3232 /// Create a unique name for the entry function using the source location
3233 /// information of the current target region. The name will be something like:
3234 ///
3235 /// __omp_offloading_DD_FFFF_PP_lBB[_CC]
3236 ///
3237 /// where DD_FFFF is an ID unique to the file (device and file IDs), PP is the
3238 /// mangled name of the function that encloses the target region and BB is the
3239 /// line number of the target region. CC is a count added when more than one
3240 /// region is located at the same location.
3241 ///
3242 /// If this target outline function is not an offload entry, we don't need to
3243 /// register it. This may happen if it is guarded by an if clause that is
3244 /// false at compile time, or no target archs have been specified.
3245 ///
3246 /// The created target region ID is used by the runtime library to identify
3247 /// the current target region, so it only has to be unique and not
3248 /// necessarily point to anything. It could be the pointer to the outlined
3249 /// function that implements the target region, but we aren't using that so
3250 /// that the compiler doesn't need to keep that, and could therefore inline
3251 /// the host function if proven worthwhile during optimization. On the other
3252 /// hand, if emitting code for the device, the ID has to be the function
3253 /// address so that it can be retrieved from the offloading entry and launched
3254 /// by the runtime library. We also mark the outlined function to have
3255 /// external linkage in case we are emitting code for the device, because
3256 /// these functions will be entry points to the device.
3257 ///
3258 /// \param InfoManager The info manager keeping track of the offload entries
3259 /// \param EntryInfo The entry information about the function
3260 /// \param GenerateFunctionCallback The callback function to generate the code
3261 /// \param OutlinedFunction Pointer to the outlined function
3262 /// \param EntryFnIDName Name of the ID to be created
3263 LLVM_ABI Error emitTargetRegionFunction(
3264 TargetRegionEntryInfo &EntryInfo,
3265 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
3266 Function *&OutlinedFn, Constant *&OutlinedFnID);
3267
3268 /// Registers the given function and sets up the attributes of the function
3269 /// Returns the FunctionID.
3270 ///
3271 /// \param InfoManager The info manager keeping track of the offload entries
3272 /// \param EntryInfo The entry information about the function
3273 /// \param OutlinedFunction Pointer to the outlined function
3274 /// \param EntryFnName Name of the outlined function
3275 /// \param EntryFnIDName Name of the ID to be created
3277 registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo,
3278 Function *OutlinedFunction,
3279 StringRef EntryFnName, StringRef EntryFnIDName);
3280
3281 /// Type of BodyGen to use for region codegen
3282 ///
3283 /// Priv: If device pointer privatization is required, emit the body of the
3284 /// region here. It will have to be duplicated: with and without
3285 /// privatization.
3286 /// DupNoPriv: If we need device pointer privatization, we need
3287 /// to emit the body of the region with no privatization in the 'else' branch
3288 /// of the conditional.
3289 /// NoPriv: If we don't require privatization of device
3290 /// pointers, we emit the body in between the runtime calls. This avoids
3291 /// duplicating the body code.
3292 enum BodyGenTy { Priv, DupNoPriv, NoPriv };
3293
3294 /// Callback type for creating the map infos for the kernel parameters.
3295 /// \param CodeGenIP is the insertion point where code should be generated,
3296 /// if any.
3297 using GenMapInfoCallbackTy =
3298 function_ref<MapInfosTy &(InsertPointTy CodeGenIP)>;
3299
3300private:
3301 /// Emit the array initialization or deletion portion for user-defined mapper
3302 /// code generation. First, it evaluates whether an array section is mapped
3303 /// and whether the \a MapType instructs to delete this section. If \a IsInit
3304 /// is true, and \a MapType indicates to not delete this array, array
3305 /// initialization code is generated. If \a IsInit is false, and \a MapType
3306 /// indicates to delete this array, array deletion code is generated.
3307 void emitUDMapperArrayInitOrDel(Function *MapperFn, llvm::Value *MapperHandle,
3308 llvm::Value *Base, llvm::Value *Begin,
3309 llvm::Value *Size, llvm::Value *MapType,
3310 llvm::Value *MapName, TypeSize ElementSize,
3311 llvm::BasicBlock *ExitBB, bool IsInit);
3312
3313public:
3314 /// Emit the user-defined mapper function. The code generation follows the
3315 /// pattern in the example below.
3316 /// \code
3317 /// void .omp_mapper.<type_name>.<mapper_id>.(void *rt_mapper_handle,
3318 /// void *base, void *begin,
3319 /// int64_t size, int64_t type,
3320 /// void *name = nullptr) {
3321 /// // Allocate space for an array section first or add a base/begin for
3322 /// // pointer dereference.
3323 /// if ((size > 1 || (base != begin && maptype.IsPtrAndObj)) &&
3324 /// !maptype.IsDelete)
3325 /// __tgt_push_mapper_component(rt_mapper_handle, base, begin,
3326 /// size*sizeof(Ty), clearToFromMember(type));
3327 /// // Map members.
3328 /// for (unsigned i = 0; i < size; i++) {
3329 /// // For each component specified by this mapper:
3330 /// for (auto c : begin[i]->all_components) {
3331 /// if (c.hasMapper())
3332 /// (*c.Mapper())(rt_mapper_handle, c.arg_base, c.arg_begin,
3333 /// c.arg_size,
3334 /// c.arg_type, c.arg_name);
3335 /// else
3336 /// __tgt_push_mapper_component(rt_mapper_handle, c.arg_base,
3337 /// c.arg_begin, c.arg_size, c.arg_type,
3338 /// c.arg_name);
3339 /// }
3340 /// }
3341 /// // Delete the array section.
3342 /// if (size > 1 && maptype.IsDelete)
3343 /// __tgt_push_mapper_component(rt_mapper_handle, base, begin,
3344 /// size*sizeof(Ty), clearToFromMember(type));
3345 /// }
3346 /// \endcode
3347 ///
3348 /// \param PrivAndGenMapInfoCB Callback that privatizes code and populates the
3349 /// MapInfos and returns.
3350 /// \param ElemTy DeclareMapper element type.
3351 /// \param FuncName Optional param to specify mapper function name.
3352 /// \param CustomMapperCB Optional callback to generate code related to
3353 /// custom mappers.
3354 LLVM_ABI Expected<Function *> emitUserDefinedMapper(
3355 function_ref<MapInfosOrErrorTy(
3356 InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)>
3357 PrivAndGenMapInfoCB,
3358 llvm::Type *ElemTy, StringRef FuncName,
3359 CustomMapperCallbackTy CustomMapperCB);
3360
3361 /// Generator for '#omp target data'
3362 ///
3363 /// \param Loc The location where the target data construct was encountered.
3364 /// \param AllocaIP The insertion points to be used for alloca instructions.
3365 /// \param CodeGenIP The insertion point at which the target directive code
3366 /// should be placed.
3367 /// \param IsBegin If true then emits begin mapper call otherwise emits
3368 /// end mapper call.
3369 /// \param DeviceID Stores the DeviceID from the device clause.
3370 /// \param IfCond Value which corresponds to the if clause condition.
3371 /// \param Info Stores all information related to the Target Data directive.
3372 /// \param GenMapInfoCB Callback that populates the MapInfos and returns.
3373 /// \param CustomMapperCB Callback to generate code related to
3374 /// custom mappers.
3375 /// \param BodyGenCB Optional Callback to generate the region code.
3376 /// \param DeviceAddrCB Optional callback to generate code related to
3377 /// use_device_ptr and use_device_addr.
3378 LLVM_ABI InsertPointOrErrorTy createTargetData(
3379 const LocationDescription &Loc, InsertPointTy AllocaIP,
3380 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
3381 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
3382 CustomMapperCallbackTy CustomMapperCB,
3383 omp::RuntimeFunction *MapperFunc = nullptr,
3384 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
3385 BodyGenTy BodyGenType)>
3386 BodyGenCB = nullptr,
3387 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
3388 Value *SrcLocInfo = nullptr);
3389
3390 using TargetBodyGenCallbackTy = function_ref<InsertPointOrErrorTy(
3391 InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
3392
3393 using TargetGenArgAccessorsCallbackTy = function_ref<InsertPointOrErrorTy(
3394 Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP,
3395 InsertPointTy CodeGenIP)>;
3396
3397 /// Generator for '#omp target'
3398 ///
3399 /// \param Loc where the target data construct was encountered.
3400 /// \param IsOffloadEntry whether it is an offload entry.
3401 /// \param CodeGenIP The insertion point where the call to the outlined
3402 /// function should be emitted.
3403 /// \param Info Stores all information related to the Target directive.
3404 /// \param EntryInfo The entry information about the function.
3405 /// \param DefaultAttrs Structure containing the default attributes, including
3406 /// numbers of threads and teams to launch the kernel with.
3407 /// \param RuntimeAttrs Structure containing the runtime numbers of threads
3408 /// and teams to launch the kernel with.
3409 /// \param IfCond value of the `if` clause.
3410 /// \param Inputs The input values to the region that will be passed
3411 /// as arguments to the outlined function.
3412 /// \param BodyGenCB Callback that will generate the region code.
3413 /// \param ArgAccessorFuncCB Callback that will generate accessors
3414 /// instructions for passed in target arguments where necessary
3415 /// \param CustomMapperCB Callback to generate code related to
3416 /// custom mappers.
3417 /// \param Dependencies A vector of DependData objects that carry
3418 /// dependency information as passed in the depend clause
3419 /// \param HasNowait Whether the target construct has a `nowait` clause or
3420 /// not.
3421 /// \param DynCGroupMem The size of the dynamic groupprivate memory for each
3422 /// cgroup.
3423 /// \param DynCGroupMemFallback The fallback mechanism to execute if the requested
3424 /// cgroup memory cannot be provided.
3425 LLVM_ABI InsertPointOrErrorTy createTarget(
3426 const LocationDescription &Loc, bool IsOffloadEntry,
3427 OpenMPIRBuilder::InsertPointTy AllocaIP,
3428 OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetDataInfo &Info,
3429 TargetRegionEntryInfo &EntryInfo,
3430 const TargetKernelDefaultAttrs &DefaultAttrs,
3431 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
3432 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
3433 TargetBodyGenCallbackTy BodyGenCB,
3434 TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
3435 CustomMapperCallbackTy CustomMapperCB,
3436 const SmallVector<DependData> &Dependencies, bool HasNowait = false,
3437 Value *DynCGroupMem = nullptr,
3438 omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback =
3439 omp::OMPDynGroupprivateFallbackType::Abort);
3440
  /// Returns the __kmpc_for_static_init_* runtime function for the specified
  /// size \a IVSize and sign \a IVSigned. Will create a distribute call
  /// __kmpc_distribute_static_init* if \a IsGPUDistribute is set.
  LLVM_ABI FunctionCallee createForStaticInitFunction(unsigned IVSize,
                                                      bool IVSigned,
                                                      bool IsGPUDistribute);

  /// Returns the __kmpc_dispatch_init_* runtime function for the specified
  /// size \a IVSize and sign \a IVSigned.
  LLVM_ABI FunctionCallee createDispatchInitFunction(unsigned IVSize,
                                                     bool IVSigned);

  /// Returns the __kmpc_dispatch_next_* runtime function for the specified
  /// size \a IVSize and sign \a IVSigned.
  LLVM_ABI FunctionCallee createDispatchNextFunction(unsigned IVSize,
                                                     bool IVSigned);

  /// Returns the __kmpc_dispatch_fini_* runtime function for the specified
  /// size \a IVSize and sign \a IVSigned.
  LLVM_ABI FunctionCallee createDispatchFiniFunction(unsigned IVSize,
                                                     bool IVSigned);

  /// Returns the __kmpc_dispatch_deinit runtime function.
  LLVM_ABI FunctionCallee createDispatchDeinitFunction();
3465
  /// Declarations for LLVM-IR types (simple, array, function and structure)
  /// are generated below. Their names are defined and used in OpenMPKinds.def.
  /// Here we provide the declarations; the initializeTypes function will
  /// provide the values.
  ///
  ///{
#define OMP_TYPE(VarName, InitValue) Type *VarName = nullptr;
#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize)                             \
  ArrayType *VarName##Ty = nullptr;                                            \
  PointerType *VarName##PtrTy = nullptr;
#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...)                  \
  FunctionType *VarName = nullptr;                                             \
  PointerType *VarName##Ptr = nullptr;
#define OMP_STRUCT_TYPE(VarName, StrName, ...)                                 \
  StructType *VarName = nullptr;                                               \
  PointerType *VarName##Ptr = nullptr;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  ///}

private:
  /// Create all simple and struct types exposed by the runtime and remember
  /// the llvm::PointerTypes of them for easy access later.
  void initializeTypes(Module &M);
3490
  /// Common interface for generating entry calls for OMP Directives.
  /// If the directive has a region/body, it will set the insertion
  /// point to the body.
  ///
  /// \param OMPD Directive to generate entry blocks for
  /// \param EntryCall Call to the entry OMP Runtime Function
  /// \param ExitBB Block where the region ends.
  /// \param Conditional indicate if the entry call result will be used
  ///        to evaluate a conditional of whether a thread will execute
  ///        body code or not.
  ///
  /// \return The insertion position in exit block
  InsertPointTy emitCommonDirectiveEntry(omp::Directive OMPD, Value *EntryCall,
                                         BasicBlock *ExitBB,
                                         bool Conditional = false);

  /// Common interface to finalize the region
  ///
  /// \param OMPD Directive to generate exiting code for
  /// \param FinIP Insertion point for emitting Finalization code and exit call.
  ///        This block must not contain any non-finalization code.
  /// \param ExitCall Call to the ending OMP Runtime Function
  /// \param HasFinalize indicate if the directive will require finalization
  ///        and has a finalization callback in the stack that
  ///        should be called.
  ///
  /// \return The insertion position in exit block
  InsertPointOrErrorTy emitCommonDirectiveExit(omp::Directive OMPD,
                                               InsertPointTy FinIP,
                                               Instruction *ExitCall,
                                               bool HasFinalize = true);

  /// Common Interface to generate OMP inlined regions
  ///
  /// \param OMPD Directive to generate inlined region for
  /// \param EntryCall Call to the entry OMP Runtime Function
  /// \param ExitCall Call to the ending OMP Runtime Function
  /// \param BodyGenCB Body code generation callback.
  /// \param FiniCB Finalization Callback. Will be called when finalizing region
  /// \param Conditional indicate if the entry call result will be used
  ///        to evaluate a conditional of whether a thread will execute
  ///        body code or not.
  /// \param HasFinalize indicate if the directive will require finalization
  ///        and has a finalization callback in the stack that
  ///        should be called.
  /// \param IsCancellable if HasFinalize is set to true, indicate if the
  ///        directive should be cancellable.
  /// \return The insertion point after the region
  InsertPointOrErrorTy
  EmitOMPInlinedRegion(omp::Directive OMPD, Instruction *EntryCall,
                       Instruction *ExitCall, BodyGenCallbackTy BodyGenCB,
                       FinalizeCallbackTy FiniCB, bool Conditional = false,
                       bool HasFinalize = true, bool IsCancellable = false);

  /// Get the platform-specific name separator.
  /// \param Parts different parts of the final name that need separation
  /// \param FirstSeparator First separator used between the initial two
  ///        parts of the name.
  /// \param Separator separator used between all of the rest consecutive
  ///        parts of the name
  static std::string getNameWithSeparators(ArrayRef<StringRef> Parts,
                                           StringRef FirstSeparator,
                                           StringRef Separator);
3554
  /// Returns the corresponding lock object for the specified critical region
  /// name. If the lock object does not exist it is created, otherwise the
  /// reference to the existing copy is returned.
  /// \param CriticalName Name of the critical region.
  ///
  Value *getOMPCriticalRegionLock(StringRef CriticalName);

  /// Callback type for Atomic Expression update
  /// ex:
  /// \code{.cpp}
  /// unsigned x = 0;
  /// #pragma omp atomic update
  /// x = Expr(x_old); // Expr() is any legal operation
  /// \endcode
  ///
  /// \param XOld the value of the atomic memory address to use for update
  /// \param IRB reference to the IRBuilder to use
  ///
  /// \returns Value to update X to.
  using AtomicUpdateCallbackTy =
      const function_ref<Expected<Value *>(Value *XOld, IRBuilder<> &IRB)>;
3576
private:
  /// Kinds of OpenMP atomic operations handled by the emit helpers below.
  enum AtomicKind { Read, Write, Update, Capture, Compare };

  /// Determine whether to emit flush or not
  ///
  /// \param Loc The insert and source location description.
  /// \param AO The required atomic ordering
  /// \param AK The OpenMP atomic operation kind used.
  ///
  /// \returns whether a flush was emitted or not
  bool checkAndEmitFlushAfterAtomic(const LocationDescription &Loc,
                                    AtomicOrdering AO, AtomicKind AK);

  /// Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X
  /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
  /// Only Scalar data types.
  ///
  /// \param AllocaIP The insertion point to be used for alloca
  ///                 instructions.
  /// \param X The target atomic pointer to be updated
  /// \param XElemTy The element type of the atomic pointer.
  /// \param Expr The value to update X with.
  /// \param AO Atomic ordering of the generated atomic
  ///           instructions.
  /// \param RMWOp The binary operation used for update. If the
  ///              operation is not supported by atomicRMW,
  ///              or belongs to {FADD, FSUB, BAD_BINOP},
  ///              then a `cmpExch` based atomic will be generated.
  /// \param UpdateOp Code generator for complex expressions that cannot be
  ///                 expressed through atomicrmw instruction.
  /// \param VolatileX true if \a X is volatile.
  /// \param IsXBinopExpr true if \a X is Left H.S. in Right H.S. part of the
  ///                     update expression, false otherwise.
  ///                     (e.g. true for X = X BinOp Expr)
  ///
  /// \returns A pair of the old value of X before the update, and the value
  ///          used for the update.
  Expected<std::pair<Value *, Value *>>
  emitAtomicUpdate(InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
                   AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
                   AtomicUpdateCallbackTy &UpdateOp, bool VolatileX,
                   bool IsXBinopExpr, bool IsIgnoreDenormalMode,
                   bool IsFineGrainedMemory, bool IsRemoteMemory);

  /// Emit the binary op. described by \p RMWOp, using \p Src1 and \p Src2.
  ///
  /// \return The instruction
  Value *emitRMWOpAsInstruction(Value *Src1, Value *Src2,
                                AtomicRMWInst::BinOp RMWOp);

  // NOTE(review): presumably tracks whether the builder's finalization has
  // already run -- confirm where it is set/read in OMPIRBuilder.cpp.
  bool IsFinalized;

public:
  /// a struct to pack relevant information while generating atomic Ops
  struct AtomicOpValue {
    Value *Var = nullptr;
    Type *ElemTy = nullptr;
    bool IsSigned = false;
    bool IsVolatile = false;
  };
3637
  /// Emit atomic Read for : V = X --- Only Scalar data types.
  ///
  /// \param Loc The insert and source location description.
  /// \param X The target pointer to be atomically read
  /// \param V Memory address where to store atomically read
  ///          value
  /// \param AO Atomic ordering of the generated atomic
  ///           instructions.
  /// \param AllocaIP Insert point for allocas
  ///
  /// \return Insertion point after generated atomic read IR.
  LLVM_ABI InsertPointTy createAtomicRead(const LocationDescription &Loc,
                                          AtomicOpValue &X, AtomicOpValue &V,
                                          AtomicOrdering AO,
                                          InsertPointTy AllocaIP);

  /// Emit atomic write for : X = Expr --- Only Scalar data types.
  ///
  /// \param Loc The insert and source location description.
  /// \param X The target pointer to be atomically written to
  /// \param Expr The value to store.
  /// \param AO Atomic ordering of the generated atomic
  ///           instructions.
  /// \param AllocaIP Insert point for allocas
  ///
  /// \return Insertion point after generated atomic Write IR.
  LLVM_ABI InsertPointTy createAtomicWrite(const LocationDescription &Loc,
                                           AtomicOpValue &X, Value *Expr,
                                           AtomicOrdering AO,
                                           InsertPointTy AllocaIP);

  /// Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X
  /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
  /// Only Scalar data types.
  ///
  /// \param Loc The insert and source location description.
  /// \param AllocaIP The insertion point to be used for alloca instructions.
  /// \param X The target atomic pointer to be updated
  /// \param Expr The value to update X with.
  /// \param AO Atomic ordering of the generated atomic instructions.
  /// \param RMWOp The binary operation used for update. If the operation
  ///              is not supported by atomicRMW, or belongs to
  ///              {FADD, FSUB, BAD_BINOP}, then a `cmpExch` based
  ///              atomic will be generated.
  /// \param UpdateOp Code generator for complex expressions that cannot be
  ///                 expressed through atomicrmw instruction.
  /// \param IsXBinopExpr true if \a X is Left H.S. in Right H.S. part of the
  ///                     update expression, false otherwise.
  ///                     (e.g. true for X = X BinOp Expr)
  ///
  /// \return Insertion point after generated atomic update IR.
  LLVM_ABI InsertPointOrErrorTy createAtomicUpdate(
      const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
      Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
      AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
      bool IsIgnoreDenormalMode = false, bool IsFineGrainedMemory = false,
      bool IsRemoteMemory = false);
3695
  /// Emit atomic update for constructs: --- Only Scalar data types
  /// V = X; X = X BinOp Expr ,
  /// X = X BinOp Expr; V = X,
  /// V = X; X = Expr BinOp X,
  /// X = Expr BinOp X; V = X,
  /// V = X; X = UpdateOp(X),
  /// X = UpdateOp(X); V = X,
  ///
  /// \param Loc The insert and source location description.
  /// \param AllocaIP The insertion point to be used for alloca instructions.
  /// \param X The target atomic pointer to be updated
  /// \param V Memory address where to store captured value
  /// \param Expr The value to update X with.
  /// \param AO Atomic ordering of the generated atomic instructions
  /// \param RMWOp The binary operation used for update. If the
  ///              operation is not supported by atomicRMW, or belongs to
  ///              {FADD, FSUB, BAD_BINOP}, then a cmpExch based
  ///              atomic will be generated.
  /// \param UpdateOp Code generator for complex expressions that cannot be
  ///                 expressed through atomicrmw instruction.
  /// \param UpdateExpr true if X is an in place update of the form
  ///                   X = X BinOp Expr or X = Expr BinOp X
  /// \param IsXBinopExpr true if X is Left H.S. in Right H.S. part of the
  ///                     update expression, false otherwise.
  ///                     (e.g. true for X = X BinOp Expr)
  /// \param IsPostfixUpdate true if original value of 'x' must be stored in
  ///                        'v', not an updated one.
  ///
  /// \return Insertion point after generated atomic capture IR.
  LLVM_ABI InsertPointOrErrorTy createAtomicCapture(
      const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
      AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
      AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
      bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
      bool IsIgnoreDenormalMode = false, bool IsFineGrainedMemory = false,
      bool IsRemoteMemory = false);
3732
  /// Emit atomic compare for constructs: --- Only scalar data types
  /// cond-expr-stmt:
  /// x = x ordop expr ? expr : x;
  /// x = expr ordop x ? expr : x;
  /// x = x == e ? d : x;
  /// x = e == x ? d : x; (this one is not in the spec)
  /// cond-update-stmt:
  /// if (x ordop expr) { x = expr; }
  /// if (expr ordop x) { x = expr; }
  /// if (x == e) { x = d; }
  /// if (e == x) { x = d; } (this one is not in the spec)
  /// conditional-update-capture-atomic:
  /// v = x; cond-update-stmt; (IsPostfixUpdate=true, IsFailOnly=false)
  /// cond-update-stmt; v = x; (IsPostfixUpdate=false, IsFailOnly=false)
  /// if (x == e) { x = d; } else { v = x; } (IsPostfixUpdate=false,
  /// IsFailOnly=true)
  /// r = x == e; if (r) { x = d; } (IsPostfixUpdate=false, IsFailOnly=false)
  /// r = x == e; if (r) { x = d; } else { v = x; } (IsPostfixUpdate=false,
  /// IsFailOnly=true)
  ///
  /// \param Loc The insert and source location description.
  /// \param X The target atomic pointer to be updated.
  /// \param V Memory address where to store captured value (for
  ///          compare capture only).
  /// \param R Memory address where to store comparison result
  ///          (for compare capture with '==' only).
  /// \param E The expected value ('e') for forms that use an
  ///          equality comparison or an expression ('expr') for
  ///          forms that use 'ordop' (logically an atomic maximum or
  ///          minimum).
  /// \param D The desired value for forms that use an equality
  ///          comparison. For forms that use 'ordop', it should be
  ///          \p nullptr.
  /// \param AO Atomic ordering of the generated atomic instructions.
  /// \param Op Atomic compare operation. It can only be ==, <, or >.
  /// \param IsXBinopExpr True if the conditional statement is in the form where
  ///                     x is on LHS. It only matters for < or >.
  /// \param IsPostfixUpdate True if original value of 'x' must be stored in
  ///                        'v', not an updated one (for compare capture
  ///                        only).
  /// \param IsFailOnly True if the original value of 'x' is stored to 'v'
  ///                   only when the comparison fails. This is only valid for
  ///                   the case the comparison is '=='.
  ///
  /// \return Insertion point after generated atomic capture IR.
  LLVM_ABI InsertPointTy
  createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X,
                      AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D,
                      AtomicOrdering AO, omp::OMPAtomicCompareOp Op,
                      bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly);
  LLVM_ABI InsertPointTy createAtomicCompare(
      const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
      AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
      omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
      bool IsFailOnly, AtomicOrdering Failure);
3788
  /// Create the control flow structure of a canonical OpenMP loop.
  ///
  /// The emitted loop will be disconnected, i.e. no edge to the loop's
  /// preheader and no terminator in the AfterBB. The OpenMPIRBuilder's
  /// IRBuilder location is not preserved.
  ///
  /// \param DL DebugLoc used for the instructions in the skeleton.
  /// \param TripCount Value to be used for the trip count.
  /// \param F Function in which to insert the BasicBlocks.
  /// \param PreInsertBefore Where to insert BBs that execute before the body,
  ///        typically the body itself.
  /// \param PostInsertBefore Where to insert BBs that execute after the body.
  /// \param Name Base name used to derive BB
  ///        and instruction names.
  ///
  /// \returns The CanonicalLoopInfo that represents the emitted loop.
  LLVM_ABI CanonicalLoopInfo *createLoopSkeleton(DebugLoc DL, Value *TripCount,
                                                 Function *F,
                                                 BasicBlock *PreInsertBefore,
                                                 BasicBlock *PostInsertBefore,
                                                 const Twine &Name = {});

  /// OMP Offload Info Metadata name string
  const std::string ompOffloadInfoName = "omp_offload.info";

  /// Loads all the offload entries information from the host IR
  /// metadata. This function is only meant to be used with device code
  /// generation.
  ///
  /// \param M Module to load Metadata info from. The module passed may be
  /// loaded from a bitcode file, i.e., different from OpenMPIRBuilder::M.
  LLVM_ABI void loadOffloadInfoMetadata(Module &M);

  /// Loads all the offload entries information from the host IR
  /// metadata read from the file passed in as the HostFilePath argument. This
  /// function is only meant to be used with device code generation.
  ///
  /// \param HostFilePath The path to the host IR file,
  /// used to load in offload metadata for the device, allowing host and device
  /// to maintain the same metadata mapping.
  LLVM_ABI void loadOffloadInfoMetadata(vfs::FileSystem &VFS,
                                        StringRef HostFilePath);

  /// Gets (if a variable with the given name already exists) or creates an
  /// internal global variable with the specified Name. The created variable
  /// has linkage CommonLinkage by default and is initialized by null value.
  /// \param Ty Type of the global variable. If it already exists, the type
  /// must be the same.
  /// \param Name Name of the variable.
  LLVM_ABI GlobalVariable *
  getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
                              std::optional<unsigned> AddressSpace = {});
3840};
3841
3842/// Class to represented the control flow structure of an OpenMP canonical loop.
3843///
3844/// The control-flow structure is standardized for easy consumption by
3845/// directives associated with loops. For instance, the worksharing-loop
3846/// construct may change this control flow such that each loop iteration is
3847/// executed on only one thread. The constraints of a canonical loop in brief
3848/// are:
3849///
3850/// * The number of loop iterations must have been computed before entering the
3851/// loop.
3852///
3853/// * Has an (unsigned) logical induction variable that starts at zero and
3854/// increments by one.
3855///
3856/// * The loop's CFG itself has no side-effects. The OpenMP specification
3857/// itself allows side-effects, but the order in which they happen, including
3858/// how often or whether at all, is unspecified. We expect that the frontend
3859/// will emit those side-effect instructions somewhere (e.g. before the loop)
3860/// such that the CanonicalLoopInfo itself can be side-effect free.
3861///
3862/// Keep in mind that CanonicalLoopInfo is meant to only describe a repeated
/// execution of a loop body that satisfies these constraints. It does NOT
3864/// represent arbitrary SESE regions that happen to contain a loop. Do not use
3865/// CanonicalLoopInfo for such purposes.
3866///
3867/// The control flow can be described as follows:
3868///
3869/// Preheader
3870/// |
3871/// /-> Header
3872/// | |
3873/// | Cond---\
3874/// | | |
3875/// | Body |
3876/// | | | |
3877/// | <...> |
3878/// | | | |
3879/// \--Latch |
3880/// |
3881/// Exit
3882/// |
3883/// After
3884///
3885/// The loop is thought to start at PreheaderIP (at the Preheader's terminator,
3886/// including) and end at AfterIP (at the After's first instruction, excluding).
3887/// That is, instructions in the Preheader and After blocks (except the
3888/// Preheader's terminator) are out of CanonicalLoopInfo's control and may have
3889/// side-effects. Typically, the Preheader is used to compute the loop's trip
3890/// count. The instructions from BodyIP (at the Body block's first instruction,
3891/// excluding) until the Latch are also considered outside CanonicalLoopInfo's
3892/// control and thus can have side-effects. The body block is the single entry
3893/// point into the loop body, which may contain arbitrary control flow as long
3894/// as all control paths eventually branch to the Latch block.
3895///
3896/// TODO: Consider adding another standardized BasicBlock between Body CFG and
3897/// Latch to guarantee that there is only a single edge to the latch. It would
/// make loop transformations easier by not needing to consider multiple
3899/// predecessors of the latch (See redirectAllPredecessorsTo) and would give us
/// an equivalent to PreheaderIP, AfterIP and BodyIP for inserting code that
3901/// executes after each body iteration.
3902///
3903/// There must be no loop-carried dependencies through llvm::Values. This is
/// equivalent to the condition that the Latch has no PHINode and the Header's
/// only PHINode is
3905/// for the induction variable.
3906///
3907/// All code in Header, Cond, Latch and Exit (plus the terminator of the
3908/// Preheader) are CanonicalLoopInfo's responsibility and their build-up checked
3909/// by assertOK(). They are expected to not be modified unless explicitly
/// modifying the CanonicalLoopInfo through a method that applies an OpenMP
3911/// loop-associated construct such as applyWorkshareLoop, tileLoops, unrollLoop,
3912/// etc. These methods usually invalidate the CanonicalLoopInfo and re-use its
3913/// basic blocks. After invalidation, the CanonicalLoopInfo must not be used
3914/// anymore as its underlying control flow may not exist anymore.
3915/// Loop-transformation methods such as tileLoops, collapseLoops and unrollLoop
3916/// may also return a new CanonicalLoopInfo that can be passed to other
3917/// loop-associated construct implementing methods. These loop-transforming
3918/// methods may either create a new CanonicalLoopInfo usually using
3919/// createLoopSkeleton and invalidate the input CanonicalLoopInfo, or reuse and
3920/// modify one of the input CanonicalLoopInfo and return it as representing the
3921/// modified loop. What is done is an implementation detail of
3922/// transformation-implementing method and callers should always assume that the
3923/// CanonicalLoopInfo passed to it is invalidated and a new object is returned.
3924/// Returned CanonicalLoopInfo have the same structure and guarantees as the one
3925/// created by createCanonicalLoop, such that transforming methods do not have
3926/// to special case where the CanonicalLoopInfo originated from.
3927///
3928/// Generally, methods consuming CanonicalLoopInfo do not need an
3929/// OpenMPIRBuilder::InsertPointTy as argument, but use the locations of the
3930/// CanonicalLoopInfo to insert new or modify existing instructions. Unless
3931/// documented otherwise, methods consuming CanonicalLoopInfo do not invalidate
3932/// any InsertPoint that is outside CanonicalLoopInfo's control. Specifically,
3933/// any InsertPoint in the Preheader, After or Block can still be used after
3934/// calling such a method.
3935///
3936/// TODO: Provide mechanisms for exception handling and cancellation points.
3937///
3938/// Defined outside OpenMPIRBuilder because nested classes cannot be
3939/// forward-declared, e.g. to avoid having to include the entire OMPIRBuilder.h.
class CanonicalLoopInfo {
  friend class OpenMPIRBuilder;

private:
  BasicBlock *Header = nullptr;
  BasicBlock *Cond = nullptr;
  BasicBlock *Latch = nullptr;
  BasicBlock *Exit = nullptr;

  // Holds the value for the `lastiter` of the canonical loop.
  Value *LastIter = nullptr;

  /// Add the control blocks of this loop to \p BBs.
  ///
  /// This does not include any block from the body, including the one returned
  /// by getBody().
  ///
  /// FIXME: This currently includes the Preheader and After blocks even though
  /// their content is (mostly) not under CanonicalLoopInfo's control.
  /// Re-evaluate whether this makes sense.
  void collectControlBlocks(SmallVectorImpl<BasicBlock *> &BBs);

  /// Sets the number of loop iterations to the given value. This value must be
  /// valid in the condition block (i.e., defined in the preheader) and is
  /// interpreted as an unsigned integer.
  void setTripCount(Value *TripCount);

  /// Replace all uses of the canonical induction variable in the loop body with
  /// a new one.
  ///
  /// The intended use case is to update the induction variable for an updated
  /// iteration space such that it can stay normalized in the 0...tripcount-1
  /// range.
  ///
  /// The \p Updater is called with the (presumably updated) current normalized
  /// induction variable and is expected to return the value that uses of the
  /// pre-updated induction values should use instead, typically dependent on
  /// the new induction variable. This is a lambda (instead of e.g. just passing
  /// the new value) to be able to distinguish the uses of the pre-updated
  /// induction variable and uses of the induction variable to compute the
  /// updated induction variable value.
  void mapIndVar(llvm::function_ref<Value *(Instruction *)> Updater);

public:
  /// Sets the last iteration variable for this loop.
  void setLastIter(Value *IterVar) { LastIter = std::move(IterVar); }

  /// Returns the last iteration variable for this loop.
  /// Certain use-cases (like translation of linear clause) may access
  /// this variable even after a loop transformation. Hence, do not guard
  /// this getter function by `isValid`. It is the responsibility of the
  /// callee to ensure this functionality is not invoked by a non-outlined
  /// CanonicalLoopInfo object (in which case, `setLastIter` will never be
  /// invoked and `LastIter` will be by default `nullptr`).
  Value *getLastIter() { return LastIter; }

  /// Returns whether this object currently represents the IR of a loop. If
  /// returning false, it may have been consumed by a loop transformation or not
  /// been initialized. Do not use in this case.
  bool isValid() const { return Header; }

  /// The preheader ensures that there is only a single edge entering the loop.
  /// Code that must be executed before any loop iteration can be emitted here,
  /// such as computing the loop trip count and begin lifetime markers. Code in
  /// the preheader is not considered part of the canonical loop.
  LLVM_ABI BasicBlock *getPreheader() const;

  /// The header is the entry for each iteration. In the canonical control flow,
  /// it only contains the PHINode for the induction variable.
  BasicBlock *getHeader() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Header;
  }

  /// The condition block computes whether there is another loop iteration. If
  /// yes, branches to the body; otherwise to the exit block.
  BasicBlock *getCond() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Cond;
  }

  /// The body block is the single entry for a loop iteration and not controlled
  /// by CanonicalLoopInfo. It can contain arbitrary control flow but must
  /// eventually branch to the \p Latch block.
  BasicBlock *getBody() const {
    assert(isValid() && "Requires a valid canonical loop");
    return cast<BranchInst>(Cond->getTerminator())->getSuccessor(0);
  }

  /// Reaching the latch indicates the end of the loop body code. In the
  /// canonical control flow, it only contains the increment of the induction
  /// variable.
  BasicBlock *getLatch() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Latch;
  }

  /// Reaching the exit indicates no more iterations are being executed.
  BasicBlock *getExit() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Exit;
  }

  /// The after block is intended for clean-up code such as lifetime end
  /// markers. It is separate from the exit block to ensure, analogous to the
  /// preheader, it having just a single entry edge and being free from PHI
  /// nodes should there be multiple loop exits (such as from break
  /// statements/cancellations).
  BasicBlock *getAfter() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Exit->getSingleSuccessor();
  }

  /// Returns the llvm::Value containing the number of loop iterations. It must
  /// be valid in the preheader and always interpreted as an unsigned integer of
  /// any bit-width.
  Value *getTripCount() const {
    assert(isValid() && "Requires a valid canonical loop");
    Instruction *CmpI = &Cond->front();
    assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
    return CmpI->getOperand(1);
  }

  /// Returns the instruction representing the current logical induction
  /// variable. Always unsigned, always starting at 0 with an increment of one.
  Instruction *getIndVar() const {
    assert(isValid() && "Requires a valid canonical loop");
    Instruction *IndVarPHI = &Header->front();
    assert(isa<PHINode>(IndVarPHI) && "First inst must be the IV PHI");
    return IndVarPHI;
  }

  /// Return the type of the induction variable (and the trip count).
  Type *getIndVarType() const {
    assert(isValid() && "Requires a valid canonical loop");
    return getIndVar()->getType();
  }

  /// Return the insertion point for user code before the loop.
  OpenMPIRBuilder::InsertPointTy getPreheaderIP() const {
    assert(isValid() && "Requires a valid canonical loop");
    BasicBlock *Preheader = getPreheader();
    return {Preheader, std::prev(Preheader->end())};
  };

  /// Return the insertion point for user code in the body.
  OpenMPIRBuilder::InsertPointTy getBodyIP() const {
    assert(isValid() && "Requires a valid canonical loop");
    BasicBlock *Body = getBody();
    return {Body, Body->begin()};
  };

  /// Return the insertion point for user code after the loop.
  OpenMPIRBuilder::InsertPointTy getAfterIP() const {
    assert(isValid() && "Requires a valid canonical loop");
    BasicBlock *After = getAfter();
    return {After, After->begin()};
  };

  /// Returns the function containing this canonical loop's blocks.
  Function *getFunction() const {
    assert(isValid() && "Requires a valid canonical loop");
    return Header->getParent();
  }

  /// Consistency self-check.
  LLVM_ABI void assertOK() const;

  /// Invalidate this loop. That is, the underlying IR does not fulfill the
  /// requirements of an OpenMP canonical loop anymore.
  LLVM_ABI void invalidate();
};
4111
4112/// ScanInfo holds the information to assist in lowering of Scan reduction.
4113/// Before lowering, the body of the for loop specifying scan reduction is
4114/// expected to have the following structure
4115///
4116/// Loop Body Entry
4117/// |
4118/// Code before the scan directive
4119/// |
4120/// Scan Directive
4121/// |
4122/// Code after the scan directive
4123/// |
4124/// Loop Body Exit
4125/// When `createCanonicalScanLoops` is executed, the bodyGen callback of it
4126/// transforms the body to:
4127///
4128/// Loop Body Entry
4129/// |
4130/// OMPScanDispatch
4131///
4132/// OMPBeforeScanBlock
4133/// |
4134/// OMPScanLoopExit
4135/// |
4136/// Loop Body Exit
4137///
4138/// The insert point is updated to the first insert point of OMPBeforeScanBlock.
4139/// It dominates the control flow of code generated until
4140/// scan directive is encountered and OMPAfterScanBlock dominates the
4141/// control flow of code generated after scan is encountered. The successor
4142/// of OMPScanDispatch can be OMPBeforeScanBlock or OMPAfterScanBlock based
/// on 1. whether it is in the Input phase or the Scan phase, 2. whether it is an
4144/// exclusive or inclusive scan. This jump is added when `createScan` is
4145/// executed. If input loop is being generated, if it is inclusive scan,
4146/// `OMPAfterScanBlock` succeeds `OMPScanDispatch` , if exclusive,
/// `OMPBeforeScanBlock` succeeds `OMPScanDispatch` and vice versa for the scan
/// loop. At
4148/// the end of the input loop, temporary buffer is populated and at the
4149/// beginning of the scan loop, temporary buffer is read. After scan directive
4150/// is encountered, insertion point is updated to `OMPAfterScanBlock` as it is
4151/// expected to dominate the code after the scan directive. Both Before and
4152/// After scan blocks are succeeded by `OMPScanLoopExit`.
4153/// Temporary buffer allocations are done in `ScanLoopInit` block before the
4154/// lowering of for-loop. The results are copied back to reduction variable in
4155/// `ScanLoopFinish` block.
4156class ScanInfo {
4157public:
4158 /// Dominates the body of the loop before scan directive
4159 llvm::BasicBlock *OMPBeforeScanBlock = nullptr;
4160
4161 /// Dominates the body of the loop before scan directive
4162 llvm::BasicBlock *OMPAfterScanBlock = nullptr;
4163
4164 /// Controls the flow to before or after scan blocks
4165 llvm::BasicBlock *OMPScanDispatch = nullptr;
4166
4167 /// Exit block of loop body
4168 llvm::BasicBlock *OMPScanLoopExit = nullptr;
4169
4170 /// Block before loop body where scan initializations are done
4171 llvm::BasicBlock *OMPScanInit = nullptr;
4172
4173 /// Block after loop body where scan finalizations are done
4174 llvm::BasicBlock *OMPScanFinish = nullptr;
4175
4176 /// If true, it indicates Input phase is lowered; else it indicates
4177 /// ScanPhase is lowered
4178 bool OMPFirstScanLoop = false;
4179
4180 /// Maps the private reduction variable to the pointer of the temporary
4181 /// buffer
4182 llvm::SmallDenseMap<llvm::Value *, llvm::Value *> *ScanBuffPtrs;
4183
4184 /// Keeps track of value of iteration variable for input/scan loop to be
4185 /// used for Scan directive lowering
4186 llvm::Value *IV = nullptr;
4187
4188 /// Stores the span of canonical loop being lowered to be used for temporary
4189 /// buffer allocation or Finalization.
4190 llvm::Value *Span = nullptr;
4191
4192 ScanInfo() {
4193 ScanBuffPtrs = new llvm::SmallDenseMap<llvm::Value *, llvm::Value *>();
4194 }
4195 ScanInfo(ScanInfo &) = delete;
4196 ScanInfo &operator=(const ScanInfo &) = delete;
4197
4198 ~ScanInfo() { delete (ScanBuffPtrs); }
4199};
4200
4201} // end namespace llvm
4202
4203#endif // LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
arc branch finalize
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis false
This file defines the BumpPtrAllocator interface.
static GCRegistry::Add< ShadowStackGC > C("shadow-stack", "Very portable GC for uncooperative code generators")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
#define LLVM_ABI
Definition Compiler.h:213
DXIL Finalize Linkage
Hexagon Hardware Loops
Module.h This file contains the declarations for the Module class.
static std::string getVarName(InstrProfInstBase *Inc, StringRef Prefix, bool &Renamed)
Get the name of a profiling variable for a particular function.
bool operator<(const DeltaInfo &LHS, int64_t Delta)
Definition LineTable.cpp:30
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
Machine Check Debug Module
static std::optional< uint64_t > getSizeInBytes(std::optional< uint64_t > SizeInBits)
#define T
This file defines constans and helpers used when dealing with OpenMP.
Provides definitions for Target specific Grid Values.
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
std::unordered_set< BasicBlock * > BlockSet
This file implements a set that has insertion order iteration characteristics.
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static uint32_t getFlags(const Symbol *Sym)
Definition TapiFile.cpp:26
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, const llvm::StringTable &StandardNames, VectorLibrary VecLib)
Initialize the set of available library functions based on the specified target triple.
@ None
static Function * getFunction(FunctionType *Ty, const Twine &Name, Module *M)
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
LLVM Basic Block Representation.
Definition BasicBlock.h:62
A debug info location.
Definition DebugLoc.h:123
InsertPoint - A saved insertion point.
Definition IRBuilder.h:291
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2794
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:397
The virtual file system interface.
LLVM_ABI bool isGPU(const Module &M)
Return true iff M target a GPU (and we can use GPU AS reasoning).
constexpr char IsVolatile[]
Key for Kernel::Arg::Metadata::mIsVolatile.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
Offsets
Offsets in bytes from the start of the input buffer.
ElementType
The element type of an SRV or UAV resource.
Definition DXILABI.h:60
bool empty() const
Definition BasicBlock.h:101
Context & getContext() const
Definition BasicBlock.h:99
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
LLVM_ABI void append(SmallVectorImpl< char > &path, const Twine &a, const Twine &b="", const Twine &c="", const Twine &d="")
Append to path.
Definition Path.cpp:457
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1667
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
OutputIt move(R &&Range, OutputIt Out)
Provide wrappers to std::move which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1915