Bug Summary

File: clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
Warning: line 551, column 29
The result of the right shift is undefined due to shifting by '33', which is greater or equal to the width of type 'unsigned int'
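For context on how a shift amount of 33 can arise here: LaneIDBits at line 550 is llvm::Log2_32(GV_Warp_Size), and Log2_32 is defined as 31 - countLeadingZeros(Value), which yields (unsigned)-1 when Value is 0. If the analyzer models the target-dependent GV_Warp_Size as 0 on some path, 32u - LaneIDBits wraps around to 33, and the right shift at line 551 shifts an unsigned int by at least its width. The standalone sketch below reproduces only that arithmetic; the WarpSize value of 0 is an assumption about the path the analyzer explored, not something spelled out in this report, and log2_32 is a local stand-in that mirrors llvm::Log2_32.

    // Standalone sketch (hypothetical values): shows how 32u - Log2_32(0) wraps to 33.
    #include <cstdint>
    #include <cstdio>

    // Mirrors the formula used by llvm::Log2_32: 31 - countLeadingZeros(Value).
    static unsigned log2_32(uint32_t Value) {
      return 31 - (Value ? __builtin_clz(Value) : 32); // 0 maps to (unsigned)-1
    }

    int main() {
      unsigned WarpSize = 0;                   // assumed value on the diagnosed path
      unsigned LaneIDBits = log2_32(WarpSize); // 0xFFFFFFFF
      unsigned Shift = 32u - LaneIDBits;       // unsigned wrap-around: 33
      std::printf("shift amount = %u\n", Shift);
      // unsigned LaneIDMask = ~0u >> Shift;   // would be undefined behavior (shift >= 32)
      return 0;
    }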

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -disable-llvm-verifier -discard-value-names -main-file-name CGOpenMPRuntimeGPU.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -relaxed-aliasing -fmath-errno -fno-rounding-math -mconstructor-aliases -munwind-tables -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/tools/clang/lib/CodeGen -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/tools/clang/lib/CodeGen -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/clang/lib/CodeGen -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/clang/include -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/tools/clang/include -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/include -I /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/llvm/include -D NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O2 -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/build-llvm/tools/clang/lib/CodeGen -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e=. -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2021-09-04-040900-46481-1 -x c++ /build/llvm-toolchain-snapshot-14~++20210903100615+fd66b44ec19e/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
1//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This provides a generalized class for OpenMP runtime code generation
10// specialized by GPU targets NVPTX and AMDGCN.
11//
12//===----------------------------------------------------------------------===//
13
14#include "CGOpenMPRuntimeGPU.h"
15#include "CGOpenMPRuntimeNVPTX.h"
16#include "CodeGenFunction.h"
17#include "clang/AST/Attr.h"
18#include "clang/AST/DeclOpenMP.h"
19#include "clang/AST/StmtOpenMP.h"
20#include "clang/AST/StmtVisitor.h"
21#include "clang/Basic/Cuda.h"
22#include "llvm/ADT/SmallPtrSet.h"
23#include "llvm/Frontend/OpenMP/OMPGridValues.h"
24#include "llvm/IR/IntrinsicsNVPTX.h"
25#include "llvm/Support/MathExtras.h"
26
27using namespace clang;
28using namespace CodeGen;
29using namespace llvm::omp;
30
31namespace {
32/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
33class NVPTXActionTy final : public PrePostActionTy {
34 llvm::FunctionCallee EnterCallee = nullptr;
35 ArrayRef<llvm::Value *> EnterArgs;
36 llvm::FunctionCallee ExitCallee = nullptr;
37 ArrayRef<llvm::Value *> ExitArgs;
38 bool Conditional = false;
39 llvm::BasicBlock *ContBlock = nullptr;
40
41public:
42 NVPTXActionTy(llvm::FunctionCallee EnterCallee,
43 ArrayRef<llvm::Value *> EnterArgs,
44 llvm::FunctionCallee ExitCallee,
45 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
46 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
47 ExitArgs(ExitArgs), Conditional(Conditional) {}
48 void Enter(CodeGenFunction &CGF) override {
49 llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
50 if (Conditional) {
51 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
52 auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
53 ContBlock = CGF.createBasicBlock("omp_if.end");
54 // Generate the branch (If-stmt)
55 CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
56 CGF.EmitBlock(ThenBlock);
57 }
58 }
59 void Done(CodeGenFunction &CGF) {
60 // Emit the rest of blocks/branches
61 CGF.EmitBranch(ContBlock);
62 CGF.EmitBlock(ContBlock, true);
63 }
64 void Exit(CodeGenFunction &CGF) override {
65 CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
66 }
67};
68
69/// A class to track the execution mode when codegening directives within
70/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
71/// to the target region and used by containing directives such as 'parallel'
72/// to emit optimized code.
73class ExecutionRuntimeModesRAII {
74private:
75 CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
76 CGOpenMPRuntimeGPU::EM_Unknown;
77 CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
78 bool SavedRuntimeMode = false;
79 bool *RuntimeMode = nullptr;
80
81public:
82 /// Constructor for Non-SPMD mode.
83 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode)
84 : ExecMode(ExecMode) {
85 SavedExecMode = ExecMode;
86 ExecMode = CGOpenMPRuntimeGPU::EM_NonSPMD;
87 }
88 /// Constructor for SPMD mode.
89 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
90 bool &RuntimeMode, bool FullRuntimeMode)
91 : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
92 SavedExecMode = ExecMode;
93 SavedRuntimeMode = RuntimeMode;
94 ExecMode = CGOpenMPRuntimeGPU::EM_SPMD;
95 RuntimeMode = FullRuntimeMode;
96 }
97 ~ExecutionRuntimeModesRAII() {
98 ExecMode = SavedExecMode;
99 if (RuntimeMode)
100 *RuntimeMode = SavedRuntimeMode;
101 }
102};
103
104/// GPU Configuration: This information can be derived from cuda registers,
105/// however, providing compile time constants helps generate more efficient
106/// code. For all practical purposes this is fine because the configuration
107/// is the same for all known NVPTX architectures.
108enum MachineConfiguration : unsigned {
109 /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
110 /// specific Grid Values like GV_Warp_Size, GV_Slot_Size
111
112 /// Global memory alignment for performance.
113 GlobalMemoryAlignment = 128,
114
115 /// Maximal size of the shared memory buffer.
116 SharedMemorySize = 128,
117};
118
119static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
120 RefExpr = RefExpr->IgnoreParens();
121 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
122 const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
123 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
124 Base = TempASE->getBase()->IgnoreParenImpCasts();
125 RefExpr = Base;
126 } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
127 const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
128 while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
129 Base = TempOASE->getBase()->IgnoreParenImpCasts();
130 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
131 Base = TempASE->getBase()->IgnoreParenImpCasts();
132 RefExpr = Base;
133 }
134 RefExpr = RefExpr->IgnoreParenImpCasts();
135 if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
136 return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
137 const auto *ME = cast<MemberExpr>(RefExpr);
138 return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
139}
140
141
142static RecordDecl *buildRecordForGlobalizedVars(
143 ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
144 ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
145 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
146 &MappedDeclsFields, int BufSize) {
147 using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
148 if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
149 return nullptr;
150 SmallVector<VarsDataTy, 4> GlobalizedVars;
151 for (const ValueDecl *D : EscapedDecls)
152 GlobalizedVars.emplace_back(
153 CharUnits::fromQuantity(std::max(
154 C.getDeclAlign(D).getQuantity(),
155 static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
156 D);
157 for (const ValueDecl *D : EscapedDeclsForTeams)
158 GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
159 llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
160 return L.first > R.first;
161 });
162
163 // Build struct _globalized_locals_ty {
164 // /* globalized vars */[WarpSize] align (max(decl_align,
165 // GlobalMemoryAlignment))
166 // /* globalized vars */ for EscapedDeclsForTeams
167 // };
168 RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
169 GlobalizedRD->startDefinition();
170 llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
171 EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
172 for (const auto &Pair : GlobalizedVars) {
173 const ValueDecl *VD = Pair.second;
174 QualType Type = VD->getType();
175 if (Type->isLValueReferenceType())
176 Type = C.getPointerType(Type.getNonReferenceType());
177 else
178 Type = Type.getNonReferenceType();
179 SourceLocation Loc = VD->getLocation();
180 FieldDecl *Field;
181 if (SingleEscaped.count(VD)) {
182 Field = FieldDecl::Create(
183 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
184 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
185 /*BW=*/nullptr, /*Mutable=*/false,
186 /*InitStyle=*/ICIS_NoInit);
187 Field->setAccess(AS_public);
188 if (VD->hasAttrs()) {
189 for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
190 E(VD->getAttrs().end());
191 I != E; ++I)
192 Field->addAttr(*I);
193 }
194 } else {
195 llvm::APInt ArraySize(32, BufSize);
196 Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
197 0);
198 Field = FieldDecl::Create(
199 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
200 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
201 /*BW=*/nullptr, /*Mutable=*/false,
202 /*InitStyle=*/ICIS_NoInit);
203 Field->setAccess(AS_public);
204 llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
205 static_cast<CharUnits::QuantityType>(
206 GlobalMemoryAlignment)));
207 Field->addAttr(AlignedAttr::CreateImplicit(
208 C, /*IsAlignmentExpr=*/true,
209 IntegerLiteral::Create(C, Align,
210 C.getIntTypeForBitwidth(32, /*Signed=*/0),
211 SourceLocation()),
212 {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
213 }
214 GlobalizedRD->addDecl(Field);
215 MappedDeclsFields.try_emplace(VD, Field);
216 }
217 GlobalizedRD->completeDefinition();
218 return GlobalizedRD;
219}
220
221/// Get the list of variables that can escape their declaration context.
222class CheckVarsEscapingDeclContext final
223 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
224 CodeGenFunction &CGF;
225 llvm::SetVector<const ValueDecl *> EscapedDecls;
226 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
227 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
228 RecordDecl *GlobalizedRD = nullptr;
229 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
230 bool AllEscaped = false;
231 bool IsForCombinedParallelRegion = false;
232
233 void markAsEscaped(const ValueDecl *VD) {
234 // Do not globalize declare target variables.
235 if (!isa<VarDecl>(VD) ||
236 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
237 return;
238 VD = cast<ValueDecl>(VD->getCanonicalDecl());
239 // Use user-specified allocation.
240 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
241 return;
242 // Variables captured by value must be globalized.
243 if (auto *CSI = CGF.CapturedStmtInfo) {
244 if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
245 // Check if need to capture the variable that was already captured by
246 // value in the outer region.
247 if (!IsForCombinedParallelRegion) {
248 if (!FD->hasAttrs())
249 return;
250 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
251 if (!Attr)
252 return;
253 if (((Attr->getCaptureKind() != OMPC_map) &&
254 !isOpenMPPrivate(Attr->getCaptureKind())) ||
255 ((Attr->getCaptureKind() == OMPC_map) &&
256 !FD->getType()->isAnyPointerType()))
257 return;
258 }
259 if (!FD->getType()->isReferenceType()) {
260 assert(!VD->getType()->isVariablyModifiedType() &&
261 "Parameter captured by value with variably modified type");
262 EscapedParameters.insert(VD);
263 } else if (!IsForCombinedParallelRegion) {
264 return;
265 }
266 }
267 }
268 if ((!CGF.CapturedStmtInfo ||
269 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
270 VD->getType()->isReferenceType())
271 // Do not globalize variables with reference type.
272 return;
273 if (VD->getType()->isVariablyModifiedType())
274 EscapedVariableLengthDecls.insert(VD);
275 else
276 EscapedDecls.insert(VD);
277 }
278
279 void VisitValueDecl(const ValueDecl *VD) {
280 if (VD->getType()->isLValueReferenceType())
281 markAsEscaped(VD);
282 if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
283 if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
284 const bool SavedAllEscaped = AllEscaped;
285 AllEscaped = VD->getType()->isLValueReferenceType();
286 Visit(VarD->getInit());
287 AllEscaped = SavedAllEscaped;
288 }
289 }
290 }
291 void VisitOpenMPCapturedStmt(const CapturedStmt *S,
292 ArrayRef<OMPClause *> Clauses,
293 bool IsCombinedParallelRegion) {
294 if (!S)
295 return;
296 for (const CapturedStmt::Capture &C : S->captures()) {
297 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
298 const ValueDecl *VD = C.getCapturedVar();
299 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
300 if (IsCombinedParallelRegion) {
301 // Check if the variable is privatized in the combined construct and
302 // those private copies must be shared in the inner parallel
303 // directive.
304 IsForCombinedParallelRegion = false;
305 for (const OMPClause *C : Clauses) {
306 if (!isOpenMPPrivate(C->getClauseKind()) ||
307 C->getClauseKind() == OMPC_reduction ||
308 C->getClauseKind() == OMPC_linear ||
309 C->getClauseKind() == OMPC_private)
310 continue;
311 ArrayRef<const Expr *> Vars;
312 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
313 Vars = PC->getVarRefs();
314 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
315 Vars = PC->getVarRefs();
316 else
317 llvm_unreachable("Unexpected clause.");
318 for (const auto *E : Vars) {
319 const Decl *D =
320 cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
321 if (D == VD->getCanonicalDecl()) {
322 IsForCombinedParallelRegion = true;
323 break;
324 }
325 }
326 if (IsForCombinedParallelRegion)
327 break;
328 }
329 }
330 markAsEscaped(VD);
331 if (isa<OMPCapturedExprDecl>(VD))
332 VisitValueDecl(VD);
333 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
334 }
335 }
336 }
337
338 void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
339 assert(!GlobalizedRD &&
340 "Record for globalized variables is built already.");
341 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
342 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
343 if (IsInTTDRegion)
344 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
345 else
346 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
347 GlobalizedRD = ::buildRecordForGlobalizedVars(
348 CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
349 MappedDeclsFields, WarpSize);
350 }
351
352public:
353 CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
354 ArrayRef<const ValueDecl *> TeamsReductions)
355 : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
356 }
357 virtual ~CheckVarsEscapingDeclContext() = default;
358 void VisitDeclStmt(const DeclStmt *S) {
359 if (!S)
360 return;
361 for (const Decl *D : S->decls())
362 if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
363 VisitValueDecl(VD);
364 }
365 void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
366 if (!D)
367 return;
368 if (!D->hasAssociatedStmt())
369 return;
370 if (const auto *S =
371 dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
372 // Do not analyze directives that do not actually require capturing,
373 // like `omp for` or `omp simd` directives.
374 llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
375 getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
376 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
377 VisitStmt(S->getCapturedStmt());
378 return;
379 }
380 VisitOpenMPCapturedStmt(
381 S, D->clauses(),
382 CaptureRegions.back() == OMPD_parallel &&
383 isOpenMPDistributeDirective(D->getDirectiveKind()));
384 }
385 }
386 void VisitCapturedStmt(const CapturedStmt *S) {
387 if (!S)
388 return;
389 for (const CapturedStmt::Capture &C : S->captures()) {
390 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
391 const ValueDecl *VD = C.getCapturedVar();
392 markAsEscaped(VD);
393 if (isa<OMPCapturedExprDecl>(VD))
394 VisitValueDecl(VD);
395 }
396 }
397 }
398 void VisitLambdaExpr(const LambdaExpr *E) {
399 if (!E)
400 return;
401 for (const LambdaCapture &C : E->captures()) {
402 if (C.capturesVariable()) {
403 if (C.getCaptureKind() == LCK_ByRef) {
404 const ValueDecl *VD = C.getCapturedVar();
405 markAsEscaped(VD);
406 if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
407 VisitValueDecl(VD);
408 }
409 }
410 }
411 }
412 void VisitBlockExpr(const BlockExpr *E) {
413 if (!E)
414 return;
415 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
416 if (C.isByRef()) {
417 const VarDecl *VD = C.getVariable();
418 markAsEscaped(VD);
419 if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
420 VisitValueDecl(VD);
421 }
422 }
423 }
424 void VisitCallExpr(const CallExpr *E) {
425 if (!E)
426 return;
427 for (const Expr *Arg : E->arguments()) {
428 if (!Arg)
429 continue;
430 if (Arg->isLValue()) {
431 const bool SavedAllEscaped = AllEscaped;
432 AllEscaped = true;
433 Visit(Arg);
434 AllEscaped = SavedAllEscaped;
435 } else {
436 Visit(Arg);
437 }
438 }
439 Visit(E->getCallee());
440 }
441 void VisitDeclRefExpr(const DeclRefExpr *E) {
442 if (!E)
443 return;
444 const ValueDecl *VD = E->getDecl();
445 if (AllEscaped)
446 markAsEscaped(VD);
447 if (isa<OMPCapturedExprDecl>(VD))
448 VisitValueDecl(VD);
449 else if (const auto *VarD = dyn_cast<VarDecl>(VD))
450 if (VarD->isInitCapture())
451 VisitValueDecl(VD);
452 }
453 void VisitUnaryOperator(const UnaryOperator *E) {
454 if (!E)
455 return;
456 if (E->getOpcode() == UO_AddrOf) {
457 const bool SavedAllEscaped = AllEscaped;
458 AllEscaped = true;
459 Visit(E->getSubExpr());
460 AllEscaped = SavedAllEscaped;
461 } else {
462 Visit(E->getSubExpr());
463 }
464 }
465 void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
466 if (!E)
467 return;
468 if (E->getCastKind() == CK_ArrayToPointerDecay) {
469 const bool SavedAllEscaped = AllEscaped;
470 AllEscaped = true;
471 Visit(E->getSubExpr());
472 AllEscaped = SavedAllEscaped;
473 } else {
474 Visit(E->getSubExpr());
475 }
476 }
477 void VisitExpr(const Expr *E) {
478 if (!E)
479 return;
480 bool SavedAllEscaped = AllEscaped;
481 if (!E->isLValue())
482 AllEscaped = false;
483 for (const Stmt *Child : E->children())
484 if (Child)
485 Visit(Child);
486 AllEscaped = SavedAllEscaped;
487 }
488 void VisitStmt(const Stmt *S) {
489 if (!S)
490 return;
491 for (const Stmt *Child : S->children())
492 if (Child)
493 Visit(Child);
494 }
495
496 /// Returns the record that handles all the escaped local variables and used
497 /// instead of their original storage.
498 const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
499 if (!GlobalizedRD)
500 buildRecordForGlobalizedVars(IsInTTDRegion);
501 return GlobalizedRD;
502 }
503
504 /// Returns the field in the globalized record for the escaped variable.
505 const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
506 assert(GlobalizedRD &&
507 "Record for globalized variables must be generated already.");
508 auto I = MappedDeclsFields.find(VD);
509 if (I == MappedDeclsFields.end())
510 return nullptr;
511 return I->getSecond();
512 }
513
514 /// Returns the list of the escaped local variables/parameters.
515 ArrayRef<const ValueDecl *> getEscapedDecls() const {
516 return EscapedDecls.getArrayRef();
517 }
518
519 /// Checks if the escaped local variable is actually a parameter passed by
520 /// value.
521 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
522 return EscapedParameters;
523 }
524
525 /// Returns the list of the escaped variables with the variably modified
526 /// types.
527 ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
528 return EscapedVariableLengthDecls.getArrayRef();
529 }
530};
531} // anonymous namespace
532
533/// Get the id of the warp in the block.
534/// We assume that the warp size is 32, which is always the case
535/// on the NVPTX device, to generate more efficient code.
536static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
537 CGBuilderTy &Bld = CGF.Builder;
538 unsigned LaneIDBits =
539 llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
540 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
541 return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
542}
543
544/// Get the id of the current lane in the Warp.
545/// We assume that the warp size is 32, which is always the case
546/// on the NVPTX device, to generate more efficient code.
547static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
548 CGBuilderTy &Bld = CGF.Builder;
549 unsigned LaneIDBits =
550 llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
551 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
11. The result of the right shift is undefined due to shifting by '33', which is greater or equal to the width of type 'unsigned int'
552 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
553 return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
554 "nvptx_lane_id");
555}
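One possible way to compute the lane mask without the width-sized shift, sketched under the power-of-two warp-size assumption the comment above already relies on; this is illustrative only and not necessarily how the issue was resolved upstream. llvm::isPowerOf2_32 comes from llvm/Support/MathExtras.h, which this file already includes at line 25; every other call mirrors the function listed above.

    // Sketch: derive the lane mask directly from the warp size (WarpSize - 1 for a
    // power-of-two warp) so no shift by "32 - LaneIDBits" is needed at all.
    static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
      CGBuilderTy &Bld = CGF.Builder;
      unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
      assert(llvm::isPowerOf2_32(WarpSize) && "expected power-of-two warp size");
      unsigned LaneIDMask = WarpSize - 1; // e.g. 31 for a 32-lane warp
      auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
      return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
                           "nvptx_lane_id");
    }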
556
557CGOpenMPRuntimeGPU::ExecutionMode
558CGOpenMPRuntimeGPU::getExecutionMode() const {
559 return CurrentExecutionMode;
560}
561
562static CGOpenMPRuntimeGPU::DataSharingMode
563getDataSharingMode(CodeGenModule &CGM) {
564 return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
565 : CGOpenMPRuntimeGPU::Generic;
566}
567
568/// Check for inner (nested) SPMD construct, if any
569static bool hasNestedSPMDDirective(ASTContext &Ctx,
570 const OMPExecutableDirective &D) {
571 const auto *CS = D.getInnermostCapturedStmt();
572 const auto *Body =
573 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
574 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
575
576 if (const auto *NestedDir =
577 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
578 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
579 switch (D.getDirectiveKind()) {
580 case OMPD_target:
581 if (isOpenMPParallelDirective(DKind))
582 return true;
583 if (DKind == OMPD_teams) {
584 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
585 /*IgnoreCaptured=*/true);
586 if (!Body)
587 return false;
588 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
589 if (const auto *NND =
590 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
591 DKind = NND->getDirectiveKind();
592 if (isOpenMPParallelDirective(DKind))
593 return true;
594 }
595 }
596 return false;
597 case OMPD_target_teams:
598 return isOpenMPParallelDirective(DKind);
599 case OMPD_target_simd:
600 case OMPD_target_parallel:
601 case OMPD_target_parallel_for:
602 case OMPD_target_parallel_for_simd:
603 case OMPD_target_teams_distribute:
604 case OMPD_target_teams_distribute_simd:
605 case OMPD_target_teams_distribute_parallel_for:
606 case OMPD_target_teams_distribute_parallel_for_simd:
607 case OMPD_parallel:
608 case OMPD_for:
609 case OMPD_parallel_for:
610 case OMPD_parallel_master:
611 case OMPD_parallel_sections:
612 case OMPD_for_simd:
613 case OMPD_parallel_for_simd:
614 case OMPD_cancel:
615 case OMPD_cancellation_point:
616 case OMPD_ordered:
617 case OMPD_threadprivate:
618 case OMPD_allocate:
619 case OMPD_task:
620 case OMPD_simd:
621 case OMPD_sections:
622 case OMPD_section:
623 case OMPD_single:
624 case OMPD_master:
625 case OMPD_critical:
626 case OMPD_taskyield:
627 case OMPD_barrier:
628 case OMPD_taskwait:
629 case OMPD_taskgroup:
630 case OMPD_atomic:
631 case OMPD_flush:
632 case OMPD_depobj:
633 case OMPD_scan:
634 case OMPD_teams:
635 case OMPD_target_data:
636 case OMPD_target_exit_data:
637 case OMPD_target_enter_data:
638 case OMPD_distribute:
639 case OMPD_distribute_simd:
640 case OMPD_distribute_parallel_for:
641 case OMPD_distribute_parallel_for_simd:
642 case OMPD_teams_distribute:
643 case OMPD_teams_distribute_simd:
644 case OMPD_teams_distribute_parallel_for:
645 case OMPD_teams_distribute_parallel_for_simd:
646 case OMPD_target_update:
647 case OMPD_declare_simd:
648 case OMPD_declare_variant:
649 case OMPD_begin_declare_variant:
650 case OMPD_end_declare_variant:
651 case OMPD_declare_target:
652 case OMPD_end_declare_target:
653 case OMPD_declare_reduction:
654 case OMPD_declare_mapper:
655 case OMPD_taskloop:
656 case OMPD_taskloop_simd:
657 case OMPD_master_taskloop:
658 case OMPD_master_taskloop_simd:
659 case OMPD_parallel_master_taskloop:
660 case OMPD_parallel_master_taskloop_simd:
661 case OMPD_requires:
662 case OMPD_unknown:
663 default:
664 llvm_unreachable("Unexpected directive.");
665 }
666 }
667
668 return false;
669}
670
671static bool supportsSPMDExecutionMode(ASTContext &Ctx,
672 const OMPExecutableDirective &D) {
673 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
674 switch (DirectiveKind) {
675 case OMPD_target:
676 case OMPD_target_teams:
677 return hasNestedSPMDDirective(Ctx, D);
678 case OMPD_target_parallel:
679 case OMPD_target_parallel_for:
680 case OMPD_target_parallel_for_simd:
681 case OMPD_target_teams_distribute_parallel_for:
682 case OMPD_target_teams_distribute_parallel_for_simd:
683 case OMPD_target_simd:
684 case OMPD_target_teams_distribute_simd:
685 return true;
686 case OMPD_target_teams_distribute:
687 return false;
688 case OMPD_parallel:
689 case OMPD_for:
690 case OMPD_parallel_for:
691 case OMPD_parallel_master:
692 case OMPD_parallel_sections:
693 case OMPD_for_simd:
694 case OMPD_parallel_for_simd:
695 case OMPD_cancel:
696 case OMPD_cancellation_point:
697 case OMPD_ordered:
698 case OMPD_threadprivate:
699 case OMPD_allocate:
700 case OMPD_task:
701 case OMPD_simd:
702 case OMPD_sections:
703 case OMPD_section:
704 case OMPD_single:
705 case OMPD_master:
706 case OMPD_critical:
707 case OMPD_taskyield:
708 case OMPD_barrier:
709 case OMPD_taskwait:
710 case OMPD_taskgroup:
711 case OMPD_atomic:
712 case OMPD_flush:
713 case OMPD_depobj:
714 case OMPD_scan:
715 case OMPD_teams:
716 case OMPD_target_data:
717 case OMPD_target_exit_data:
718 case OMPD_target_enter_data:
719 case OMPD_distribute:
720 case OMPD_distribute_simd:
721 case OMPD_distribute_parallel_for:
722 case OMPD_distribute_parallel_for_simd:
723 case OMPD_teams_distribute:
724 case OMPD_teams_distribute_simd:
725 case OMPD_teams_distribute_parallel_for:
726 case OMPD_teams_distribute_parallel_for_simd:
727 case OMPD_target_update:
728 case OMPD_declare_simd:
729 case OMPD_declare_variant:
730 case OMPD_begin_declare_variant:
731 case OMPD_end_declare_variant:
732 case OMPD_declare_target:
733 case OMPD_end_declare_target:
734 case OMPD_declare_reduction:
735 case OMPD_declare_mapper:
736 case OMPD_taskloop:
737 case OMPD_taskloop_simd:
738 case OMPD_master_taskloop:
739 case OMPD_master_taskloop_simd:
740 case OMPD_parallel_master_taskloop:
741 case OMPD_parallel_master_taskloop_simd:
742 case OMPD_requires:
743 case OMPD_unknown:
744 default:
745 break;
746 }
747 llvm_unreachable(
748 "Unknown programming model for OpenMP directive on NVPTX target.");
749}
750
751/// Check if the directive is loops based and has schedule clause at all or has
752/// static scheduling.
753static bool hasStaticScheduling(const OMPExecutableDirective &D) {
754 assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
755 isOpenMPLoopDirective(D.getDirectiveKind()) &&
756 "Expected loop-based directive.");
757 return !D.hasClausesOfKind<OMPOrderedClause>() &&
758 (!D.hasClausesOfKind<OMPScheduleClause>() ||
759 llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
760 [](const OMPScheduleClause *C) {
761 return C->getScheduleKind() == OMPC_SCHEDULE_static;
762 }));
763}
764
765/// Check for inner (nested) lightweight runtime construct, if any
766static bool hasNestedLightweightDirective(ASTContext &Ctx,
767 const OMPExecutableDirective &D) {
768 assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
769 const auto *CS = D.getInnermostCapturedStmt();
770 const auto *Body =
771 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
772 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
773
774 if (const auto *NestedDir =
775 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
776 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
777 switch (D.getDirectiveKind()) {
778 case OMPD_target:
779 if (isOpenMPParallelDirective(DKind) &&
780 isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
781 hasStaticScheduling(*NestedDir))
782 return true;
783 if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
784 return true;
785 if (DKind == OMPD_parallel) {
786 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
787 /*IgnoreCaptured=*/true);
788 if (!Body)
789 return false;
790 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
791 if (const auto *NND =
792 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
793 DKind = NND->getDirectiveKind();
794 if (isOpenMPWorksharingDirective(DKind) &&
795 isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
796 return true;
797 }
798 } else if (DKind == OMPD_teams) {
799 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
800 /*IgnoreCaptured=*/true);
801 if (!Body)
802 return false;
803 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
804 if (const auto *NND =
805 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
806 DKind = NND->getDirectiveKind();
807 if (isOpenMPParallelDirective(DKind) &&
808 isOpenMPWorksharingDirective(DKind) &&
809 isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
810 return true;
811 if (DKind == OMPD_parallel) {
812 Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
813 /*IgnoreCaptured=*/true);
814 if (!Body)
815 return false;
816 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
817 if (const auto *NND =
818 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
819 DKind = NND->getDirectiveKind();
820 if (isOpenMPWorksharingDirective(DKind) &&
821 isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
822 return true;
823 }
824 }
825 }
826 }
827 return false;
828 case OMPD_target_teams:
829 if (isOpenMPParallelDirective(DKind) &&
830 isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
831 hasStaticScheduling(*NestedDir))
832 return true;
833 if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
834 return true;
835 if (DKind == OMPD_parallel) {
836 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
837 /*IgnoreCaptured=*/true);
838 if (!Body)
839 return false;
840 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
841 if (const auto *NND =
842 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
843 DKind = NND->getDirectiveKind();
844 if (isOpenMPWorksharingDirective(DKind) &&
845 isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
846 return true;
847 }
848 }
849 return false;
850 case OMPD_target_parallel:
851 if (DKind == OMPD_simd)
852 return true;
853 return isOpenMPWorksharingDirective(DKind) &&
854 isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
855 case OMPD_target_teams_distribute:
856 case OMPD_target_simd:
857 case OMPD_target_parallel_for:
858 case OMPD_target_parallel_for_simd:
859 case OMPD_target_teams_distribute_simd:
860 case OMPD_target_teams_distribute_parallel_for:
861 case OMPD_target_teams_distribute_parallel_for_simd:
862 case OMPD_parallel:
863 case OMPD_for:
864 case OMPD_parallel_for:
865 case OMPD_parallel_master:
866 case OMPD_parallel_sections:
867 case OMPD_for_simd:
868 case OMPD_parallel_for_simd:
869 case OMPD_cancel:
870 case OMPD_cancellation_point:
871 case OMPD_ordered:
872 case OMPD_threadprivate:
873 case OMPD_allocate:
874 case OMPD_task:
875 case OMPD_simd:
876 case OMPD_sections:
877 case OMPD_section:
878 case OMPD_single:
879 case OMPD_master:
880 case OMPD_critical:
881 case OMPD_taskyield:
882 case OMPD_barrier:
883 case OMPD_taskwait:
884 case OMPD_taskgroup:
885 case OMPD_atomic:
886 case OMPD_flush:
887 case OMPD_depobj:
888 case OMPD_scan:
889 case OMPD_teams:
890 case OMPD_target_data:
891 case OMPD_target_exit_data:
892 case OMPD_target_enter_data:
893 case OMPD_distribute:
894 case OMPD_distribute_simd:
895 case OMPD_distribute_parallel_for:
896 case OMPD_distribute_parallel_for_simd:
897 case OMPD_teams_distribute:
898 case OMPD_teams_distribute_simd:
899 case OMPD_teams_distribute_parallel_for:
900 case OMPD_teams_distribute_parallel_for_simd:
901 case OMPD_target_update:
902 case OMPD_declare_simd:
903 case OMPD_declare_variant:
904 case OMPD_begin_declare_variant:
905 case OMPD_end_declare_variant:
906 case OMPD_declare_target:
907 case OMPD_end_declare_target:
908 case OMPD_declare_reduction:
909 case OMPD_declare_mapper:
910 case OMPD_taskloop:
911 case OMPD_taskloop_simd:
912 case OMPD_master_taskloop:
913 case OMPD_master_taskloop_simd:
914 case OMPD_parallel_master_taskloop:
915 case OMPD_parallel_master_taskloop_simd:
916 case OMPD_requires:
917 case OMPD_unknown:
918 default:
919 llvm_unreachable("Unexpected directive.");
920 }
921 }
922
923 return false;
924}
925
926/// Checks if the construct supports lightweight runtime. It must be SPMD
927/// construct + inner loop-based construct with static scheduling.
928static bool supportsLightweightRuntime(ASTContext &Ctx,
929 const OMPExecutableDirective &D) {
930 if (!supportsSPMDExecutionMode(Ctx, D))
931 return false;
932 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
933 switch (DirectiveKind) {
934 case OMPD_target:
935 case OMPD_target_teams:
936 case OMPD_target_parallel:
937 return hasNestedLightweightDirective(Ctx, D);
938 case OMPD_target_parallel_for:
939 case OMPD_target_parallel_for_simd:
940 case OMPD_target_teams_distribute_parallel_for:
941 case OMPD_target_teams_distribute_parallel_for_simd:
942 // (Last|First)-privates must be shared in parallel region.
943 return hasStaticScheduling(D);
944 case OMPD_target_simd:
945 case OMPD_target_teams_distribute_simd:
946 return true;
947 case OMPD_target_teams_distribute:
948 return false;
949 case OMPD_parallel:
950 case OMPD_for:
951 case OMPD_parallel_for:
952 case OMPD_parallel_master:
953 case OMPD_parallel_sections:
954 case OMPD_for_simd:
955 case OMPD_parallel_for_simd:
956 case OMPD_cancel:
957 case OMPD_cancellation_point:
958 case OMPD_ordered:
959 case OMPD_threadprivate:
960 case OMPD_allocate:
961 case OMPD_task:
962 case OMPD_simd:
963 case OMPD_sections:
964 case OMPD_section:
965 case OMPD_single:
966 case OMPD_master:
967 case OMPD_critical:
968 case OMPD_taskyield:
969 case OMPD_barrier:
970 case OMPD_taskwait:
971 case OMPD_taskgroup:
972 case OMPD_atomic:
973 case OMPD_flush:
974 case OMPD_depobj:
975 case OMPD_scan:
976 case OMPD_teams:
977 case OMPD_target_data:
978 case OMPD_target_exit_data:
979 case OMPD_target_enter_data:
980 case OMPD_distribute:
981 case OMPD_distribute_simd:
982 case OMPD_distribute_parallel_for:
983 case OMPD_distribute_parallel_for_simd:
984 case OMPD_teams_distribute:
985 case OMPD_teams_distribute_simd:
986 case OMPD_teams_distribute_parallel_for:
987 case OMPD_teams_distribute_parallel_for_simd:
988 case OMPD_target_update:
989 case OMPD_declare_simd:
990 case OMPD_declare_variant:
991 case OMPD_begin_declare_variant:
992 case OMPD_end_declare_variant:
993 case OMPD_declare_target:
994 case OMPD_end_declare_target:
995 case OMPD_declare_reduction:
996 case OMPD_declare_mapper:
997 case OMPD_taskloop:
998 case OMPD_taskloop_simd:
999 case OMPD_master_taskloop:
1000 case OMPD_master_taskloop_simd:
1001 case OMPD_parallel_master_taskloop:
1002 case OMPD_parallel_master_taskloop_simd:
1003 case OMPD_requires:
1004 case OMPD_unknown:
1005 default:
1006 break;
1007 }
1008 llvm_unreachable(
1009 "Unknown programming model for OpenMP directive on NVPTX target.");
1010}
1011
1012void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
1013 StringRef ParentName,
1014 llvm::Function *&OutlinedFn,
1015 llvm::Constant *&OutlinedFnID,
1016 bool IsOffloadEntry,
1017 const RegionCodeGenTy &CodeGen) {
1018 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
1019 EntryFunctionState EST;
1020 WrapperFunctionsMap.clear();
1021
1022 // Emit target region as a standalone region.
1023 class NVPTXPrePostActionTy : public PrePostActionTy {
1024 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
1025
1026 public:
1027 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
1028 : EST(EST) {}
1029 void Enter(CodeGenFunction &CGF) override {
1030 auto &RT =
1031 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1032 RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
1033 // Skip target region initialization.
1034 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1035 }
1036 void Exit(CodeGenFunction &CGF) override {
1037 auto &RT =
1038 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1039 RT.clearLocThreadIdInsertPt(CGF);
1040 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
1041 }
1042 } Action(EST);
1043 CodeGen.setAction(Action);
1044 IsInTTDRegion = true;
1045 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1046 IsOffloadEntry, CodeGen);
1047 IsInTTDRegion = false;
1048}
1049
1050void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF,
1051 EntryFunctionState &EST, bool IsSPMD) {
1052 CGBuilderTy &Bld = CGF.Builder;
1053 Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD, requiresFullRuntime()));
1054 IsInTargetMasterThreadRegion = IsSPMD;
1055 if (!IsSPMD)
1056 emitGenericVarsProlog(CGF, EST.Loc);
1057}
1058
1059void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
1060 EntryFunctionState &EST,
1061 bool IsSPMD) {
1062 if (!IsSPMD)
1063 emitGenericVarsEpilog(CGF);
1064
1065 CGBuilderTy &Bld = CGF.Builder;
1066 OMPBuilder.createTargetDeinit(Bld, IsSPMD, requiresFullRuntime());
1067}
1068
1069void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
1070 StringRef ParentName,
1071 llvm::Function *&OutlinedFn,
1072 llvm::Constant *&OutlinedFnID,
1073 bool IsOffloadEntry,
1074 const RegionCodeGenTy &CodeGen) {
1075 ExecutionRuntimeModesRAII ModeRAII(
1076 CurrentExecutionMode, RequiresFullRuntime,
1077 CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
1078 !supportsLightweightRuntime(CGM.getContext(), D));
1079 EntryFunctionState EST;
1080
1081 // Emit target region as a standalone region.
1082 class NVPTXPrePostActionTy : public PrePostActionTy {
1083 CGOpenMPRuntimeGPU &RT;
1084 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
1085
1086 public:
1087 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
1088 CGOpenMPRuntimeGPU::EntryFunctionState &EST)
1089 : RT(RT), EST(EST) {}
1090 void Enter(CodeGenFunction &CGF) override {
1091 RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
1092 // Skip target region initialization.
1093 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1094 }
1095 void Exit(CodeGenFunction &CGF) override {
1096 RT.clearLocThreadIdInsertPt(CGF);
1097 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
1098 }
1099 } Action(*this, EST);
1100 CodeGen.setAction(Action);
1101 IsInTTDRegion = true;
1102 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1103 IsOffloadEntry, CodeGen);
1104 IsInTTDRegion = false;
1105}
1106
1107// Create a unique global variable to indicate the execution mode of this target
1108// region. The execution mode is either 'generic', or 'spmd' depending on the
1109// target directive. This variable is picked up by the offload library to setup
1110// the device appropriately before kernel launch. If the execution mode is
1111// 'generic', the runtime reserves one warp for the master, otherwise, all
1112// warps participate in parallel work.
1113static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
1114 bool Mode) {
1115 auto *GVMode =
1116 new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1117 llvm::GlobalValue::WeakAnyLinkage,
1118 llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
1119 Twine(Name, "_exec_mode"));
1120 CGM.addCompilerUsedGlobal(GVMode);
1121}
1122
1123void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
1124 llvm::Constant *Addr,
1125 uint64_t Size, int32_t,
1126 llvm::GlobalValue::LinkageTypes) {
1127 // TODO: Add support for global variables on the device after declare target
1128 // support.
1129 if (!isa<llvm::Function>(Addr))
1130 return;
1131 llvm::Module &M = CGM.getModule();
1132 llvm::LLVMContext &Ctx = CGM.getLLVMContext();
1133
1134 // Get "nvvm.annotations" metadata node
1135 llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
1136
1137 llvm::Metadata *MDVals[] = {
1138 llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
1139 llvm::ConstantAsMetadata::get(
1140 llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1141 // Append metadata to nvvm.annotations
1142 MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1143}
1144
1145void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
1146 const OMPExecutableDirective &D, StringRef ParentName,
1147 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
1148 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
1149 if (!IsOffloadEntry) // Nothing to do.
1150 return;
1151
1152 assert(!ParentName.empty() && "Invalid target region parent name!");
1153
1154 bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
1155 if (Mode)
1156 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1157 CodeGen);
1158 else
1159 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1160 CodeGen);
1161
1162 setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
1163}
1164
1165namespace {
1166LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
1167/// Enum for accessing the reserved_2 field of the ident_t struct.
1168enum ModeFlagsTy : unsigned {
1169 /// Bit set to 1 when in SPMD mode.
1170 KMP_IDENT_SPMD_MODE = 0x01,
1171 /// Bit set to 1 when a simplified runtime is used.
1172 KMP_IDENT_SIMPLE_RT_MODE = 0x02,
1173 LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
1174};
1175
1176/// Special mode Undefined. Is the combination of Non-SPMD mode + SimpleRuntime.
1177static const ModeFlagsTy UndefinedMode =
1178 (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
1179} // anonymous namespace
1180
1181unsigned CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags() const {
1182 switch (getExecutionMode()) {
1183 case EM_SPMD:
1184 if (requiresFullRuntime())
1185 return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
1186 return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
1187 case EM_NonSPMD:
1188 assert(requiresFullRuntime() && "Expected full runtime.");
1189 return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
1190 case EM_Unknown:
1191 return UndefinedMode;
1192 }
1193 llvm_unreachable("Unknown flags are requested.");
1194}
1195
1196CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
1197 : CGOpenMPRuntime(CGM, "_", "$") {
1198 if (!CGM.getLangOpts().OpenMPIsDevice)
1199 llvm_unreachable("OpenMP NVPTX can only handle device code.");
1200}
1201
1202void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
1203 ProcBindKind ProcBind,
1204 SourceLocation Loc) {
1205 // Do nothing in case of SPMD mode and L0 parallel.
1206 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
1207 return;
1208
1209 CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1210}
1211
1212void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
1213 llvm::Value *NumThreads,
1214 SourceLocation Loc) {
1215 // Do nothing in case of SPMD mode and L0 parallel.
1216 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
1217 return;
1218
1219 CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1220}
1221
1222void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
1223 const Expr *NumTeams,
1224 const Expr *ThreadLimit,
1225 SourceLocation Loc) {}
1226
1227llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
1228 const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1229 OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1230 // Emit target region as a standalone region.
1231 class NVPTXPrePostActionTy : public PrePostActionTy {
1232 bool &IsInParallelRegion;
1233 bool PrevIsInParallelRegion;
1234
1235 public:
1236 NVPTXPrePostActionTy(bool &IsInParallelRegion)
1237 : IsInParallelRegion(IsInParallelRegion) {}
1238 void Enter(CodeGenFunction &CGF) override {
1239 PrevIsInParallelRegion = IsInParallelRegion;
1240 IsInParallelRegion = true;
1241 }
1242 void Exit(CodeGenFunction &CGF) override {
1243 IsInParallelRegion = PrevIsInParallelRegion;
1244 }
1245 } Action(IsInParallelRegion);
1246 CodeGen.setAction(Action);
1247 bool PrevIsInTTDRegion = IsInTTDRegion;
1248 IsInTTDRegion = false;
1249 bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1250 IsInTargetMasterThreadRegion = false;
1251 auto *OutlinedFun =
1252 cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1253 D, ThreadIDVar, InnermostKind, CodeGen));
1254 IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
1255 IsInTTDRegion = PrevIsInTTDRegion;
1256 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD &&
1257 !IsInParallelRegion) {
1258 llvm::Function *WrapperFun =
1259 createParallelDataSharingWrapper(OutlinedFun, D);
1260 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
1261 }
1262
1263 return OutlinedFun;
1264}
1265
1266/// Get list of lastprivate variables from the teams distribute ... or
1267/// teams {distribute ...} directives.
1268static void
1269getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1270 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1271 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1272 "expected teams directive.");
1273 const OMPExecutableDirective *Dir = &D;
1274 if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
1275 if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
1276 Ctx,
1277 D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
1278 /*IgnoreCaptured=*/true))) {
1279 Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
1280 if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
1281 Dir = nullptr;
1282 }
1283 }
1284 if (!Dir)
1285 return;
1286 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
1287 for (const Expr *E : C->getVarRefs())
1288 Vars.push_back(getPrivateItem(E));
1289 }
1290}
1291
1292/// Get list of reduction variables from the teams ... directives.
1293static void
1294getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1295 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1296 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1297 "expected teams directive.");
1298 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
1299 for (const Expr *E : C->privates())
1300 Vars.push_back(getPrivateItem(E));
1301 }
1302}
1303
1304llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
1305 const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1306 OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1307 SourceLocation Loc = D.getBeginLoc();
1308
1309 const RecordDecl *GlobalizedRD = nullptr;
1310 llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
1311 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
1312 unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
1313 // Globalize team reductions variable unconditionally in all modes.
1314 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1315 getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
1316 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
1317 getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
1318 if (!LastPrivatesReductions.empty()) {
1319 GlobalizedRD = ::buildRecordForGlobalizedVars(
1320 CGM.getContext(), llvm::None, LastPrivatesReductions,
1321 MappedDeclsFields, WarpSize);
1322 }
1323 } else if (!LastPrivatesReductions.empty()) {
1324 assert(!TeamAndReductions.first &&
1325 "Previous team declaration is not expected.");
1326 TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
1327 std::swap(TeamAndReductions.second, LastPrivatesReductions);
1328 }
1329
1330 // Emit target region as a standalone region.
1331 class NVPTXPrePostActionTy : public PrePostActionTy {
1332 SourceLocation &Loc;
1333 const RecordDecl *GlobalizedRD;
1334 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1335 &MappedDeclsFields;
1336
1337 public:
1338 NVPTXPrePostActionTy(
1339 SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1340 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1341 &MappedDeclsFields)
1342 : Loc(Loc), GlobalizedRD(GlobalizedRD),
1343 MappedDeclsFields(MappedDeclsFields) {}
1344 void Enter(CodeGenFunction &CGF) override {
1345 auto &Rt =
1346 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1347 if (GlobalizedRD) {
1348 auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1349 I->getSecond().MappedParams =
1350 std::make_unique<CodeGenFunction::OMPMapVars>();
1351 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1352 for (const auto &Pair : MappedDeclsFields) {
1353 assert(Pair.getFirst()->isCanonicalDecl() &&
1354 "Expected canonical declaration");
1355 Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
1356 }
1357 }
1358 Rt.emitGenericVarsProlog(CGF, Loc);
1359 }
1360 void Exit(CodeGenFunction &CGF) override {
1361 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1362 .emitGenericVarsEpilog(CGF);
1363 }
1364 } Action(Loc, GlobalizedRD, MappedDeclsFields);
1365 CodeGen.setAction(Action);
1366 llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1367 D, ThreadIDVar, InnermostKind, CodeGen);
1368
1369 return OutlinedFun;
1370}
1371
1372void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1373 SourceLocation Loc,
1374 bool WithSPMDCheck) {
1375 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
1376 getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1377 return;
1378
1379 CGBuilderTy &Bld = CGF.Builder;
1380
1381 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1382 if (I == FunctionGlobalizedDecls.end())
1383 return;
1384
1385 for (auto &Rec : I->getSecond().LocalVarData) {
1386 const auto *VD = cast<VarDecl>(Rec.first);
1387 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1388 QualType VarTy = VD->getType();
1389
1390 // Get the local allocation of a firstprivate variable before sharing
1391 llvm::Value *ParValue;
1392 if (EscapedParam) {
1393 LValue ParLVal =
1394 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1395 ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1396 }
1397
1398 // Allocate space for the variable to be globalized
1399 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1400 llvm::Instruction *VoidPtr =
1401 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1402 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1403 AllocArgs, VD->getName());
1404
1405 // Cast the void pointer and get the address of the globalized variable.
1406 llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
1407 llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1408 VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
1409 LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
1410 Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1411 Rec.second.GlobalizedVal = VoidPtr;
1412
1413 // Assign the local allocation to the newly globalized location.
1414 if (EscapedParam) {
1415 CGF.EmitStoreOfScalar(ParValue, VarAddr);
1416 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
1417 }
1418 if (auto *DI = CGF.getDebugInfo())
1419 VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
1420 }
1421 for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
1422 // Use actual memory size of the VLA object including the padding
1423 // for alignment purposes.
1424 llvm::Value *Size = CGF.getTypeSize(VD->getType());
1425 CharUnits Align = CGM.getContext().getDeclAlign(VD);
1426 Size = Bld.CreateNUWAdd(
1427 Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1428 llvm::Value *AlignVal =
1429 llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1430
1431 Size = Bld.CreateUDiv(Size, AlignVal);
1432 Size = Bld.CreateNUWMul(Size, AlignVal);
1433
1434 // Allocate space for this VLA object to be globalized.
1435 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1436 llvm::Instruction *VoidPtr =
1437 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1438 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1439 AllocArgs, VD->getName());
1440
1441 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(
1442 std::pair<llvm::Value *, llvm::Value *>(
1443 {VoidPtr, CGF.getTypeSize(VD->getType())}));
1444 LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(),
1445 CGM.getContext().getDeclAlign(VD),
1446 AlignmentSource::Decl);
1447 I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
1448 Base.getAddress(CGF));
1449 }
1450 I->getSecond().MappedParams->apply(CGF);
1451}
1452
1453void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
1454 bool WithSPMDCheck) {
1455 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
1456 getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1457 return;
1458
1459 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1460 if (I != FunctionGlobalizedDecls.end()) {
1461 // Deallocate the memory for each globalized VLA object
1462 for (auto AddrSizePair :
1463 llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1464 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1465 CGM.getModule(), OMPRTL___kmpc_free_shared),
1466 {AddrSizePair.first, AddrSizePair.second});
1467 }
1468 // Deallocate the memory for each globalized value
1469 for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
1470 const auto *VD = cast<VarDecl>(Rec.first);
1471 I->getSecond().MappedParams->restore(CGF);
1472
1473 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1474 CGF.getTypeSize(VD->getType())};
1475 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1476 CGM.getModule(), OMPRTL___kmpc_free_shared),
1477 FreeArgs);
1478 }
1479 }
1480}
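The epilog releases those allocations in reverse order. Conceptually, for the same hypothetical x (and analogously for each VLA pair recorded in EscapedVariableLengthDeclsAddrs):

  __kmpc_free_shared(x_void, sizeof(T));  // pointer and size mirror the earlier allocation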
1481
1482void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1483 const OMPExecutableDirective &D,
1484 SourceLocation Loc,
1485 llvm::Function *OutlinedFn,
1486 ArrayRef<llvm::Value *> CapturedVars) {
1487 if (!CGF.HaveInsertPoint())
1488 return;
1489
1490 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1491 /*Name=*/".zero.addr");
1492 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
1493 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1494 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
1495 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1496 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1497 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1498}
1499
1500void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1501 SourceLocation Loc,
1502 llvm::Function *OutlinedFn,
1503 ArrayRef<llvm::Value *> CapturedVars,
1504 const Expr *IfCond) {
1505 if (!CGF.HaveInsertPoint())
1506 return;
1507
1508 auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars,
1509 IfCond](CodeGenFunction &CGF, PrePostActionTy &Action) {
1510 CGBuilderTy &Bld = CGF.Builder;
1511 llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1512 llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
1513 if (WFn)
1514 ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1515 llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
1516
1517 // Create a private scope that will globalize the arguments
1518 // passed from the outside of the target region.
1519 // TODO: Is that needed?
1520 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1521
1522 Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1523 llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
1524 "captured_vars_addrs");
1525 // There's something to share.
1526 if (!CapturedVars.empty()) {
1527 // Prepare for parallel region. Indicate the outlined function.
1528 ASTContext &Ctx = CGF.getContext();
1529 unsigned Idx = 0;
1530 for (llvm::Value *V : CapturedVars) {
1531 Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
1532 llvm::Value *PtrV;
1533 if (V->getType()->isIntegerTy())
1534 PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1535 else
1536 PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
1537 CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1538 Ctx.getPointerType(Ctx.VoidPtrTy));
1539 ++Idx;
1540 }
1541 }
1542
1543 llvm::Value *IfCondVal = nullptr;
1544 if (IfCond)
1545 IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
1546 /* isSigned */ false);
1547 else
1548 IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1549
1550 assert(IfCondVal && "Expected a value")(static_cast<void> (0));
1551 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1552 llvm::Value *Args[] = {
1553 RTLoc,
1554 getThreadID(CGF, Loc),
1555 IfCondVal,
1556 llvm::ConstantInt::get(CGF.Int32Ty, -1),
1557 llvm::ConstantInt::get(CGF.Int32Ty, -1),
1558 FnPtr,
1559 ID,
1560 Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
1561 CGF.VoidPtrPtrTy),
1562 llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1563 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1564 CGM.getModule(), OMPRTL___kmpc_parallel_51),
1565 Args);
1566 };
1567
1568 RegionCodeGenTy RCG(ParallelGen);
1569 RCG(CGF);
1570}
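The Args array built above corresponds to a single runtime call. Rendered as C-like pseudocode, with parameter labels that are assumptions inferred from the values passed rather than declarations from this file:

  __kmpc_parallel_51(loc, thread_id, if_cond,
                     /*num_threads=*/-1, /*proc_bind=*/-1,
                     outlined_fn, wrapper_fn_or_null,
                     (void **)captured_vars_addrs, num_captured_vars);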
1571
1572void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1573 // Always emit simple barriers!
1574 if (!CGF.HaveInsertPoint())
1575 return;
1576 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1577 // This function does not use parameters, so we can emit just default values.
1578 llvm::Value *Args[] = {
1579 llvm::ConstantPointerNull::get(
1580 cast<llvm::PointerType>(getIdentTyPointerTy())),
1581 llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
1582 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1583 CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
1584 Args);
1585}
1586
1587void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1588 SourceLocation Loc,
1589 OpenMPDirectiveKind Kind, bool,
1590 bool) {
1591 // Always emit simple barriers!
1592 if (!CGF.HaveInsertPoint())
1593 return;
1594 // Build call __kmpc_cancel_barrier(loc, thread_id);
1595 unsigned Flags = getDefaultFlagsForBarriers(Kind);
1596 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1597 getThreadID(CGF, Loc)};
1598
1599 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1600 CGM.getModule(), OMPRTL___kmpc_barrier),
1601 Args);
1602}
1603
1604void CGOpenMPRuntimeGPU::emitCriticalRegion(
1605 CodeGenFunction &CGF, StringRef CriticalName,
1606 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1607 const Expr *Hint) {
1608 llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
1609 llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
1610 llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
1611 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
1612 llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
1613
1614 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1615
1616 // Get the mask of active threads in the warp.
1617 llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1618 CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
1619 // Fetch team-local id of the thread.
1620 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1621
1622 // Get the width of the team.
1623 llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1624
1625 // Initialize the counter variable for the loop.
1626 QualType Int32Ty =
1627 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1628 Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
1629 LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
1630 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
1631 /*isInit=*/true);
1632
1633 // Block checks if loop counter exceeds upper bound.
1634 CGF.EmitBlock(LoopBB);
1635 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1636 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
1637 CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
1638
1639 // Block tests which single thread should execute region, and which threads
1640 // should go straight to synchronisation point.
1641 CGF.EmitBlock(TestBB);
1642 CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1643 llvm::Value *CmpThreadToCounter =
1644 CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
1645 CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
1646
1647 // Block emits the body of the critical region.
1648 CGF.EmitBlock(BodyBB);
1649
1650 // Output the critical statement.
1651 CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1652 Hint);
1653
1654 // After the body surrounded by the critical region, the single executing
1655 // thread will jump to the synchronisation point.
1656 // Block waits for all threads in current team to finish then increments the
1657 // counter variable and returns to the loop.
1658 CGF.EmitBlock(SyncBB);
1659 // Reconverge active threads in the warp.
1660 (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1661 CGM.getModule(), OMPRTL___kmpc_syncwarp),
1662 Mask);
1663
1664 llvm::Value *IncCounterVal =
1665 CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
1666 CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
1667 CGF.EmitBranch(LoopBB);
1668
1669 // Block that is reached when all threads in the team complete the region.
1670 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
1671}
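Taken together, these blocks serialize the critical region by giving each thread of the team exactly one turn. A pseudocode sketch of the emitted control flow (illustrative only, not generated verbatim):

  mask = __kmpc_warp_active_thread_mask();
  for (counter = 0; counter < team_width; ++counter) {
    if (thread_id == counter)
      /* critical region body, emitted via the base emitCriticalRegion */;
    __kmpc_syncwarp(mask);   // reconverge active lanes before the next turn
  }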
1672
1673/// Cast value to the specified type.
1674static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1675 QualType ValTy, QualType CastTy,
1676 SourceLocation Loc) {
1677 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&(static_cast<void> (0))
1678 "Cast type must sized.")(static_cast<void> (0));
1679 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&(static_cast<void> (0))
1680 "Val type must sized.")(static_cast<void> (0));
1681 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
1682 if (ValTy == CastTy)
1683 return Val;
1684 if (CGF.getContext().getTypeSizeInChars(ValTy) ==
1685 CGF.getContext().getTypeSizeInChars(CastTy))
1686 return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
1687 if (CastTy->isIntegerType() && ValTy->isIntegerType())
1688 return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
1689 CastTy->hasSignedIntegerRepresentation());
1690 Address CastItem = CGF.CreateMemTemp(CastTy);
1691 Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1692 CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
1693 CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
1694 LValueBaseInfo(AlignmentSource::Type),
1695 TBAAAccessInfo());
1696 return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
1697 LValueBaseInfo(AlignmentSource::Type),
1698 TBAAAccessInfo());
1699}
1700
1701/// This function creates calls to one of two shuffle functions to copy
1702/// variables between lanes in a warp.
1703static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
1704 llvm::Value *Elem,
1705 QualType ElemType,
1706 llvm::Value *Offset,
1707 SourceLocation Loc) {
1708 CodeGenModule &CGM = CGF.CGM;
1709 CGBuilderTy &Bld = CGF.Builder;
1710 CGOpenMPRuntimeGPU &RT =
1711 *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
1712 llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
1713
1714 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1715 assert(Size.getQuantity() <= 8 &&(static_cast<void> (0))
1716 "Unsupported bitwidth in shuffle instruction.")(static_cast<void> (0));
1717
1718 RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
1719 ? OMPRTL___kmpc_shuffle_int32
1720 : OMPRTL___kmpc_shuffle_int64;
1721
1722 // Cast all types to 32- or 64-bit values before calling shuffle routines.
1723 QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
1724 Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
1725 llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
1726 llvm::Value *WarpSize =
1727 Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
1728
1729 llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
1730 OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
1731 {ElemCast, Offset, WarpSize});
1732
1733 return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
1734}
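In effect, the helper widens the element to a 32- or 64-bit integer, calls the matching shuffle entry point, and narrows the result back. A sketch for a 2-byte element; the exact runtime signature is an assumption consistent with the arguments built above:

  int32_t widened = (int32_t)elem;                          // castValueToType
  int32_t moved = __kmpc_shuffle_int32(widened, offset, warp_size);
  int16_t result = (int16_t)moved;                          // cast back to ElemType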
1735
1736static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
1737 Address DestAddr, QualType ElemType,
1738 llvm::Value *Offset, SourceLocation Loc) {
1739 CGBuilderTy &Bld = CGF.Builder;
1740
1741 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1742 // Create the loop over the big sized data.
1743 // ptr = (void*)Elem;
1744 // ptrEnd = (void*) Elem + 1;
1745 // Step = 8;
1746 // while (ptr + Step < ptrEnd)
1747 // shuffle((int64_t)*ptr);
1748 // Step = 4;
1749 // while (ptr + Step < ptrEnd)
1750 // shuffle((int32_t)*ptr);
1751 // ...
1752 Address ElemPtr = DestAddr;
1753 Address Ptr = SrcAddr;
1754 Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
1755 Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
1756 for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
1757 if (Size < CharUnits::fromQuantity(IntSize))
1758 continue;
1759 QualType IntType = CGF.getContext().getIntTypeForBitwidth(
1760 CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
1761 /*Signed=*/1);
1762 llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
1763 Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
1764 ElemPtr =
1765 Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
1766 if (Size.getQuantity() / IntSize > 1) {
1767 llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
1768 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
1769 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
1770 llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
1771 CGF.EmitBlock(PreCondBB);
1772 llvm::PHINode *PhiSrc =
1773 Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
1774 PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
1775 llvm::PHINode *PhiDest =
1776 Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
1777 PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
1778 Ptr = Address(PhiSrc, Ptr.getAlignment());
1779 ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
1780 llvm::Value *PtrDiff = Bld.CreatePtrDiff(
1781 PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
1782 Ptr.getPointer(), CGF.VoidPtrTy));
1783 Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
1784 ThenBB, ExitBB);
1785 CGF.EmitBlock(ThenBB);
1786 llvm::Value *Res = createRuntimeShuffleFunction(
1787 CGF,
1788 CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1789 LValueBaseInfo(AlignmentSource::Type),
1790 TBAAAccessInfo()),
1791 IntType, Offset, Loc);
1792 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1793 LValueBaseInfo(AlignmentSource::Type),
1794 TBAAAccessInfo());
1795 Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
1796 Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1797 PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
1798 PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
1799 CGF.EmitBranch(PreCondBB);
1800 CGF.EmitBlock(ExitBB);
1801 } else {
1802 llvm::Value *Res = createRuntimeShuffleFunction(
1803 CGF,
1804 CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1805 LValueBaseInfo(AlignmentSource::Type),
1806 TBAAAccessInfo()),
1807 IntType, Offset, Loc);
1808 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1809 LValueBaseInfo(AlignmentSource::Type),
1810 TBAAAccessInfo());
1811 Ptr = Bld.CreateConstGEP(Ptr, 1);
1812 ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1813 }
1814 Size = Size % IntSize;
1815 }
1816}
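A worked example of the chunking above (arithmetic only, not emitted code):

  // Size = 12: IntSize 8 -> 12/8 == 1, one int64 shuffle, remainder 4;
  //            IntSize 4 -> one int32 shuffle, remainder 0.
  // Size = 24: IntSize 8 -> 24/8 == 3 > 1, so the PHI-based loop emits three int64 shuffles.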
1817
1818namespace {
1819enum CopyAction : unsigned {
1820 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1821 // the warp using shuffle instructions.
1822 RemoteLaneToThread,
1823 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1824 ThreadCopy,
1825 // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
1826 ThreadToScratchpad,
1827 // ScratchpadToThread: Copy from a scratchpad array in global memory
1828 // containing team-reduced data to a thread's stack.
1829 ScratchpadToThread,
1830};
1831} // namespace
1832
1833struct CopyOptionsTy {
1834 llvm::Value *RemoteLaneOffset;
1835 llvm::Value *ScratchpadIndex;
1836 llvm::Value *ScratchpadWidth;
1837};
1838
1839/// Emit instructions to copy a Reduce list, which contains partially
1840/// aggregated values, in the specified direction.
1841static void emitReductionListCopy(
1842 CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
1843 ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
1844 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
1845
1846 CodeGenModule &CGM = CGF.CGM;
1847 ASTContext &C = CGM.getContext();
1848 CGBuilderTy &Bld = CGF.Builder;
1849
1850 llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1851 llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
1852 llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
1853
1854 // Iterates, element-by-element, through the source Reduce list and
1855 // makes a copy.
1856 unsigned Idx = 0;
1857 unsigned Size = Privates.size();
1858 for (const Expr *Private : Privates) {
1859 Address SrcElementAddr = Address::invalid();
1860 Address DestElementAddr = Address::invalid();
1861 Address DestElementPtrAddr = Address::invalid();
1862 // Should we shuffle in an element from a remote lane?
1863 bool ShuffleInElement = false;
1864 // Set to true to update the pointer in the dest Reduce list to a
1865 // newly created element.
1866 bool UpdateDestListPtr = false;
1867 // Increment the src or dest pointer to the scratchpad, for each
1868 // new element.
1869 bool IncrScratchpadSrc = false;
1870 bool IncrScratchpadDest = false;
1871
1872 switch (Action) {
1873 case RemoteLaneToThread: {
1874 // Step 1.1: Get the address for the src element in the Reduce list.
1875 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1876 SrcElementAddr = CGF.EmitLoadOfPointer(
1877 SrcElementPtrAddr,
1878 C.getPointerType(Private->getType())->castAs<PointerType>());
1879
1880 // Step 1.2: Create a temporary to store the element in the destination
1881 // Reduce list.
1882 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1883 DestElementAddr =
1884 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1885 ShuffleInElement = true;
1886 UpdateDestListPtr = true;
1887 break;
1888 }
1889 case ThreadCopy: {
1890 // Step 1.1: Get the address for the src element in the Reduce list.
1891 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1892 SrcElementAddr = CGF.EmitLoadOfPointer(
1893 SrcElementPtrAddr,
1894 C.getPointerType(Private->getType())->castAs<PointerType>());
1895
1896 // Step 1.2: Get the address for dest element. The destination
1897 // element has already been created on the thread's stack.
1898 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1899 DestElementAddr = CGF.EmitLoadOfPointer(
1900 DestElementPtrAddr,
1901 C.getPointerType(Private->getType())->castAs<PointerType>());
1902 break;
1903 }
1904 case ThreadToScratchpad: {
1905 // Step 1.1: Get the address for the src element in the Reduce list.
1906 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1907 SrcElementAddr = CGF.EmitLoadOfPointer(
1908 SrcElementPtrAddr,
1909 C.getPointerType(Private->getType())->castAs<PointerType>());
1910
1911 // Step 1.2: Get the address for dest element:
1912 // address = base + index * ElementSizeInChars.
1913 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1914 llvm::Value *CurrentOffset =
1915 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1916 llvm::Value *ScratchPadElemAbsolutePtrVal =
1917 Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
1918 ScratchPadElemAbsolutePtrVal =
1919 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1920 DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
1921 C.getTypeAlignInChars(Private->getType()));
1922 IncrScratchpadDest = true;
1923 break;
1924 }
1925 case ScratchpadToThread: {
1926 // Step 1.1: Get the address for the src element in the scratchpad.
1927 // address = base + index * ElementSizeInChars.
1928 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1929 llvm::Value *CurrentOffset =
1930 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1931 llvm::Value *ScratchPadElemAbsolutePtrVal =
1932 Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
1933 ScratchPadElemAbsolutePtrVal =
1934 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1935 SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
1936 C.getTypeAlignInChars(Private->getType()));
1937 IncrScratchpadSrc = true;
1938
1939 // Step 1.2: Create a temporary to store the element in the destination
1940 // Reduce list.
1941 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1942 DestElementAddr =
1943 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1944 UpdateDestListPtr = true;
1945 break;
1946 }
1947 }
1948
1949 // Regardless of src and dest of copy, we emit the load of src
1950 // element as this is required in all directions
1951 SrcElementAddr = Bld.CreateElementBitCast(
1952 SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
1953 DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
1954 SrcElementAddr.getElementType());
1955
1956 // Now that all active lanes have read the element in the
1957 // Reduce list, shuffle over the value from the remote lane.
1958 if (ShuffleInElement) {
1959 shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
1960 RemoteLaneOffset, Private->getExprLoc());
1961 } else {
1962 switch (CGF.getEvaluationKind(Private->getType())) {
1963 case TEK_Scalar: {
1964 llvm::Value *Elem = CGF.EmitLoadOfScalar(
1965 SrcElementAddr, /*Volatile=*/false, Private->getType(),
1966 Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
1967 TBAAAccessInfo());
1968 // Store the source element value to the dest element address.
1969 CGF.EmitStoreOfScalar(
1970 Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
1971 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
1972 break;
1973 }
1974 case TEK_Complex: {
1975 CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
1976 CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1977 Private->getExprLoc());
1978 CGF.EmitStoreOfComplex(
1979 Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1980 /*isInit=*/false);
1981 break;
1982 }
1983 case TEK_Aggregate:
1984 CGF.EmitAggregateCopy(
1985 CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1986 CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1987 Private->getType(), AggValueSlot::DoesNotOverlap);
1988 break;
1989 }
1990 }
1991
1992 // Step 3.1: Modify reference in dest Reduce list as needed.
1993 // Modifying the reference in Reduce list to point to the newly
1994 // created element. The element is live in the current function
1995 // scope and that of functions it invokes (i.e., reduce_function).
1996 // RemoteReduceData[i] = (void*)&RemoteElem
1997 if (UpdateDestListPtr) {
1998 CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
1999 DestElementAddr.getPointer(), CGF.VoidPtrTy),
2000 DestElementPtrAddr, /*Volatile=*/false,
2001 C.VoidPtrTy);
2002 }
2003
2004 // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
2005 // address of the next element in scratchpad memory, unless we're currently
2006 // processing the last one. Memory alignment is also taken care of here.
2007 if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
2008 llvm::Value *ScratchpadBasePtr =
2009 IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
2010 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2011 ScratchpadBasePtr = Bld.CreateNUWAdd(
2012 ScratchpadBasePtr,
2013 Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
2014
2015 // Take care of global memory alignment for performance
2016 ScratchpadBasePtr = Bld.CreateNUWSub(
2017 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2018 ScratchpadBasePtr = Bld.CreateUDiv(
2019 ScratchpadBasePtr,
2020 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
2021 ScratchpadBasePtr = Bld.CreateNUWAdd(
2022 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2023 ScratchpadBasePtr = Bld.CreateNUWMul(
2024 ScratchpadBasePtr,
2025 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
2026
2027 if (IncrScratchpadDest)
2028 DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
2029 else /* IncrScratchpadSrc = true */
2030 SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
2031 }
2032
2033 ++Idx;
2034 }
2035}
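For the RemoteLaneToThread action, the per-element effect reduces to the following hedged sketch (names are illustrative, not emitted symbols):

  T remote_elem;                                    // the ".omp.reduction.element" temporary
  shuffle_into(&remote_elem, *src_list[i], offset); // shuffleAndStore from the remote lane
  dest_list[i] = (void *)&remote_elem;              // UpdateDestListPtr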
2036
2037/// This function emits a helper that gathers Reduce lists from the first
2038/// lane of every active warp to lanes in the first warp.
2039///
2040/// void inter_warp_copy_func(void* reduce_data, num_warps)
2041/// shared smem[warp_size];
2042/// For all data entries D in reduce_data:
2043/// sync
2044/// If (I am the first lane in each warp)
2045/// Copy my local D to smem[warp_id]
2046/// sync
2047/// if (I am the first warp)
2048/// Copy smem[thread_id] to my local D
2049static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
2050 ArrayRef<const Expr *> Privates,
2051 QualType ReductionArrayTy,
2052 SourceLocation Loc) {
2053 ASTContext &C = CGM.getContext();
2054 llvm::Module &M = CGM.getModule();
2055
2056 // ReduceList: thread local Reduce list.
2057 // At the stage of the computation when this function is called, partially
2058 // aggregated values reside in the first lane of every active warp.
2059 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2060 C.VoidPtrTy, ImplicitParamDecl::Other);
2061 // NumWarps: number of warps active in the parallel region. This could
2062 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2063 ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2064 C.getIntTypeForBitwidth(32, /* Signed */ true),
2065 ImplicitParamDecl::Other);
2066 FunctionArgList Args;
2067 Args.push_back(&ReduceListArg);
2068 Args.push_back(&NumWarpsArg);
2069
2070 const CGFunctionInfo &CGFI =
2071 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2072 auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
2073 llvm::GlobalValue::InternalLinkage,
2074 "_omp_reduction_inter_warp_copy_func", &M);
2075 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2076 Fn->setDoesNotRecurse();
2077 CodeGenFunction CGF(CGM);
2078 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2079
2080 CGBuilderTy &Bld = CGF.Builder;
2081
2082 // This array is used as a medium to transfer, one reduce element at a time,
2083 // the data from the first lane of every warp to lanes in the first warp
2084 // in order to perform the final step of a reduction in a parallel region
2085 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2086 // for reduced latency, as well as to have a distinct copy for concurrently
2087 // executing target regions. The array is declared with weak linkage so
2088 // as to be shared across compilation units.
2089 StringRef TransferMediumName =
2090 "__openmp_nvptx_data_transfer_temporary_storage";
2091 llvm::GlobalVariable *TransferMedium =
2092 M.getGlobalVariable(TransferMediumName);
2093 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
2094 if (!TransferMedium) {
8. Assuming 'TransferMedium' is non-null
9. Taking false branch
2095 auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
2096 unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
2097 TransferMedium = new llvm::GlobalVariable(
2098 M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
2099 llvm::UndefValue::get(Ty), TransferMediumName,
2100 /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
2101 SharedAddressSpace);
2102 CGM.addCompilerUsedGlobal(TransferMedium);
2103 }
2104
2105 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2106 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2107 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
2108 // nvptx_lane_id = nvptx_id % warpsize
2109 llvm::Value *LaneID = getNVPTXLaneID(CGF);
10. Calling 'getNVPTXLaneID'
2110 // nvptx_warp_id = nvptx_id / warpsize
2111 llvm::Value *WarpID = getNVPTXWarpID(CGF);
2112
2113 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2114 Address LocalReduceList(
2115 Bld.CreatePointerBitCastOrAddrSpaceCast(
2116 CGF.EmitLoadOfScalar(
2117 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
2118 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
2119 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2120 CGF.getPointerAlign());
2121
2122 unsigned Idx = 0;
2123 for (const Expr *Private : Privates) {
2124 //
2125 // Warp master copies reduce element to transfer medium in __shared__
2126 // memory.
2127 //
2128 unsigned RealTySize =
2129 C.getTypeSizeInChars(Private->getType())
2130 .alignTo(C.getTypeAlignInChars(Private->getType()))
2131 .getQuantity();
2132 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
2133 unsigned NumIters = RealTySize / TySize;
2134 if (NumIters == 0)
2135 continue;
2136 QualType CType = C.getIntTypeForBitwidth(
2137 C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
2138 llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
2139 CharUnits Align = CharUnits::fromQuantity(TySize);
2140 llvm::Value *Cnt = nullptr;
2141 Address CntAddr = Address::invalid();
2142 llvm::BasicBlock *PrecondBB = nullptr;
2143 llvm::BasicBlock *ExitBB = nullptr;
2144 if (NumIters > 1) {
2145 CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
2146 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
2147 /*Volatile=*/false, C.IntTy);
2148 PrecondBB = CGF.createBasicBlock("precond");
2149 ExitBB = CGF.createBasicBlock("exit");
2150 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
2151 // There is no need to emit line number for unconditional branch.
2152 (void)ApplyDebugLocation::CreateEmpty(CGF);
2153 CGF.EmitBlock(PrecondBB);
2154 Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
2155 llvm::Value *Cmp =
2156 Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
2157 Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
2158 CGF.EmitBlock(BodyBB);
2159 }
2160 // kmpc_barrier.
2161 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
2162 /*EmitChecks=*/false,
2163 /*ForceSimpleCall=*/true);
2164 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2165 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2166 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2167
2168 // if (lane_id == 0)
2169 llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
2170 Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2171 CGF.EmitBlock(ThenBB);
2172
2173 // Reduce element = LocalReduceList[i]
2174 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2175 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2176 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2177 // elemptr = ((CopyType*)(elemptrptr)) + I
2178 Address ElemPtr = Address(ElemPtrPtr, Align);
2179 ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
2180 if (NumIters > 1) {
2181 ElemPtr = Address(Bld.CreateGEP(ElemPtr.getElementType(),
2182 ElemPtr.getPointer(), Cnt),
2183 ElemPtr.getAlignment());
2184 }
2185
2186 // Get pointer to location in transfer medium.
2187 // MediumPtr = &medium[warp_id]
2188 llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
2189 TransferMedium->getValueType(), TransferMedium,
2190 {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
2191 Address MediumPtr(MediumPtrVal, Align);
2192 // Casting to actual data type.
2193 // MediumPtr = (CopyType*)MediumPtrAddr;
2194 MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);
2195
2196 // elem = *elemptr
2197 //*MediumPtr = elem
2198 llvm::Value *Elem = CGF.EmitLoadOfScalar(
2199 ElemPtr, /*Volatile=*/false, CType, Loc,
2200 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2201 // Store the source element value to the dest element address.
2202 CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
2203 LValueBaseInfo(AlignmentSource::Type),
2204 TBAAAccessInfo());
2205
2206 Bld.CreateBr(MergeBB);
2207
2208 CGF.EmitBlock(ElseBB);
2209 Bld.CreateBr(MergeBB);
2210
2211 CGF.EmitBlock(MergeBB);
2212
2213 // kmpc_barrier.
2214 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
2215 /*EmitChecks=*/false,
2216 /*ForceSimpleCall=*/true);
2217
2218 //
2219 // Warp 0 copies reduce element from transfer medium.
2220 //
2221 llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
2222 llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
2223 llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
2224
2225 Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
2226 llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
2227 AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
2228
2229 // Up to 32 threads in warp 0 are active.
2230 llvm::Value *IsActiveThread =
2231 Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
2232 Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2233
2234 CGF.EmitBlock(W0ThenBB);
2235
2236 // SrcMediumPtr = &medium[tid]
2237 llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
2238 TransferMedium->getValueType(), TransferMedium,
2239 {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
2240 Address SrcMediumPtr(SrcMediumPtrVal, Align);
2241 // SrcMediumVal = *SrcMediumPtr;
2242 SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
2243
2244 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2245 Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2246 llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
2247 TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
2248 Address TargetElemPtr = Address(TargetElemPtrVal, Align);
2249 TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
2250 if (NumIters > 1) {
2251 TargetElemPtr = Address(Bld.CreateGEP(TargetElemPtr.getElementType(),
2252 TargetElemPtr.getPointer(), Cnt),
2253 TargetElemPtr.getAlignment());
2254 }
2255
2256 // *TargetElemPtr = SrcMediumVal;
2257 llvm::Value *SrcMediumValue =
2258 CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
2259 CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
2260 CType);
2261 Bld.CreateBr(W0MergeBB);
2262
2263 CGF.EmitBlock(W0ElseBB);
2264 Bld.CreateBr(W0MergeBB);
2265
2266 CGF.EmitBlock(W0MergeBB);
2267
2268 if (NumIters > 1) {
2269 Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
2270 CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
2271 CGF.EmitBranch(PrecondBB);
2272 (void)ApplyDebugLocation::CreateEmpty(CGF);
2273 CGF.EmitBlock(ExitBB);
2274 }
2275 RealTySize %= TySize;
2276 }
2277 ++Idx;
2278 }
2279
2280 CGF.FinishFunction();
2281 return Fn;
2282}
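The emitted body realizes the pseudocode from the doc comment chunk by chunk (4-, 2-, then 1-byte pieces). A hedged sketch of one chunk's round trip through the shared array, with illustrative names:

  /* team barrier */
  if (lane_id == 0)
    transfer_medium[warp_id] = chunk;            // warp master publishes its piece
  /* team barrier */
  if (thread_id < num_warps)
    local_chunk = transfer_medium[thread_id];    // lanes of warp 0 gather the results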
2283
2284/// Emit a helper that reduces data across two OpenMP threads (lanes)
2285/// in the same warp. It uses shuffle instructions to copy over data from
2286/// a remote lane's stack. The reduction algorithm performed is specified
2287/// by the fourth parameter.
2288///
2289/// Algorithm Versions.
2290/// Full Warp Reduce (argument value 0):
2291/// This algorithm assumes that all 32 lanes are active and gathers
2292/// data from these 32 lanes, producing a single resultant value.
2293/// Contiguous Partial Warp Reduce (argument value 1):
2294/// This algorithm assumes that only a *contiguous* subset of lanes
2295/// are active. This happens for the last warp in a parallel region
2296/// when the user specified num_threads is not an integer multiple of
2297/// 32. This contiguous subset always starts with the zeroth lane.
2298/// Partial Warp Reduce (argument value 2):
2299/// This algorithm gathers data from any number of lanes at any position.
2300/// All reduced values are stored in the lowest possible lane. The set
2301/// of problems every algorithm addresses is a super set of those
2302/// addressable by algorithms with a lower version number. Overhead
2303/// increases as algorithm version increases.
2304///
2305/// Terminology
2306/// Reduce element:
2307/// Reduce element refers to the individual data field with primitive
2308/// data types to be combined and reduced across threads.
2309/// Reduce list:
2310/// Reduce list refers to a collection of local, thread-private
2311/// reduce elements.
2312/// Remote Reduce list:
2313/// Remote Reduce list refers to a collection of remote (relative to
2314/// the current thread) reduce elements.
2315///
2316/// We distinguish between three states of threads that are important to
2317/// the implementation of this function.
2318/// Alive threads:
2319/// Threads in a warp executing the SIMT instruction, as distinguished from
2320/// threads that are inactive due to divergent control flow.
2321/// Active threads:
2322/// The minimal set of threads that has to be alive upon entry to this
2323/// function. The computation is correct iff active threads are alive.
2324/// Some threads are alive but they are not active because they do not
2325/// contribute to the computation in any useful manner. Turning them off
2326/// may introduce control flow overheads without any tangible benefits.
2327/// Effective threads:
2328/// In order to comply with the argument requirements of the shuffle
2329/// function, we must keep all lanes holding data alive. But at most
2330/// half of them perform value aggregation; we refer to this half of
2331/// threads as effective. The other half is simply handing off their
2332/// data.
2333///
2334/// Procedure
2335/// Value shuffle:
2336/// In this step active threads transfer data from higher lane positions
2337/// in the warp to lower lane positions, creating Remote Reduce list.
2338/// Value aggregation:
2339/// In this step, effective threads combine their thread local Reduce list
2340/// with Remote Reduce list and store the result in the thread local
2341/// Reduce list.
2342/// Value copy:
2343/// In this step, we deal with the assumption made by algorithm 2
2344/// (i.e. contiguity assumption). When we have an odd number of lanes
2345/// active, say 2k+1, only k threads will be effective and therefore k
2346/// new values will be produced. However, the Reduce list owned by the
2347/// (2k+1)th thread is ignored in the value aggregation. Therefore
2348/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
2349/// that the contiguity assumption still holds.
2350static llvm::Function *emitShuffleAndReduceFunction(
2351 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2352 QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
2353 ASTContext &C = CGM.getContext();
2354
2355 // Thread local Reduce list used to host the values of data to be reduced.
2356 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2357 C.VoidPtrTy, ImplicitParamDecl::Other);
2358 // Current lane id; could be logical.
2359 ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2360 ImplicitParamDecl::Other);
2361 // Offset of the remote source lane relative to the current lane.
2362 ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2363 C.ShortTy, ImplicitParamDecl::Other);
2364 // Algorithm version. This is expected to be known at compile time.
2365 ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2366 C.ShortTy, ImplicitParamDecl::Other);
2367 FunctionArgList Args;
2368 Args.push_back(&ReduceListArg);
2369 Args.push_back(&LaneIDArg);
2370 Args.push_back(&RemoteLaneOffsetArg);
2371 Args.push_back(&AlgoVerArg);
2372
2373 const CGFunctionInfo &CGFI =
2374 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2375 auto *Fn = llvm::Function::Create(
2376 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2377 "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2378 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2379 Fn->setDoesNotRecurse();
2380
2381 CodeGenFunction CGF(CGM);
2382 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2383
2384 CGBuilderTy &Bld = CGF.Builder;
2385
2386 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2387 Address LocalReduceList(
2388 Bld.CreatePointerBitCastOrAddrSpaceCast(
2389 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2390 C.VoidPtrTy, SourceLocation()),
2391 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2392 CGF.getPointerAlign());
2393
2394 Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2395 llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2396 AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2397
2398 Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2399 llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2400 AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2401
2402 Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2403 llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2404 AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2405
2406 // Create a local thread-private variable to host the Reduce list
2407 // from a remote lane.
2408 Address RemoteReduceList =
2409 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2410
2411 // This loop iterates through the list of reduce elements and copies,
2412 // element by element, from a remote lane in the warp to RemoteReduceList,
2413 // hosted on the thread's stack.
2414 emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
2415 LocalReduceList, RemoteReduceList,
2416 {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
2417 /*ScratchpadIndex=*/nullptr,
2418 /*ScratchpadWidth=*/nullptr});
2419
2420 // The actions to be performed on the Remote Reduce list are dependent
2421 // on the algorithm version.
2422 //
2423 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2424 // LaneId % 2 == 0 && Offset > 0):
2425 // do the reduction value aggregation
2426 //
2427 // The thread local variable Reduce list is mutated in place to host the
2428 // reduced data, which is the aggregated value produced from local and
2429 // remote lanes.
2430 //
2431 // Note that AlgoVer is expected to be a constant integer known at compile
2432 // time.
2433 // When AlgoVer==0, the first conjunction evaluates to true, making
2434 // the entire predicate true during compile time.
2435 // When AlgoVer==1, the second conjunction has only the second part to be
2436 // evaluated during runtime. Other conjunctions evaluate to false
2437 // during compile time.
2438 // When AlgoVer==2, the third conjunction has only the second part to be
2439 // evaluated during runtime. Other conjunctions evaluate to false
2440 // during compile time.
2441 llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
2442
2443 llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2444 llvm::Value *CondAlgo1 = Bld.CreateAnd(
2445 Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
2446
2447 llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
2448 llvm::Value *CondAlgo2 = Bld.CreateAnd(
2449 Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
2450 CondAlgo2 = Bld.CreateAnd(
2451 CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
2452
2453 llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
2454 CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
2455
2456 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2457 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2458 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2459 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
2460
2461 CGF.EmitBlock(ThenBB);
2462 // reduce_function(LocalReduceList, RemoteReduceList)
2463 llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2464 LocalReduceList.getPointer(), CGF.VoidPtrTy);
2465 llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2466 RemoteReduceList.getPointer(), CGF.VoidPtrTy);
2467 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2468 CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
2469 Bld.CreateBr(MergeBB);
2470
2471 CGF.EmitBlock(ElseBB);
2472 Bld.CreateBr(MergeBB);
2473
2474 CGF.EmitBlock(MergeBB);
2475
2476 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2477 // Reduce list.
2478 Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2479 llvm::Value *CondCopy = Bld.CreateAnd(
2480 Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
2481
2482 llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
2483 llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
2484 llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
2485 Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2486
2487 CGF.EmitBlock(CpyThenBB);
2488 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
2489 RemoteReduceList, LocalReduceList);
2490 Bld.CreateBr(CpyMergeBB);
2491
2492 CGF.EmitBlock(CpyElseBB);
2493 Bld.CreateBr(CpyMergeBB);
2494
2495 CGF.EmitBlock(CpyMergeBB);
2496
2497 CGF.FinishFunction();
2498 return Fn;
2499}
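The helper itself performs a single (lane, offset) exchange; a caller drives it over decreasing offsets. A hypothetical driver loop for a full-warp butterfly reduction with algorithm version 0 (an assumption about how the runtime uses the helper, not code from this file):

  for (int16_t offset = warp_size / 2; offset > 0; offset /= 2)
    shuffle_and_reduce(reduce_list, lane_id, offset, /*AlgoVer=*/0);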
2500
2501/// This function emits a helper that copies all the reduction variables from
2502/// the team into the provided global buffer for the reduction variables.
2503///
2504/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
2505/// For all data entries D in reduce_data:
2506/// Copy local D to buffer.D[Idx]
2507static llvm::Value *emitListToGlobalCopyFunction(
2508 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2509 QualType ReductionArrayTy, SourceLocation Loc,
2510 const RecordDecl *TeamReductionRec,
2511 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2512 &VarFieldMap) {
2513 ASTContext &C = CGM.getContext();
2514
2515 // Buffer: global reduction buffer.
2516 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2517 C.VoidPtrTy, ImplicitParamDecl::Other);
2518 // Idx: index of the buffer.
2519 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2520 ImplicitParamDecl::Other);
2521 // ReduceList: thread local Reduce list.
2522 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2523 C.VoidPtrTy, ImplicitParamDecl::Other);
2524 FunctionArgList Args;
2525 Args.push_back(&BufferArg);
2526 Args.push_back(&IdxArg);
2527 Args.push_back(&ReduceListArg);
2528
2529 const CGFunctionInfo &CGFI =
2530 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2531 auto *Fn = llvm::Function::Create(
2532 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2533 "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
2534 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2535 Fn->setDoesNotRecurse();
2536 CodeGenFunction CGF(CGM);
2537 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2538
2539 CGBuilderTy &Bld = CGF.Builder;
2540
2541 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2542 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2543 Address LocalReduceList(
2544 Bld.CreatePointerBitCastOrAddrSpaceCast(
2545 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2546 C.VoidPtrTy, Loc),
2547 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2548 CGF.getPointerAlign());
2549 QualType StaticTy = C.getRecordType(TeamReductionRec);
2550 llvm::Type *LLVMReductionsBufferTy =
2551 CGM.getTypes().ConvertTypeForMem(StaticTy);
2552 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2553 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2554 LLVMReductionsBufferTy->getPointerTo());
2555 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2556 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2557 /*Volatile=*/false, C.IntTy,
2558 Loc)};
2559 unsigned Idx = 0;
2560 for (const Expr *Private : Privates) {
2561 // Reduce element = LocalReduceList[i]
2562 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2563 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2564 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2565 // elemptr = ((CopyType*)(elemptrptr)) + I
2566 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2567 ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
2568 Address ElemPtr =
2569 Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
2570 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2571 // Global = Buffer.VD[Idx];
2572 const FieldDecl *FD = VarFieldMap.lookup(VD);
2573 LValue GlobLVal = CGF.EmitLValueForField(
2574 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2575 Address GlobAddr = GlobLVal.getAddress(CGF);
2576 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2577 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2578 GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
2579 switch (CGF.getEvaluationKind(Private->getType())) {
2580 case TEK_Scalar: {
2581 llvm::Value *V = CGF.EmitLoadOfScalar(
2582 ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
2583 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2584 CGF.EmitStoreOfScalar(V, GlobLVal);
2585 break;
2586 }
2587 case TEK_Complex: {
2588 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
2589 CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
2590 CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
2591 break;
2592 }
2593 case TEK_Aggregate:
2594 CGF.EmitAggregateCopy(GlobLVal,
2595 CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2596 Private->getType(), AggValueSlot::DoesNotOverlap);
2597 break;
2598 }
2599 ++Idx;
2600 }
2601
2602 CGF.FinishFunction();
2603 return Fn;
2604}
2605
2606/// This function emits a helper that reduces all the reduction variables from
2607/// the team into the provided global buffer for the reduction variables.
2608///
2609/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
2610/// void *GlobPtrs[];
2611/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2612/// ...
2613/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2614/// reduce_function(GlobPtrs, reduce_data);
2615static llvm::Value *emitListToGlobalReduceFunction(
2616 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2617 QualType ReductionArrayTy, SourceLocation Loc,
2618 const RecordDecl *TeamReductionRec,
2619 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2620 &VarFieldMap,
2621 llvm::Function *ReduceFn) {
2622 ASTContext &C = CGM.getContext();
2623
2624 // Buffer: global reduction buffer.
2625 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2626 C.VoidPtrTy, ImplicitParamDecl::Other);
2627 // Idx: index of the buffer.
2628 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2629 ImplicitParamDecl::Other);
2630 // ReduceList: thread local Reduce list.
2631 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2632 C.VoidPtrTy, ImplicitParamDecl::Other);
2633 FunctionArgList Args;
2634 Args.push_back(&BufferArg);
2635 Args.push_back(&IdxArg);
2636 Args.push_back(&ReduceListArg);
2637
2638 const CGFunctionInfo &CGFI =
2639 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2640 auto *Fn = llvm::Function::Create(
2641 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2642 "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
2643 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2644 Fn->setDoesNotRecurse();
2645 CodeGenFunction CGF(CGM);
2646 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2647
2648 CGBuilderTy &Bld = CGF.Builder;
2649
2650 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2651 QualType StaticTy = C.getRecordType(TeamReductionRec);
2652 llvm::Type *LLVMReductionsBufferTy =
2653 CGM.getTypes().ConvertTypeForMem(StaticTy);
2654 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2655 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2656 LLVMReductionsBufferTy->getPointerTo());
2657
2658 // 1. Build a list of reduction variables.
2659 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2660 Address ReductionList =
2661 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2662 auto IPriv = Privates.begin();
2663 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2664 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2665 /*Volatile=*/false, C.IntTy,
2666 Loc)};
2667 unsigned Idx = 0;
2668 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2669 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2670 // Global = Buffer.VD[Idx];
2671 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2672 const FieldDecl *FD = VarFieldMap.lookup(VD);
2673 LValue GlobLVal = CGF.EmitLValueForField(
2674 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2675 Address GlobAddr = GlobLVal.getAddress(CGF);
2676 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2677 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2678 llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
2679 CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
2680 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2681 // Store array size.
2682 ++Idx;
2683 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2684 llvm::Value *Size = CGF.Builder.CreateIntCast(
2685 CGF.getVLASize(
2686 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2687 .NumElts,
2688 CGF.SizeTy, /*isSigned=*/false);
2689 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2690 Elem);
2691 }
2692 }
2693
2694 // Call reduce_function(GlobalReduceList, ReduceList)
2695 llvm::Value *GlobalReduceList =
2696 CGF.EmitCastToVoidPtr(ReductionList.getPointer());
2697 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2698 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2699 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2700 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2701 CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
2702 CGF.FinishFunction();
2703 return Fn;
2704}
2705
2706 /// This function emits a helper that copies all the reduction variables from
2707 /// the provided global buffer back into the team's reduce_data.
2708 ///
2709 /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
2710/// For all data entries D in reduce_data:
2711/// Copy buffer.D[Idx] to local D;
2712static llvm::Value *emitGlobalToListCopyFunction(
2713 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2714 QualType ReductionArrayTy, SourceLocation Loc,
2715 const RecordDecl *TeamReductionRec,
2716 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2717 &VarFieldMap) {
2718 ASTContext &C = CGM.getContext();
2719
2720 // Buffer: global reduction buffer.
2721 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2722 C.VoidPtrTy, ImplicitParamDecl::Other);
2723 // Idx: index of the buffer.
2724 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2725 ImplicitParamDecl::Other);
2726 // ReduceList: thread local Reduce list.
2727 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2728 C.VoidPtrTy, ImplicitParamDecl::Other);
2729 FunctionArgList Args;
2730 Args.push_back(&BufferArg);
2731 Args.push_back(&IdxArg);
2732 Args.push_back(&ReduceListArg);
2733
2734 const CGFunctionInfo &CGFI =
2735 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2736 auto *Fn = llvm::Function::Create(
2737 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2738 "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
2739 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2740 Fn->setDoesNotRecurse();
2741 CodeGenFunction CGF(CGM);
2742 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2743
2744 CGBuilderTy &Bld = CGF.Builder;
2745
2746 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2747 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2748 Address LocalReduceList(
2749 Bld.CreatePointerBitCastOrAddrSpaceCast(
2750 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2751 C.VoidPtrTy, Loc),
2752 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2753 CGF.getPointerAlign());
2754 QualType StaticTy = C.getRecordType(TeamReductionRec);
2755 llvm::Type *LLVMReductionsBufferTy =
2756 CGM.getTypes().ConvertTypeForMem(StaticTy);
2757 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2758 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2759 LLVMReductionsBufferTy->getPointerTo());
2760
2761 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2762 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2763 /*Volatile=*/false, C.IntTy,
2764 Loc)};
2765 unsigned Idx = 0;
2766 for (const Expr *Private : Privates) {
2767 // Reduce element = LocalReduceList[i]
2768 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2769 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2770 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2771 // elemptr = ((CopyType*)(elemptrptr)) + I
2772 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2773 ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
2774 Address ElemPtr =
2775 Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
2776 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2777 // Global = Buffer.VD[Idx];
2778 const FieldDecl *FD = VarFieldMap.lookup(VD);
2779 LValue GlobLVal = CGF.EmitLValueForField(
2780 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2781 Address GlobAddr = GlobLVal.getAddress(CGF);
2782 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2783 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2784 GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
2785 switch (CGF.getEvaluationKind(Private->getType())) {
2786 case TEK_Scalar: {
2787 llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
2788 CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
2789 LValueBaseInfo(AlignmentSource::Type),
2790 TBAAAccessInfo());
2791 break;
2792 }
2793 case TEK_Complex: {
2794 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
2795 CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2796 /*isInit=*/false);
2797 break;
2798 }
2799 case TEK_Aggregate:
2800 CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2801 GlobLVal, Private->getType(),
2802 AggValueSlot::DoesNotOverlap);
2803 break;
2804 }
2805 ++Idx;
2806 }
2807
2808 CGF.FinishFunction();
2809 return Fn;
2810}
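The helper emitted by the routine above corresponds roughly to the following C++ sketch for the two-variable (float foo, double bar) example used in the design comment further below. The record layout, the BUF_NUM extent, and all names are illustrative stand-ins; the real record is built by buildRecordForGlobalizedVars and its array extent comes from LangOpts.OpenMPCUDAReductionBufNum.

  // Hypothetical global reduction buffer: one array slot per buffer entry
  // for each reduced variable (BUF_NUM stands for OpenMPCUDAReductionBufNum).
  enum { BUF_NUM = 1024 };
  struct TeamReductionRec {
    float foo[BUF_NUM];
    double bar[BUF_NUM];
  };

  // Conceptual form of _omp_reduction_global_to_list_copy_func: copy
  // buffer.D[Idx] into the locations the thread-local reduce list points to.
  static void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data) {
    auto *Buf = static_cast<TeamReductionRec *>(buffer);
    void **RedList = static_cast<void **>(reduce_data);
    *static_cast<float *>(RedList[0]) = Buf->foo[Idx];
    *static_cast<double *>(RedList[1]) = Buf->bar[Idx];
  }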
2811
2812/// This function emits a helper that reduces the reduction variables in the
2813/// provided global buffer into the team's thread-local reduce list.
2814///
2815/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
2816/// void *GlobPtrs[];
2817/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2818/// ...
2819/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2820/// reduce_function(reduce_data, GlobPtrs);
2821static llvm::Value *emitGlobalToListReduceFunction(
2822 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2823 QualType ReductionArrayTy, SourceLocation Loc,
2824 const RecordDecl *TeamReductionRec,
2825 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2826 &VarFieldMap,
2827 llvm::Function *ReduceFn) {
2828 ASTContext &C = CGM.getContext();
2829
2830 // Buffer: global reduction buffer.
2831 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2832 C.VoidPtrTy, ImplicitParamDecl::Other);
2833 // Idx: index of the buffer.
2834 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2835 ImplicitParamDecl::Other);
2836 // ReduceList: thread local Reduce list.
2837 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2838 C.VoidPtrTy, ImplicitParamDecl::Other);
2839 FunctionArgList Args;
2840 Args.push_back(&BufferArg);
2841 Args.push_back(&IdxArg);
2842 Args.push_back(&ReduceListArg);
2843
2844 const CGFunctionInfo &CGFI =
2845 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2846 auto *Fn = llvm::Function::Create(
2847 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2848 "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
2849 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2850 Fn->setDoesNotRecurse();
2851 CodeGenFunction CGF(CGM);
2852 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2853
2854 CGBuilderTy &Bld = CGF.Builder;
2855
2856 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2857 QualType StaticTy = C.getRecordType(TeamReductionRec);
2858 llvm::Type *LLVMReductionsBufferTy =
2859 CGM.getTypes().ConvertTypeForMem(StaticTy);
2860 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2861 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2862 LLVMReductionsBufferTy->getPointerTo());
2863
2864 // 1. Build a list of reduction variables.
2865 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2866 Address ReductionList =
2867 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2868 auto IPriv = Privates.begin();
2869 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2870 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2871 /*Volatile=*/false, C.IntTy,
2872 Loc)};
2873 unsigned Idx = 0;
2874 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2875 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2876 // Global = Buffer.VD[Idx];
2877 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2878 const FieldDecl *FD = VarFieldMap.lookup(VD);
2879 LValue GlobLVal = CGF.EmitLValueForField(
2880 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2881 Address GlobAddr = GlobLVal.getAddress(CGF);
2882 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2883 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2884 llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
2885 CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
2886 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2887 // Store array size.
2888 ++Idx;
2889 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2890 llvm::Value *Size = CGF.Builder.CreateIntCast(
2891 CGF.getVLASize(
2892 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2893 .NumElts,
2894 CGF.SizeTy, /*isSigned=*/false);
2895 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2896 Elem);
2897 }
2898 }
2899
2900 // Call reduce_function(ReduceList, GlobalReduceList)
2901 llvm::Value *GlobalReduceList =
2902 CGF.EmitCastToVoidPtr(ReductionList.getPointer());
2903 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2904 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2905 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2906 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2907 CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
2908 CGF.FinishFunction();
2909 return Fn;
2910}
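Likewise, the reduce variant above can be pictured as the following sketch, reusing the hypothetical TeamReductionRec layout from the previous sketch; reduce_function stands for the compiler-generated combiner for the reduction clauses.

  extern void reduce_function(void *reduce_data, void *rhs_ptrs);

  // Conceptual form of _omp_reduction_global_to_list_reduce_func: gather
  // pointers into the global buffer at slot Idx and reduce them into the
  // thread-local reduce list.
  static void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data) {
    auto *Buf = static_cast<TeamReductionRec *>(buffer);
    void *GlobPtrs[2];
    GlobPtrs[0] = &Buf->foo[Idx];            // (void*)&buffer.D0[Idx]
    GlobPtrs[1] = &Buf->bar[Idx];            // (void*)&buffer.DN[Idx]
    reduce_function(reduce_data, GlobPtrs);  // reduce_data[i] op= *GlobPtrs[i]
  }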
2911
2912///
2913/// Design of OpenMP reductions on the GPU
2914///
2915/// Consider a typical OpenMP program with one or more reduction
2916/// clauses:
2917///
2918/// float foo;
2919/// double bar;
2920/// #pragma omp target teams distribute parallel for \
2921/// reduction(+:foo) reduction(*:bar)
2922/// for (int i = 0; i < N; i++) {
2923/// foo += A[i]; bar *= B[i];
2924/// }
2925///
2926/// where 'foo' and 'bar' are reduced across all OpenMP threads in
2927/// all teams. In our OpenMP implementation on the NVPTX device an
2928/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
2929/// within a team are mapped to CUDA threads within a threadblock.
2930/// Our goal is to efficiently aggregate values across all OpenMP
2931/// threads such that:
2932///
2933/// - the compiler and runtime are logically concise, and
2934/// - the reduction is performed efficiently in a hierarchical
2935/// manner as follows: within OpenMP threads in the same warp,
2936/// across warps in a threadblock, and finally across teams on
2937/// the NVPTX device.
2938///
2939/// Introduction to Decoupling
2940///
2941/// We would like to decouple the compiler and the runtime so that the
2942/// latter is ignorant of the reduction variables (number, data types)
2943/// and the reduction operators. This allows a simpler interface
2944/// and implementation while still attaining good performance.
2945///
2946/// Pseudocode for the aforementioned OpenMP program generated by the
2947/// compiler is as follows:
2948///
2949/// 1. Create private copies of reduction variables on each OpenMP
2950/// thread: 'foo_private', 'bar_private'
2951/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
2952/// to it and writes the result in 'foo_private' and 'bar_private'
2953/// respectively.
2954/// 3. Call the OpenMP runtime on the GPU to reduce within a team
2955/// and store the result on the team master:
2956///
2957/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
2958/// reduceData, shuffleReduceFn, interWarpCpyFn)
2959///
2960/// where:
2961/// struct ReduceData {
2962/// float *foo;
2963/// double *bar;
2964/// } reduceData
2965/// reduceData.foo = &foo_private
2966/// reduceData.bar = &bar_private
2967///
2968/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
2969/// auxiliary functions generated by the compiler that operate on
2970/// variables of type 'ReduceData'. They help the runtime perform
2971/// algorithmic steps in a data-agnostic manner.
2972///
2973/// 'shuffleReduceFn' is a pointer to a function that reduces data
2974/// of type 'ReduceData' across two OpenMP threads (lanes) in the
2975/// same warp. It takes the following arguments as input:
2976///
2977/// a. variable of type 'ReduceData' on the calling lane,
2978/// b. its lane_id,
2979/// c. an offset relative to the current lane_id to generate a
2980/// remote_lane_id. The remote lane contains the second
2981/// variable of type 'ReduceData' that is to be reduced.
2982/// d. an algorithm version parameter determining which reduction
2983/// algorithm to use.
2984///
2985/// 'shuffleReduceFn' retrieves data from the remote lane using
2986/// efficient GPU shuffle intrinsics and reduces, using the
2987/// algorithm specified by the 4th parameter, the two operands
2988/// element-wise. The result is written to the first operand.
2989///
2990/// Different reduction algorithms are implemented in different
2991/// runtime functions, all calling 'shuffleReduceFn' to perform
2992/// the essential reduction step. Therefore, based on the 4th
2993/// parameter, this function behaves slightly differently to
2994/// cooperate with the runtime to ensure correctness under
2995/// different circumstances.
2996///
2997/// 'InterWarpCpyFn' is a pointer to a function that transfers
2998/// reduced variables across warps. It tunnels, through CUDA
2999/// shared memory, the thread-private data of type 'ReduceData'
3000/// from lane 0 of each warp to a lane in the first warp.
3001/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
3002/// The last team writes the global reduced value to memory.
3003///
3004/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
3005/// reduceData, shuffleReduceFn, interWarpCpyFn,
3006/// scratchpadCopyFn, loadAndReduceFn)
3007///
3008/// 'scratchpadCopyFn' is a helper that stores reduced
3009/// data from the team master to a scratchpad array in
3010/// global memory.
3011///
3012/// 'loadAndReduceFn' is a helper that loads data from
3013/// the scratchpad array and reduces it with the input
3014/// operand.
3015///
3016/// These compiler generated functions hide address
3017/// calculation and alignment information from the runtime.
3018/// 5. if ret == 1:
3019/// The team master of the last team stores the reduced
3020/// result to the globals in memory.
3021/// foo += reduceData.foo; bar *= reduceData.bar
3022///
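Put together, steps 1-5 amount to code of roughly the following shape for the example program. This is an illustrative pseudo-C++ consolidation only: the names follow the description above, lb/ub stand for this thread's chunk bounds, and the real code is emitted as LLVM IR with the elided runtime arguments filled in by the codegen in this file.

  float foo_private = 0.0f;                       // step 1: private copies
  double bar_private = 1.0;
  for (int i = lb; i <= ub; ++i) {                // step 2: reduce own chunk
    foo_private += A[i];
    bar_private *= B[i];
  }
  ReduceData reduceData = {&foo_private, &bar_private};
  __kmpc_nvptx_parallel_reduce_nowait_v2(...,     // step 3: intra-team
      &reduceData, shuffleReduceFn, interWarpCpyFn);
  int ret = __kmpc_nvptx_teams_reduce_nowait(..., // step 4: inter-team
      &reduceData, shuffleReduceFn, interWarpCpyFn,
      scratchpadCopyFn, loadAndReduceFn);
  if (ret == 1) {                                 // step 5: last team's master
    foo += *reduceData.foo;
    bar *= *reduceData.bar;
  }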
3023///
3024/// Warp Reduction Algorithms
3025///
3026/// On the warp level, we have three algorithms implemented in the
3027/// OpenMP runtime depending on the number of active lanes:
3028///
3029/// Full Warp Reduction
3030///
3031/// The reduce algorithm within a warp where all lanes are active
3032/// is implemented in the runtime as follows:
3033///
3034/// full_warp_reduce(void *reduce_data,
3035/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3036/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
3037/// ShuffleReduceFn(reduce_data, 0, offset, 0);
3038/// }
3039///
3040/// The algorithm completes in log(2, WARPSIZE) steps.
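For WARPSIZE = 32, for example, this is five iterations of the loop above, with offset = 16, 8, 4, 2, 1.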
3041///
3042/// 'ShuffleReduceFn' is called here with lane_id set to 0 because lane_id is
3043/// not used in this mode; this saves instructions by not retrieving lane_id
3044/// from the corresponding special registers. The 4th parameter, which
3045/// represents the version of the algorithm being used, is set to 0 to
3046/// signify full warp reduction.
3047///
3048/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3049///
3050/// #reduce_elem refers to an element in the local lane's data structure
3051/// #remote_elem is retrieved from a remote lane
3052/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3053/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
3054///
3055/// Contiguous Partial Warp Reduction
3056///
3057/// This reduce algorithm is used within a warp where only the first
3058/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
3059/// number of OpenMP threads in a parallel region is not a multiple of
3060/// WARPSIZE. The algorithm is implemented in the runtime as follows:
3061///
3062/// void
3063/// contiguous_partial_reduce(void *reduce_data,
3064/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
3065/// int size, int lane_id) {
3066/// int curr_size;
3067/// int offset;
3068/// curr_size = size;
3069/// offset = curr_size/2;
3070/// while (offset>0) {
3071/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
3072/// curr_size = (curr_size+1)/2;
3073/// offset = curr_size/2;
3074/// }
3075/// }
3076///
3077/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3078///
3079/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3080/// if (lane_id < offset)
3081/// reduce_elem = reduce_elem REDUCE_OP remote_elem
3082/// else
3083/// reduce_elem = remote_elem
3084///
3085/// This algorithm assumes that the data to be reduced are located in a
3086/// contiguous subset of lanes starting from the first. When there is
3087/// an odd number of active lanes, the data in the last lane is not
3088/// aggregated with any other lane's data but is instead copied over.
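For example, with size = 7 the loop performs three shuffle/reduce steps with offset = 3, 2, 1 while curr_size shrinks 7 -> 4 -> 2 -> 1, so the step count stays logarithmic in the number of active lanes.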
3089///
3090/// Dispersed Partial Warp Reduction
3091///
3092/// This algorithm is used within a warp when any discontiguous subset of
3093/// lanes is active. It is used to implement the reduction operation
3094/// across lanes in an OpenMP simd region or in a nested parallel region.
3095///
3096/// void
3097/// dispersed_partial_reduce(void *reduce_data,
3098/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3099/// int size, remote_id;
3100/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
3101/// do {
3102/// remote_id = next_active_lane_id_right_after_me();
3103/// # the above function returns 0 if no active lane
3104/// # is present right after the current lane.
3105/// size = number_of_active_lanes_in_this_warp();
3106/// logical_lane_id /= 2;
3107/// ShuffleReduceFn(reduce_data, logical_lane_id,
3108/// remote_id-1-threadIdx.x, 2);
3109/// } while (logical_lane_id % 2 == 0 && size > 1);
3110/// }
3111///
3112/// There is no assumption made about the initial state of the reduction.
3113/// Any number of lanes (>=1) could be active at any position. The reduction
3114/// result is returned in the first active lane.
3115///
3116/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3117///
3118/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3119/// if (lane_id % 2 == 0 && offset > 0)
3120/// reduce_elem = reduce_elem REDUCE_OP remote_elem
3121/// else
3122/// reduce_elem = remote_elem
3123///
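Folding the three per-element behaviours together, the generated 'shuffleReduceFn' can be pictured as one helper that switches on the algorithm-version argument. This is a hedged C++ sketch for the float/double example; ReduceData is the struct shown earlier in this comment, and shuffle_down/WARPSIZE stand in for the GPU shuffle intrinsic (e.g. CUDA's __shfl_down_sync) and the warp width.

  struct ReduceData { float *foo; double *bar; };
  enum { WARPSIZE = 32 };
  template <class T> T shuffle_down(T val, int offset, int width); // intrinsic stand-in

  static void shuffleReduceFn(ReduceData *lhs, short lane_id, short offset,
                              short version) {
    float remote_foo = shuffle_down(*lhs->foo, offset, WARPSIZE);
    double remote_bar = shuffle_down(*lhs->bar, offset, WARPSIZE);
    bool do_reduce;
    if (version == 0)                         // full warp reduction
      do_reduce = true;
    else if (version == 1)                    // contiguous partial reduction
      do_reduce = lane_id < offset;
    else                                      // version == 2: dispersed partial
      do_reduce = lane_id % 2 == 0 && offset > 0;
    if (do_reduce) {
      *lhs->foo += remote_foo;                // REDUCE_OP for foo is '+'
      *lhs->bar *= remote_bar;                // REDUCE_OP for bar is '*'
    } else {
      *lhs->foo = remote_foo;                 // otherwise copy the remote element
      *lhs->bar = remote_bar;
    }
  }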
3124///
3125/// Intra-Team Reduction
3126///
3127/// This function, as implemented in the runtime call
3128/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
3129/// threads in a team. It first reduces within a warp using the
3130/// aforementioned algorithms. We then proceed to gather all such
3131/// reduced values at the first warp.
3132///
3133/// The runtime makes use of the function 'InterWarpCpyFn', which copies
3134/// data from each of the "warp masters" (the zeroth lane of each warp, where
3135/// warp-reduced data is held) to the zeroth warp. This step reduces (in
3136/// a mathematical sense) the problem of reduction across warp masters in
3137/// a block to the problem of warp reduction.
3138///
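The per-element transfer 'InterWarpCpyFn' performs can be sketched as CUDA-style device code for a single 32-bit element; this is illustrative only, since the real helper is emitted as IR, iterates over the elements of ReduceData in 32-bit chunks, and uses a runtime-coordinated barrier.

  __device__ void inter_warp_copy_elem(unsigned &elem, int lane_id, int warp_id,
                                       int num_warps) {
    __shared__ unsigned transfer_medium[32];   // one slot per warp master
    if (lane_id == 0)                          // each warp master publishes its value
      transfer_medium[warp_id] = elem;
    __syncthreads();
    if (warp_id == 0 && lane_id < num_warps)   // lanes of the first warp gather
      elem = transfer_medium[lane_id];
    __syncthreads();
  }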
3139///
3140/// Inter-Team Reduction
3141///
3142/// Once a team has reduced its data to a single value, it is stored in
3143/// a global scratchpad array. Since each team has a distinct slot, this
3144/// can be done without locking.
3145///
3146/// The last team to write to the scratchpad array proceeds to reduce the
3147/// scratchpad array. One or more workers in the last team use the helper
3148/// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
3149/// the k'th worker reduces every k'th element.
3150///
3151/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
3152/// reduce across workers and compute a globally reduced value.
3153///
3154void CGOpenMPRuntimeGPU::emitReduction(
3155 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
3156 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
3157 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
3158 if (!CGF.HaveInsertPoint())
1. Taking false branch
3159 return;
3160
3161 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
3162#ifndef NDEBUG
3163 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
3164#endif
3165
3166 if (Options.SimpleReduction) {
2. Assuming field 'SimpleReduction' is false
3. Taking false branch
3167 assert(!TeamsReduction && !ParallelReduction &&(static_cast<void> (0))
3168 "Invalid reduction selection in emitReduction.")(static_cast<void> (0));
3169 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
3170 ReductionOps, Options);
3171 return;
3172 }
3173
3174 assert((TeamsReduction || ParallelReduction) &&(static_cast<void> (0))
3175 "Invalid reduction selection in emitReduction.")(static_cast<void> (0));
3176
3177 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3178 // RedList, shuffle_reduce_func, interwarp_copy_func);
3179 // or
3180 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3181 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
3182 llvm::Value *ThreadId = getThreadID(CGF, Loc);
3183
3184 llvm::Value *Res;
3185 ASTContext &C = CGM.getContext();
3186 // 1. Build a list of reduction variables.
3187 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3188 auto Size = RHSExprs.size();
3189 for (const Expr *E : Privates) {
4. Assuming '__begin1' is equal to '__end1'
3190 if (E->getType()->isVariablyModifiedType())
3191 // Reserve place for array size.
3192 ++Size;
3193 }
3194 llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
3195 QualType ReductionArrayTy =
3196 C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
3197 /*IndexTypeQuals=*/0);
3198 Address ReductionList =
3199 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3200 auto IPriv = Privates.begin();
3201 unsigned Idx = 0;
3202 for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
5. Assuming 'I' is >= 'E'
6. Loop condition is false. Execution continues on line 3222
3203 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3204 CGF.Builder.CreateStore(
3205 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3206 CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
3207 Elem);
3208 if ((*IPriv)->getType()->isVariablyModifiedType()) {
3209 // Store array size.
3210 ++Idx;
3211 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3212 llvm::Value *Size = CGF.Builder.CreateIntCast(
3213 CGF.getVLASize(
3214 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3215 .NumElts,
3216 CGF.SizeTy, /*isSigned=*/false);
3217 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3218 Elem);
3219 }
3220 }
3221
3222 llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3223 ReductionList.getPointer(), CGF.VoidPtrTy);
3224 llvm::Function *ReductionFn = emitReductionFunction(
3225 Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
3226 LHSExprs, RHSExprs, ReductionOps);
3227 llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
3228 llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
3229 CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
3230 llvm::Value *InterWarpCopyFn =
3231 emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
7. Calling 'emitInterWarpCopyFunction'
3232
3233 if (ParallelReduction) {
3234 llvm::Value *Args[] = {RTLoc,
3235 ThreadId,
3236 CGF.Builder.getInt32(RHSExprs.size()),
3237 ReductionArrayTySize,
3238 RL,
3239 ShuffleAndReduceFn,
3240 InterWarpCopyFn};
3241
3242 Res = CGF.EmitRuntimeCall(
3243 OMPBuilder.getOrCreateRuntimeFunction(
3244 CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
3245 Args);
3246 } else {
3247 assert(TeamsReduction && "expected teams reduction.")(static_cast<void> (0));
3248 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
3249 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
3250 int Cnt = 0;
3251 for (const Expr *DRE : Privates) {
3252 PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
3253 ++Cnt;
3254 }
3255 const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
3256 CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
3257 C.getLangOpts().OpenMPCUDAReductionBufNum);
3258 TeamsReductions.push_back(TeamReductionRec);
3259 if (!KernelTeamsReductionPtr) {
3260 KernelTeamsReductionPtr = new llvm::GlobalVariable(
3261 CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
3262 llvm::GlobalValue::InternalLinkage, nullptr,
3263 "_openmp_teams_reductions_buffer_$_$ptr");
3264 }
3265 llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
3266 Address(KernelTeamsReductionPtr, CGM.getPointerAlign()),
3267 /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
3268 llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
3269 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3270 llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
3271 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3272 ReductionFn);
3273 llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
3274 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3275 llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
3276 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3277 ReductionFn);
3278
3279 llvm::Value *Args[] = {
3280 RTLoc,
3281 ThreadId,
3282 GlobalBufferPtr,
3283 CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
3284 RL,
3285 ShuffleAndReduceFn,
3286 InterWarpCopyFn,
3287 GlobalToBufferCpyFn,
3288 GlobalToBufferRedFn,
3289 BufferToGlobalCpyFn,
3290 BufferToGlobalRedFn};
3291
3292 Res = CGF.EmitRuntimeCall(
3293 OMPBuilder.getOrCreateRuntimeFunction(
3294 CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
3295 Args);
3296 }
3297
3298 // 5. Build if (res == 1)
3299 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
3300 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
3301 llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
3302 Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
3303 CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3304
3305 // 6. Build then branch: where we have reduced values in the master
3306 // thread in each team.
3307 // __kmpc_end_reduce{_nowait}(<gtid>);
3308 // break;
3309 CGF.EmitBlock(ThenBB);
3310
3311 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3312 auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
3313 this](CodeGenFunction &CGF, PrePostActionTy &Action) {
3314 auto IPriv = Privates.begin();
3315 auto ILHS = LHSExprs.begin();
3316 auto IRHS = RHSExprs.begin();
3317 for (const Expr *E : ReductionOps) {
3318 emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
3319 cast<DeclRefExpr>(*IRHS));
3320 ++IPriv;
3321 ++ILHS;
3322 ++IRHS;
3323 }
3324 };
3325 llvm::Value *EndArgs[] = {ThreadId};
3326 RegionCodeGenTy RCG(CodeGen);
3327 NVPTXActionTy Action(
3328 nullptr, llvm::None,
3329 OMPBuilder.getOrCreateRuntimeFunction(
3330 CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
3331 EndArgs);
3332 RCG.setAction(Action);
3333 RCG(CGF);
3334 // There is no need to emit line number for unconditional branch.
3335 (void)ApplyDebugLocation::CreateEmpty(CGF);
3336 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
3337}
3338
3339const VarDecl *
3340CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
3341 const VarDecl *NativeParam) const {
3342 if (!NativeParam->getType()->isReferenceType())
3343 return NativeParam;
3344 QualType ArgType = NativeParam->getType();
3345 QualifierCollector QC;
3346 const Type *NonQualTy = QC.strip(ArgType);
3347 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3348 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
3349 if (Attr->getCaptureKind() == OMPC_map) {
3350 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3351 LangAS::opencl_global);
3352 }
3353 }
3354 ArgType = CGM.getContext().getPointerType(PointeeTy);
3355 QC.addRestrict();
3356 enum { NVPTX_local_addr = 5 };
3357 QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
3358 ArgType = QC.apply(CGM.getContext(), ArgType);
3359 if (isa<ImplicitParamDecl>(NativeParam))
3360 return ImplicitParamDecl::Create(
3361 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3362 NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
3363 return ParmVarDecl::Create(
3364 CGM.getContext(),
3365 const_cast<DeclContext *>(NativeParam->getDeclContext()),
3366 NativeParam->getBeginLoc(), NativeParam->getLocation(),
3367 NativeParam->getIdentifier(), ArgType,
3368 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3369}
3370
3371Address
3372CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
3373 const VarDecl *NativeParam,
3374 const VarDecl *TargetParam) const {
3375 assert(NativeParam != TargetParam &&(static_cast<void> (0))
3376 NativeParam->getType()->isReferenceType() &&(static_cast<void> (0))
3377 "Native arg must not be the same as target arg.")(static_cast<void> (0));
3378 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
3379 QualType NativeParamType = NativeParam->getType();
3380 QualifierCollector QC;
3381 const Type *NonQualTy = QC.strip(NativeParamType);
3382 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3383 unsigned NativePointeeAddrSpace =
3384 CGF.getContext().getTargetAddressSpace(NativePointeeTy);
3385 QualType TargetTy = TargetParam->getType();
3386 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
3387 LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
3388 // First cast to generic.
3389 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3390 TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
3391 /*AddrSpace=*/0));
3392 // Cast from generic to native address space.
3393 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3394 TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
3395 NativePointeeAddrSpace));
3396 Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3397 CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
3398 NativeParamType);
3399 return NativeParamAddr;
3400}
3401
3402void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3403 CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3404 ArrayRef<llvm::Value *> Args) const {
3405 SmallVector<llvm::Value *, 4> TargetArgs;
3406 TargetArgs.reserve(Args.size());
3407 auto *FnType = OutlinedFn.getFunctionType();
3408 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3409 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3410 TargetArgs.append(std::next(Args.begin(), I), Args.end());
3411 break;
3412 }
3413 llvm::Type *TargetType = FnType->getParamType(I);
3414 llvm::Value *NativeArg = Args[I];
3415 if (!TargetType->isPointerTy()) {
3416 TargetArgs.emplace_back(NativeArg);
3417 continue;
3418 }
3419 llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3420 NativeArg,
3421 NativeArg->getType()->getPointerElementType()->getPointerTo());
3422 TargetArgs.emplace_back(
3423 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
3424 }
3425 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
3426}
3427
3428/// Emit a function which wraps the outlined parallel region
3429/// and controls the arguments which are passed to this function.
3430/// The wrapper ensures that the outlined function is called
3431/// with the correct arguments when data is shared.
3432llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3433 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
3434 ASTContext &Ctx = CGM.getContext();
3435 const auto &CS = *D.getCapturedStmt(OMPD_parallel);
3436
3437 // Create a function that takes as argument the source thread.
3438 FunctionArgList WrapperArgs;
3439 QualType Int16QTy =
3440 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
3441 QualType Int32QTy =
3442 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
3443 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3444 /*Id=*/nullptr, Int16QTy,
3445 ImplicitParamDecl::Other);
3446 ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3447 /*Id=*/nullptr, Int32QTy,
3448 ImplicitParamDecl::Other);
3449 WrapperArgs.emplace_back(&ParallelLevelArg);
3450 WrapperArgs.emplace_back(&WrapperArg);
3451
3452 const CGFunctionInfo &CGFI =
3453 CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
3454
3455 auto *Fn = llvm::Function::Create(
3456 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3457 Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
3458
3459 // Ensure we do not inline the function. This is trivially true for the ones
3460 // passed to __kmpc_fork_call but the ones called in serialized regions
3461 // could be inlined. This is not perfect but it is closer to the invariant
3462 // we want, namely, every data environment starts with a new function.
3463 // TODO: We should pass the if condition to the runtime function and do the
3464 // handling there. Much cleaner code.
3465 Fn->addFnAttr(llvm::Attribute::NoInline);
3466
3467 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3468 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
3469 Fn->setDoesNotRecurse();
3470
3471 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
3472 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
3473 D.getBeginLoc(), D.getBeginLoc());
3474
3475 const auto *RD = CS.getCapturedRecordDecl();
3476 auto CurField = RD->field_begin();
3477
3478 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
3479 /*Name=*/".zero.addr");
3480 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
3481 // Get the array of arguments.
3482 SmallVector<llvm::Value *, 8> Args;
3483
3484 Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
3485 Args.emplace_back(ZeroAddr.getPointer());
3486
3487 CGBuilderTy &Bld = CGF.Builder;
3488 auto CI = CS.capture_begin();
3489
3490 // Use global memory for data sharing.
3491 // Handle passing of global args to workers.
3492 Address GlobalArgs =
3493 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
3494 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
3495 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3496 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3497 CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
3498 DataSharingArgs);
3499
3500 // Retrieve the shared variables from the list of references returned
3501 // by the runtime. Pass the variables to the outlined function.
3502 Address SharedArgListAddress = Address::invalid();
3503 if (CS.capture_size() > 0 ||
3504 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3505 SharedArgListAddress = CGF.EmitLoadOfPointer(
3506 GlobalArgs, CGF.getContext()
3507 .getPointerType(CGF.getContext().getPointerType(
3508 CGF.getContext().VoidPtrTy))
3509 .castAs<PointerType>());
3510 }
3511 unsigned Idx = 0;
3512 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3513 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3514 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3515 Src, CGF.SizeTy->getPointerTo());
3516 llvm::Value *LB = CGF.EmitLoadOfScalar(
3517 TypedAddress,
3518 /*Volatile=*/false,
3519 CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
3520 cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
3521 Args.emplace_back(LB);
3522 ++Idx;
3523 Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3524 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3525 Src, CGF.SizeTy->getPointerTo());
3526 llvm::Value *UB = CGF.EmitLoadOfScalar(
3527 TypedAddress,
3528 /*Volatile=*/false,
3529 CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
3530 cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
3531 Args.emplace_back(UB);
3532 ++Idx;
3533 }
3534 if (CS.capture_size() > 0) {
3535 ASTContext &CGFContext = CGF.getContext();
3536 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3537 QualType ElemTy = CurField->getType();
3538 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
3539 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3540 Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
3541 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3542 /*Volatile=*/false,
3543 CGFContext.getPointerType(ElemTy),
3544 CI->getLocation());
3545 if (CI->capturesVariableByCopy() &&
3546 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
3547 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3548 CI->getLocation());
3549 }
3550 Args.emplace_back(Arg);
3551 }
3552 }
3553
3554 emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
3555 CGF.FinishFunction();
3556 return Fn;
3557}
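The wrapper built here has roughly the following C++ shape (hypothetical names; the real function is emitted as IR, and the trailing arguments depend on whether loop bounds are shared and on the captured variables of the directive).

  extern "C" void __kmpc_get_shared_variables(void ***GlobalArgs);
  extern void outlined_parallel_fn(unsigned *global_tid, unsigned *bound_tid,
                                   void *captured0 /*, ...*/);

  static void outlined_parallel_fn_wrapper(unsigned short ParallelLevel,
                                           unsigned WrapperTID) {
    (void)ParallelLevel;
    unsigned Zero = 0;
    void **GlobalArgs = nullptr;
    __kmpc_get_shared_variables(&GlobalArgs);   // shared-variable list from runtime
    // Loop bounds (if the directive shares them) come first, then one entry per
    // captured variable, each retrieved from GlobalArgs[i].
    outlined_parallel_fn(&WrapperTID, &Zero, GlobalArgs[0] /*, ...*/);
  }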
3558
3559void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
3560 const Decl *D) {
3561 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3562 return;
3563
3564 assert(D && "Expected function or captured|block decl.")(static_cast<void> (0));
3565 assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&(static_cast<void> (0))
3566 "Function is registered already.")(static_cast<void> (0));
3567 assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&(static_cast<void> (0))
3568 "Team is set but not processed.")(static_cast<void> (0));
3569 const Stmt *Body = nullptr;
3570 bool NeedToDelayGlobalization = false;
3571 if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
3572 Body = FD->getBody();
3573 } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
3574 Body = BD->getBody();
3575 } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
3576 Body = CD->getBody();
3577 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
3578 if (NeedToDelayGlobalization &&
3579 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
3580 return;
3581 }
3582 if (!Body)
3583 return;
3584 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
3585 VarChecker.Visit(Body);
3586 const RecordDecl *GlobalizedVarsRecord =
3587 VarChecker.getGlobalizedRecord(IsInTTDRegion);
3588 TeamAndReductions.first = nullptr;
3589 TeamAndReductions.second.clear();
3590 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
3591 VarChecker.getEscapedVariableLengthDecls();
3592 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
3593 return;
3594 auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
3595 I->getSecond().MappedParams =
3596 std::make_unique<CodeGenFunction::OMPMapVars>();
3597 I->getSecond().EscapedParameters.insert(
3598 VarChecker.getEscapedParameters().begin(),
3599 VarChecker.getEscapedParameters().end());
3600 I->getSecond().EscapedVariableLengthDecls.append(
3601 EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
3602 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3603 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3604 assert(VD->isCanonicalDecl() && "Expected canonical declaration")(static_cast<void> (0));
3605 Data.insert(std::make_pair(VD, MappedVarData()));
3606 }
3607 if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
3608 CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
3609 VarChecker.Visit(Body);
3610 I->getSecond().SecondaryLocalVarData.emplace();
3611 DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
3612 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3613 assert(VD->isCanonicalDecl() && "Expected canonical declaration")(static_cast<void> (0));
3614 Data.insert(std::make_pair(VD, MappedVarData()));
3615 }
3616 }
3617 if (!NeedToDelayGlobalization) {
3618 emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
3619 struct GlobalizationScope final : EHScopeStack::Cleanup {
3620 GlobalizationScope() = default;
3621
3622 void Emit(CodeGenFunction &CGF, Flags flags) override {
3623 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
3624 .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
3625 }
3626 };
3627 CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
3628 }
3629}
3630
3631Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
3632 const VarDecl *VD) {
3633 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
3634 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3635 auto AS = LangAS::Default;
3636 switch (A->getAllocatorType()) {
3637 // Use the default allocator here as by default local vars are
3638 // threadlocal.
3639 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3640 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3641 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3642 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3643 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3644 // Follow the user decision - use default allocation.
3645 return Address::invalid();
3646 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3647 // TODO: implement support for user-defined allocators.
3648 return Address::invalid();
3649 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3650 AS = LangAS::cuda_constant;
3651 break;
3652 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3653 AS = LangAS::cuda_shared;
3654 break;
3655 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3656 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3657 break;
3658 }
3659 llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
3660 auto *GV = new llvm::GlobalVariable(
3661 CGM.getModule(), VarTy, /*isConstant=*/false,
3662 llvm::GlobalValue::InternalLinkage, llvm::Constant::getNullValue(VarTy),
3663 VD->getName(),
3664 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
3665 CGM.getContext().getTargetAddressSpace(AS));
3666 CharUnits Align = CGM.getContext().getDeclAlign(VD);
3667 GV->setAlignment(Align.getAsAlign());
3668 return Address(
3669 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3670 GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
3671 VD->getType().getAddressSpace()))),
3672 Align);
3673 }
3674
3675 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3676 return Address::invalid();
3677
3678 VD = VD->getCanonicalDecl();
3679 auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
3680 if (I == FunctionGlobalizedDecls.end())
3681 return Address::invalid();
3682 auto VDI = I->getSecond().LocalVarData.find(VD);
3683 if (VDI != I->getSecond().LocalVarData.end())
3684 return VDI->second.PrivateAddr;
3685 if (VD->hasAttrs()) {
3686 for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
3687 E(VD->attr_end());
3688 IT != E; ++IT) {
3689 auto VDI = I->getSecond().LocalVarData.find(
3690 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3691 ->getCanonicalDecl());
3692 if (VDI != I->getSecond().LocalVarData.end())
3693 return VDI->second.PrivateAddr;
3694 }
3695 }
3696
3697 return Address::invalid();
3698}
3699
3700void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
3701 FunctionGlobalizedDecls.erase(CGF.CurFn);
3702 CGOpenMPRuntime::functionFinished(CGF);
3703}
3704
3705void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
3706 CodeGenFunction &CGF, const OMPLoopDirective &S,
3707 OpenMPDistScheduleClauseKind &ScheduleKind,
3708 llvm::Value *&Chunk) const {
3709 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
3710 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
3711 ScheduleKind = OMPC_DIST_SCHEDULE_static;
3712 Chunk = CGF.EmitScalarConversion(
3713 RT.getGPUNumThreads(CGF),
3714 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3715 S.getIterationVariable()->getType(), S.getBeginLoc());
3716 return;
3717 }
3718 CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
3719 CGF, S, ScheduleKind, Chunk);
3720}
3721
3722void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
3723 CodeGenFunction &CGF, const OMPLoopDirective &S,
3724 OpenMPScheduleClauseKind &ScheduleKind,
3725 const Expr *&ChunkExpr) const {
3726 ScheduleKind = OMPC_SCHEDULE_static;
3727 // Chunk size is 1 in this case.
3728 llvm::APInt ChunkSize(32, 1);
3729 ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
3730 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3731 SourceLocation());
3732}
3733
3734void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
3735 CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
3736 assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&(static_cast<void> (0))
3737 " Expected target-based directive.")(static_cast<void> (0));
3738 const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
3739 for (const CapturedStmt::Capture &C : CS->captures()) {
3740 // Capture variables captured by reference in lambdas for target-based
3741 // directives.
3742 if (!C.capturesVariable())
3743 continue;
3744 const VarDecl *VD = C.getCapturedVar();
3745 const auto *RD = VD->getType()
3746 .getCanonicalType()
3747 .getNonReferenceType()
3748 ->getAsCXXRecordDecl();
3749 if (!RD || !RD->isLambda())
3750 continue;
3751 Address VDAddr = CGF.GetAddrOfLocalVar(VD);
3752 LValue VDLVal;
3753 if (VD->getType().getCanonicalType()->isReferenceType())
3754 VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
3755 else
3756 VDLVal = CGF.MakeAddrLValue(
3757 VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
3758 llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
3759 FieldDecl *ThisCapture = nullptr;
3760 RD->getCaptureFields(Captures, ThisCapture);
3761 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
3762 LValue ThisLVal =
3763 CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
3764 llvm::Value *CXXThis = CGF.LoadCXXThis();
3765 CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
3766 }
3767 for (const LambdaCapture &LC : RD->captures()) {
3768 if (LC.getCaptureKind() != LCK_ByRef)
3769 continue;
3770 const VarDecl *VD = LC.getCapturedVar();
3771 if (!CS->capturesVariable(VD))
3772 continue;
3773 auto It = Captures.find(VD);
3774 assert(It != Captures.end() && "Found lambda capture without field.")(static_cast<void> (0));
3775 LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
3776 Address VDAddr = CGF.GetAddrOfLocalVar(VD);
3777 if (VD->getType().getCanonicalType()->isReferenceType())
3778 VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
3779 VD->getType().getCanonicalType())
3780 .getAddress(CGF);
3781 CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
3782 }
3783 }
3784}
3785
3786bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
3787 LangAS &AS) {
3788 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
3789 return false;
3790 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3791 switch(A->getAllocatorType()) {
3792 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3793 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3794 // Not supported, fallback to the default mem space.
3795 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3796 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3797 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3798 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3799 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3800 AS = LangAS::Default;
3801 return true;
3802 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3803 AS = LangAS::cuda_constant;
3804 return true;
3805 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3806 AS = LangAS::cuda_shared;
3807 return true;
3808 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3809 llvm_unreachable("Expected predefined allocator for the variables with the "__builtin_unreachable()
3810 "static storage.")__builtin_unreachable();
3811 }
3812 return false;
3813}
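For context, this mapping is what routes a global declared with the OpenMP allocate directive into the corresponding GPU memory; the snippet below is an illustrative use of the predefined omp_pteam_mem_alloc allocator, which corresponds to OMPPTeamMemAlloc above and lands in CUDA shared memory.

  static int team_scratch[64];
  #pragma omp allocate(team_scratch) allocator(omp_pteam_mem_alloc)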
3814
3815// Get current CudaArch and ignore any unknown values
3816static CudaArch getCudaArch(CodeGenModule &CGM) {
3817 if (!CGM.getTarget().hasFeature("ptx"))
3818 return CudaArch::UNKNOWN;
3819 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
3820 if (Feature.getValue()) {
3821 CudaArch Arch = StringToCudaArch(Feature.getKey());
3822 if (Arch != CudaArch::UNKNOWN)
3823 return Arch;
3824 }
3825 }
3826 return CudaArch::UNKNOWN;
3827}
3828
3829/// Check to see if target architecture supports unified addressing which is
3830/// a restriction for OpenMP requires clause "unified_shared_memory".
3831void CGOpenMPRuntimeGPU::processRequiresDirective(
3832 const OMPRequiresDecl *D) {
3833 for (const OMPClause *Clause : D->clauselists()) {
3834 if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
3835 CudaArch Arch = getCudaArch(CGM);
3836 switch (Arch) {
3837 case CudaArch::SM_20:
3838 case CudaArch::SM_21:
3839 case CudaArch::SM_30:
3840 case CudaArch::SM_32:
3841 case CudaArch::SM_35:
3842 case CudaArch::SM_37:
3843 case CudaArch::SM_50:
3844 case CudaArch::SM_52:
3845 case CudaArch::SM_53: {
3846 SmallString<256> Buffer;
3847 llvm::raw_svector_ostream Out(Buffer);
3848 Out << "Target architecture " << CudaArchToString(Arch)
3849 << " does not support unified addressing";
3850 CGM.Error(Clause->getBeginLoc(), Out.str());
3851 return;
3852 }
3853 case CudaArch::SM_60:
3854 case CudaArch::SM_61:
3855 case CudaArch::SM_62:
3856 case CudaArch::SM_70:
3857 case CudaArch::SM_72:
3858 case CudaArch::SM_75:
3859 case CudaArch::SM_80:
3860 case CudaArch::SM_86:
3861 case CudaArch::GFX600:
3862 case CudaArch::GFX601:
3863 case CudaArch::GFX602:
3864 case CudaArch::GFX700:
3865 case CudaArch::GFX701:
3866 case CudaArch::GFX702:
3867 case CudaArch::GFX703:
3868 case CudaArch::GFX704:
3869 case CudaArch::GFX705:
3870 case CudaArch::GFX801:
3871 case CudaArch::GFX802:
3872 case CudaArch::GFX803:
3873 case CudaArch::GFX805:
3874 case CudaArch::GFX810:
3875 case CudaArch::GFX900:
3876 case CudaArch::GFX902:
3877 case CudaArch::GFX904:
3878 case CudaArch::GFX906:
3879 case CudaArch::GFX908:
3880 case CudaArch::GFX909:
3881 case CudaArch::GFX90a:
3882 case CudaArch::GFX90c:
3883 case CudaArch::GFX1010:
3884 case CudaArch::GFX1011:
3885 case CudaArch::GFX1012:
3886 case CudaArch::GFX1013:
3887 case CudaArch::GFX1030:
3888 case CudaArch::GFX1031:
3889 case CudaArch::GFX1032:
3890 case CudaArch::GFX1033:
3891 case CudaArch::GFX1034:
3892 case CudaArch::GFX1035:
3893 case CudaArch::UNUSED:
3894 case CudaArch::UNKNOWN:
3895 break;
3896 case CudaArch::LAST:
3897 llvm_unreachable("Unexpected Cuda arch.")__builtin_unreachable();
3898 }
3899 }
3900 }
3901 CGOpenMPRuntime::processRequiresDirective(D);
3902}
3903
3904void CGOpenMPRuntimeGPU::clear() {
3905
3906 if (!TeamsReductions.empty()) {
3907 ASTContext &C = CGM.getContext();
3908 RecordDecl *StaticRD = C.buildImplicitRecord(
3909 "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
3910 StaticRD->startDefinition();
3911 for (const RecordDecl *TeamReductionRec : TeamsReductions) {
3912 QualType RecTy = C.getRecordType(TeamReductionRec);
3913 auto *Field = FieldDecl::Create(
3914 C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
3915 C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
3916 /*BW=*/nullptr, /*Mutable=*/false,
3917 /*InitStyle=*/ICIS_NoInit);
3918 Field->setAccess(AS_public);
3919 StaticRD->addDecl(Field);
3920 }
3921 StaticRD->completeDefinition();
3922 QualType StaticTy = C.getRecordType(StaticRD);
3923 llvm::Type *LLVMReductionsBufferTy =
3924 CGM.getTypes().ConvertTypeForMem(StaticTy);
3925 // FIXME: nvlink does not handle weak linkage correctly (objects with
3926 // different sizes are reported as erroneous).
3927 // Restore CommonLinkage as soon as nvlink is fixed.
3928 auto *GV = new llvm::GlobalVariable(
3929 CGM.getModule(), LLVMReductionsBufferTy,
3930 /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
3931 llvm::Constant::getNullValue(LLVMReductionsBufferTy),
3932 "_openmp_teams_reductions_buffer_$_");
3933 KernelTeamsReductionPtr->setInitializer(
3934 llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
3935 CGM.VoidPtrTy));
3936 }
3937 CGOpenMPRuntime::clear();
3938}