Bug Summary

File: clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
Warning: line 549, column 29
The result of the right shift is undefined due to shifting by '33', which is greater or equal to the width of type 'unsigned int'
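
In getNVPTXLaneID (line 549) the shift count is computed as 32u - LaneIDBits, where LaneIDBits = llvm::Log2_32(GV_Warp_Size). Log2_32 is documented to return -1 for an input of 0, so on a path where the target reports a warp size of 0 the unsigned subtraction wraps around to 33 and the right shift is undefined. A minimal sketch of that arithmetic, assuming the hypothetical zero warp size (real targets report 32 or 64, but nothing at this call site proves that to the analyzer):

#include <cstdio>

int main() {
  unsigned LaneIDBits = static_cast<unsigned>(-1); // llvm::Log2_32(0) == -1
  unsigned ShiftAmount = 32u - LaneIDBits;         // wraps around to 33
  std::printf("shift count = %u\n", ShiftAmount);  // prints 33
  // The flagged statement would then be equivalent to
  //   unsigned LaneIDMask = ~0u >> 33;  // UB: count >= width of unsigned int
  return 0;
}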

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name CGOpenMPRuntimeGPU.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -relaxed-aliasing -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-14~++20220118101002+ec47dba1c8a2/build-llvm -resource-dir /usr/lib/llvm-14/lib/clang/14.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I tools/clang/lib/CodeGen -I /build/llvm-toolchain-snapshot-14~++20220118101002+ec47dba1c8a2/clang/lib/CodeGen -I /build/llvm-toolchain-snapshot-14~++20220118101002+ec47dba1c8a2/clang/include -I tools/clang/include -I include -I /build/llvm-toolchain-snapshot-14~++20220118101002+ec47dba1c8a2/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-14/lib/clang/14.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-14~++20220118101002+ec47dba1c8a2/build-llvm=build-llvm -fmacro-prefix-map=/build/llvm-toolchain-snapshot-14~++20220118101002+ec47dba1c8a2/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-14~++20220118101002+ec47dba1c8a2/build-llvm=build-llvm -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-14~++20220118101002+ec47dba1c8a2/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-14~++20220118101002+ec47dba1c8a2/build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20220118101002+ec47dba1c8a2/build-llvm=build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-14~++20220118101002+ec47dba1c8a2/= -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-01-19-001817-16337-1 -x c++ 
/build/llvm-toolchain-snapshot-14~++20220118101002+ec47dba1c8a2/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
1//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This provides a generalized class for OpenMP runtime code generation
10// specialized by GPU targets NVPTX and AMDGCN.
11//
12//===----------------------------------------------------------------------===//
13
14#include "CGOpenMPRuntimeGPU.h"
15#include "CodeGenFunction.h"
16#include "clang/AST/Attr.h"
17#include "clang/AST/DeclOpenMP.h"
18#include "clang/AST/StmtOpenMP.h"
19#include "clang/AST/StmtVisitor.h"
20#include "clang/Basic/Cuda.h"
21#include "llvm/ADT/SmallPtrSet.h"
22#include "llvm/Frontend/OpenMP/OMPGridValues.h"
23#include "llvm/Support/MathExtras.h"
24
25using namespace clang;
26using namespace CodeGen;
27using namespace llvm::omp;
28
29namespace {
30/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
31class NVPTXActionTy final : public PrePostActionTy {
32 llvm::FunctionCallee EnterCallee = nullptr;
33 ArrayRef<llvm::Value *> EnterArgs;
34 llvm::FunctionCallee ExitCallee = nullptr;
35 ArrayRef<llvm::Value *> ExitArgs;
36 bool Conditional = false;
37 llvm::BasicBlock *ContBlock = nullptr;
38
39public:
40 NVPTXActionTy(llvm::FunctionCallee EnterCallee,
41 ArrayRef<llvm::Value *> EnterArgs,
42 llvm::FunctionCallee ExitCallee,
43 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
44 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
45 ExitArgs(ExitArgs), Conditional(Conditional) {}
46 void Enter(CodeGenFunction &CGF) override {
47 llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
48 if (Conditional) {
49 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
50 auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
51 ContBlock = CGF.createBasicBlock("omp_if.end");
52 // Generate the branch (If-stmt)
53 CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
54 CGF.EmitBlock(ThenBlock);
55 }
56 }
57 void Done(CodeGenFunction &CGF) {
58 // Emit the rest of blocks/branches
59 CGF.EmitBranch(ContBlock);
60 CGF.EmitBlock(ContBlock, true);
61 }
62 void Exit(CodeGenFunction &CGF) override {
63 CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
64 }
65};
66
67/// A class to track the execution mode when codegening directives within
68/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
69/// to the target region and used by containing directives such as 'parallel'
70/// to emit optimized code.
71class ExecutionRuntimeModesRAII {
72private:
73 CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
74 CGOpenMPRuntimeGPU::EM_Unknown;
75 CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
76 bool SavedRuntimeMode = false;
77 bool *RuntimeMode = nullptr;
78
79public:
80 /// Constructor for Non-SPMD mode.
81 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode)
82 : ExecMode(ExecMode) {
83 SavedExecMode = ExecMode;
84 ExecMode = CGOpenMPRuntimeGPU::EM_NonSPMD;
85 }
86 /// Constructor for SPMD mode.
87 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
88 bool &RuntimeMode, bool FullRuntimeMode)
89 : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
90 SavedExecMode = ExecMode;
91 SavedRuntimeMode = RuntimeMode;
92 ExecMode = CGOpenMPRuntimeGPU::EM_SPMD;
93 RuntimeMode = FullRuntimeMode;
94 }
95 ~ExecutionRuntimeModesRAII() {
96 ExecMode = SavedExecMode;
97 if (RuntimeMode)
98 *RuntimeMode = SavedRuntimeMode;
99 }
100};
101
102/// GPU Configuration: This information can be derived from cuda registers,
103/// however, providing compile time constants helps generate more efficient
104/// code. For all practical purposes this is fine because the configuration
105/// is the same for all known NVPTX architectures.
106enum MachineConfiguration : unsigned {
107 /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
108 /// specific Grid Values like GV_Warp_Size, GV_Slot_Size
109
110 /// Global memory alignment for performance.
111 GlobalMemoryAlignment = 128,
112
113 /// Maximal size of the shared memory buffer.
114 SharedMemorySize = 128,
115};
116
117static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
118 RefExpr = RefExpr->IgnoreParens();
119 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
120 const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
121 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
122 Base = TempASE->getBase()->IgnoreParenImpCasts();
123 RefExpr = Base;
124 } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
125 const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
126 while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
127 Base = TempOASE->getBase()->IgnoreParenImpCasts();
128 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
129 Base = TempASE->getBase()->IgnoreParenImpCasts();
130 RefExpr = Base;
131 }
132 RefExpr = RefExpr->IgnoreParenImpCasts();
133 if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
134 return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
135 const auto *ME = cast<MemberExpr>(RefExpr);
136 return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
137}
138
139
140static RecordDecl *buildRecordForGlobalizedVars(
141 ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
142 ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
143 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
144 &MappedDeclsFields, int BufSize) {
145 using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
146 if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
147 return nullptr;
148 SmallVector<VarsDataTy, 4> GlobalizedVars;
149 for (const ValueDecl *D : EscapedDecls)
150 GlobalizedVars.emplace_back(
151 CharUnits::fromQuantity(std::max(
152 C.getDeclAlign(D).getQuantity(),
153 static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
154 D);
155 for (const ValueDecl *D : EscapedDeclsForTeams)
156 GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
157 llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
158 return L.first > R.first;
159 });
160
161 // Build struct _globalized_locals_ty {
162 // /* globalized vars */[WarpSize] align (max(decl_align,
163 // GlobalMemoryAlignment))
164 // /* globalized vars */ for EscapedDeclsForTeams
165 // };
166 RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
167 GlobalizedRD->startDefinition();
168 llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
169 EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
170 for (const auto &Pair : GlobalizedVars) {
171 const ValueDecl *VD = Pair.second;
172 QualType Type = VD->getType();
173 if (Type->isLValueReferenceType())
174 Type = C.getPointerType(Type.getNonReferenceType());
175 else
176 Type = Type.getNonReferenceType();
177 SourceLocation Loc = VD->getLocation();
178 FieldDecl *Field;
179 if (SingleEscaped.count(VD)) {
180 Field = FieldDecl::Create(
181 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
182 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
183 /*BW=*/nullptr, /*Mutable=*/false,
184 /*InitStyle=*/ICIS_NoInit);
185 Field->setAccess(AS_public);
186 if (VD->hasAttrs()) {
187 for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
188 E(VD->getAttrs().end());
189 I != E; ++I)
190 Field->addAttr(*I);
191 }
192 } else {
193 llvm::APInt ArraySize(32, BufSize);
194 Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
195 0);
196 Field = FieldDecl::Create(
197 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
198 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
199 /*BW=*/nullptr, /*Mutable=*/false,
200 /*InitStyle=*/ICIS_NoInit);
201 Field->setAccess(AS_public);
202 llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
203 static_cast<CharUnits::QuantityType>(
204 GlobalMemoryAlignment)));
205 Field->addAttr(AlignedAttr::CreateImplicit(
206 C, /*IsAlignmentExpr=*/true,
207 IntegerLiteral::Create(C, Align,
208 C.getIntTypeForBitwidth(32, /*Signed=*/0),
209 SourceLocation()),
210 {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
211 }
212 GlobalizedRD->addDecl(Field);
213 MappedDeclsFields.try_emplace(VD, Field);
214 }
215 GlobalizedRD->completeDefinition();
216 return GlobalizedRD;
217}
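
For illustration, a rough C++ analogue of the implicit record this function builds, assuming a hypothetical kernel with one escaped parallel-region local `int x` (BufSize equal to a warp size of 32) and one teams-level `double d`; the real record is an implicit AST RecordDecl, not source code, and fields are stable-sorted by decreasing alignment, which is why `x` precedes `d`:

struct _globalized_locals_ty {
  // EscapedDecls entry: buffered per lane and padded to
  // max(decl alignment, GlobalMemoryAlignment) == 128.
  int x[32] __attribute__((aligned(128)));
  // EscapedDeclsForTeams entry: a single slot with its natural alignment.
  double d;
};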
218
219/// Get the list of variables that can escape their declaration context.
220class CheckVarsEscapingDeclContext final
221 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
222 CodeGenFunction &CGF;
223 llvm::SetVector<const ValueDecl *> EscapedDecls;
224 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
225 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
226 RecordDecl *GlobalizedRD = nullptr;
227 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
228 bool AllEscaped = false;
229 bool IsForCombinedParallelRegion = false;
230
231 void markAsEscaped(const ValueDecl *VD) {
232 // Do not globalize declare target variables.
233 if (!isa<VarDecl>(VD) ||
234 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
235 return;
236 VD = cast<ValueDecl>(VD->getCanonicalDecl());
237 // Use user-specified allocation.
238 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
239 return;
240 // Variables captured by value must be globalized.
241 if (auto *CSI = CGF.CapturedStmtInfo) {
242 if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
243 // Check if need to capture the variable that was already captured by
244 // value in the outer region.
245 if (!IsForCombinedParallelRegion) {
246 if (!FD->hasAttrs())
247 return;
248 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
249 if (!Attr)
250 return;
251 if (((Attr->getCaptureKind() != OMPC_map) &&
252 !isOpenMPPrivate(Attr->getCaptureKind())) ||
253 ((Attr->getCaptureKind() == OMPC_map) &&
254 !FD->getType()->isAnyPointerType()))
255 return;
256 }
257 if (!FD->getType()->isReferenceType()) {
258 assert(!VD->getType()->isVariablyModifiedType() &&
259        "Parameter captured by value with variably modified type");
260 EscapedParameters.insert(VD);
261 } else if (!IsForCombinedParallelRegion) {
262 return;
263 }
264 }
265 }
266 if ((!CGF.CapturedStmtInfo ||
267 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
268 VD->getType()->isReferenceType())
269 // Do not globalize variables with reference type.
270 return;
271 if (VD->getType()->isVariablyModifiedType())
272 EscapedVariableLengthDecls.insert(VD);
273 else
274 EscapedDecls.insert(VD);
275 }
276
277 void VisitValueDecl(const ValueDecl *VD) {
278 if (VD->getType()->isLValueReferenceType())
279 markAsEscaped(VD);
280 if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
281 if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
282 const bool SavedAllEscaped = AllEscaped;
283 AllEscaped = VD->getType()->isLValueReferenceType();
284 Visit(VarD->getInit());
285 AllEscaped = SavedAllEscaped;
286 }
287 }
288 }
289 void VisitOpenMPCapturedStmt(const CapturedStmt *S,
290 ArrayRef<OMPClause *> Clauses,
291 bool IsCombinedParallelRegion) {
292 if (!S)
293 return;
294 for (const CapturedStmt::Capture &C : S->captures()) {
295 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
296 const ValueDecl *VD = C.getCapturedVar();
297 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
298 if (IsCombinedParallelRegion) {
299 // Check if the variable is privatized in the combined construct and
300 // those private copies must be shared in the inner parallel
301 // directive.
302 IsForCombinedParallelRegion = false;
303 for (const OMPClause *C : Clauses) {
304 if (!isOpenMPPrivate(C->getClauseKind()) ||
305 C->getClauseKind() == OMPC_reduction ||
306 C->getClauseKind() == OMPC_linear ||
307 C->getClauseKind() == OMPC_private)
308 continue;
309 ArrayRef<const Expr *> Vars;
310 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
311 Vars = PC->getVarRefs();
312 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
313 Vars = PC->getVarRefs();
314 else
315 llvm_unreachable("Unexpected clause.")::llvm::llvm_unreachable_internal("Unexpected clause.", "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp"
, 315)
;
316 for (const auto *E : Vars) {
317 const Decl *D =
318 cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
319 if (D == VD->getCanonicalDecl()) {
320 IsForCombinedParallelRegion = true;
321 break;
322 }
323 }
324 if (IsForCombinedParallelRegion)
325 break;
326 }
327 }
328 markAsEscaped(VD);
329 if (isa<OMPCapturedExprDecl>(VD))
330 VisitValueDecl(VD);
331 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
332 }
333 }
334 }
335
336 void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
337 assert(!GlobalizedRD &&
338        "Record for globalized variables is built already.");
339 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
340 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
341 if (IsInTTDRegion)
342 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
343 else
344 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
345 GlobalizedRD = ::buildRecordForGlobalizedVars(
346 CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
347 MappedDeclsFields, WarpSize);
348 }
349
350public:
351 CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
352 ArrayRef<const ValueDecl *> TeamsReductions)
353 : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
354 }
355 virtual ~CheckVarsEscapingDeclContext() = default;
356 void VisitDeclStmt(const DeclStmt *S) {
357 if (!S)
358 return;
359 for (const Decl *D : S->decls())
360 if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
361 VisitValueDecl(VD);
362 }
363 void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
364 if (!D)
365 return;
366 if (!D->hasAssociatedStmt())
367 return;
368 if (const auto *S =
369 dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
370 // Do not analyze directives that do not actually require capturing,
371 // like `omp for` or `omp simd` directives.
372 llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
373 getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
374 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
375 VisitStmt(S->getCapturedStmt());
376 return;
377 }
378 VisitOpenMPCapturedStmt(
379 S, D->clauses(),
380 CaptureRegions.back() == OMPD_parallel &&
381 isOpenMPDistributeDirective(D->getDirectiveKind()));
382 }
383 }
384 void VisitCapturedStmt(const CapturedStmt *S) {
385 if (!S)
386 return;
387 for (const CapturedStmt::Capture &C : S->captures()) {
388 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
389 const ValueDecl *VD = C.getCapturedVar();
390 markAsEscaped(VD);
391 if (isa<OMPCapturedExprDecl>(VD))
392 VisitValueDecl(VD);
393 }
394 }
395 }
396 void VisitLambdaExpr(const LambdaExpr *E) {
397 if (!E)
398 return;
399 for (const LambdaCapture &C : E->captures()) {
400 if (C.capturesVariable()) {
401 if (C.getCaptureKind() == LCK_ByRef) {
402 const ValueDecl *VD = C.getCapturedVar();
403 markAsEscaped(VD);
404 if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
405 VisitValueDecl(VD);
406 }
407 }
408 }
409 }
410 void VisitBlockExpr(const BlockExpr *E) {
411 if (!E)
412 return;
413 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
414 if (C.isByRef()) {
415 const VarDecl *VD = C.getVariable();
416 markAsEscaped(VD);
417 if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
418 VisitValueDecl(VD);
419 }
420 }
421 }
422 void VisitCallExpr(const CallExpr *E) {
423 if (!E)
424 return;
425 for (const Expr *Arg : E->arguments()) {
426 if (!Arg)
427 continue;
428 if (Arg->isLValue()) {
429 const bool SavedAllEscaped = AllEscaped;
430 AllEscaped = true;
431 Visit(Arg);
432 AllEscaped = SavedAllEscaped;
433 } else {
434 Visit(Arg);
435 }
436 }
437 Visit(E->getCallee());
438 }
439 void VisitDeclRefExpr(const DeclRefExpr *E) {
440 if (!E)
441 return;
442 const ValueDecl *VD = E->getDecl();
443 if (AllEscaped)
444 markAsEscaped(VD);
445 if (isa<OMPCapturedExprDecl>(VD))
446 VisitValueDecl(VD);
447 else if (const auto *VarD = dyn_cast<VarDecl>(VD))
448 if (VarD->isInitCapture())
449 VisitValueDecl(VD);
450 }
451 void VisitUnaryOperator(const UnaryOperator *E) {
452 if (!E)
453 return;
454 if (E->getOpcode() == UO_AddrOf) {
455 const bool SavedAllEscaped = AllEscaped;
456 AllEscaped = true;
457 Visit(E->getSubExpr());
458 AllEscaped = SavedAllEscaped;
459 } else {
460 Visit(E->getSubExpr());
461 }
462 }
463 void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
464 if (!E)
465 return;
466 if (E->getCastKind() == CK_ArrayToPointerDecay) {
467 const bool SavedAllEscaped = AllEscaped;
468 AllEscaped = true;
469 Visit(E->getSubExpr());
470 AllEscaped = SavedAllEscaped;
471 } else {
472 Visit(E->getSubExpr());
473 }
474 }
475 void VisitExpr(const Expr *E) {
476 if (!E)
477 return;
478 bool SavedAllEscaped = AllEscaped;
479 if (!E->isLValue())
480 AllEscaped = false;
481 for (const Stmt *Child : E->children())
482 if (Child)
483 Visit(Child);
484 AllEscaped = SavedAllEscaped;
485 }
486 void VisitStmt(const Stmt *S) {
487 if (!S)
488 return;
489 for (const Stmt *Child : S->children())
490 if (Child)
491 Visit(Child);
492 }
493
494 /// Returns the record that handles all the escaped local variables and used
495 /// instead of their original storage.
496 const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
497 if (!GlobalizedRD)
498 buildRecordForGlobalizedVars(IsInTTDRegion);
499 return GlobalizedRD;
500 }
501
502 /// Returns the field in the globalized record for the escaped variable.
503 const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
504 assert(GlobalizedRD &&
505        "Record for globalized variables must be generated already.");
506 auto I = MappedDeclsFields.find(VD);
507 if (I == MappedDeclsFields.end())
508 return nullptr;
509 return I->getSecond();
510 }
511
512 /// Returns the list of the escaped local variables/parameters.
513 ArrayRef<const ValueDecl *> getEscapedDecls() const {
514 return EscapedDecls.getArrayRef();
515 }
516
517 /// Checks if the escaped local variable is actually a parameter passed by
518 /// value.
519 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
520 return EscapedParameters;
521 }
522
523 /// Returns the list of the escaped variables with the variably modified
524 /// types.
525 ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
526 return EscapedVariableLengthDecls.getArrayRef();
527 }
528};
529} // anonymous namespace
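
The visitor above drives the globalization decision. A hypothetical target region sketching the classification (variable names and structure are illustrative only, not taken from an existing test):

void example() {
  #pragma omp target teams
  {
    int escapes = 0;  // captured by reference in the nested parallel region
                      // and its address is taken there, so markAsEscaped()
                      // records it and it lands in the globalized record
    int stays = 0;    // never referenced inside the parallel region and its
                      // address is never taken, so it keeps private storage
    #pragma omp parallel
    {
      int *p = &escapes;  // UO_AddrOf sets AllEscaped before visiting the
      *p += 1;            // DeclRefExpr, one of the ways an escape is found
    }
    stays += 1;
  }
}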
530
531/// Get the id of the warp in the block.
532/// We assume that the warp size is 32, which is always the case
533/// on the NVPTX device, to generate more efficient code.
534static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
535 CGBuilderTy &Bld = CGF.Builder;
536 unsigned LaneIDBits =
537 llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
538 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
539 return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
540}
541
542/// Get the id of the current lane in the Warp.
543/// We assume that the warp size is 32, which is always the case
544/// on the NVPTX device, to generate more efficient code.
545static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
546 CGBuilderTy &Bld = CGF.Builder;
547 unsigned LaneIDBits =
548 llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
549 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
    [14] The result of the right shift is undefined due to shifting by '33', which is greater or equal to the width of type 'unsigned int'
550 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
551 return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
552 "nvptx_lane_id");
553}
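
With the usual warp size of 32, these helpers compute warp id = tid >> 5 and lane id = tid & 0x1F (thread 70 is warp 2, lane 6). One possible way to keep the shift count provably in range and avoid the warning above is to derive the mask from the warp size itself rather than from 32u - LaneIDBits; this is a sketch against the surrounding file's helpers, not the change adopted upstream:

static llvm::Value *getNVPTXLaneIDSketch(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
  assert(llvm::isPowerOf2_32(WarpSize) && "expected power-of-two warp size");
  unsigned LaneIDMask = WarpSize - 1; // 0x1F for a warp size of 32
  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
  return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}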
554
555CGOpenMPRuntimeGPU::ExecutionMode
556CGOpenMPRuntimeGPU::getExecutionMode() const {
557 return CurrentExecutionMode;
558}
559
560static CGOpenMPRuntimeGPU::DataSharingMode
561getDataSharingMode(CodeGenModule &CGM) {
562 return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
563 : CGOpenMPRuntimeGPU::Generic;
564}
565
566/// Check for inner (nested) SPMD construct, if any
567static bool hasNestedSPMDDirective(ASTContext &Ctx,
568 const OMPExecutableDirective &D) {
569 const auto *CS = D.getInnermostCapturedStmt();
570 const auto *Body =
571 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
572 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
573
574 if (const auto *NestedDir =
575 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
576 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
577 switch (D.getDirectiveKind()) {
578 case OMPD_target:
579 if (isOpenMPParallelDirective(DKind))
580 return true;
581 if (DKind == OMPD_teams) {
582 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
583 /*IgnoreCaptured=*/true);
584 if (!Body)
585 return false;
586 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
587 if (const auto *NND =
588 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
589 DKind = NND->getDirectiveKind();
590 if (isOpenMPParallelDirective(DKind))
591 return true;
592 }
593 }
594 return false;
595 case OMPD_target_teams:
596 return isOpenMPParallelDirective(DKind);
597 case OMPD_target_simd:
598 case OMPD_target_parallel:
599 case OMPD_target_parallel_for:
600 case OMPD_target_parallel_for_simd:
601 case OMPD_target_teams_distribute:
602 case OMPD_target_teams_distribute_simd:
603 case OMPD_target_teams_distribute_parallel_for:
604 case OMPD_target_teams_distribute_parallel_for_simd:
605 case OMPD_parallel:
606 case OMPD_for:
607 case OMPD_parallel_for:
608 case OMPD_parallel_master:
609 case OMPD_parallel_sections:
610 case OMPD_for_simd:
611 case OMPD_parallel_for_simd:
612 case OMPD_cancel:
613 case OMPD_cancellation_point:
614 case OMPD_ordered:
615 case OMPD_threadprivate:
616 case OMPD_allocate:
617 case OMPD_task:
618 case OMPD_simd:
619 case OMPD_sections:
620 case OMPD_section:
621 case OMPD_single:
622 case OMPD_master:
623 case OMPD_critical:
624 case OMPD_taskyield:
625 case OMPD_barrier:
626 case OMPD_taskwait:
627 case OMPD_taskgroup:
628 case OMPD_atomic:
629 case OMPD_flush:
630 case OMPD_depobj:
631 case OMPD_scan:
632 case OMPD_teams:
633 case OMPD_target_data:
634 case OMPD_target_exit_data:
635 case OMPD_target_enter_data:
636 case OMPD_distribute:
637 case OMPD_distribute_simd:
638 case OMPD_distribute_parallel_for:
639 case OMPD_distribute_parallel_for_simd:
640 case OMPD_teams_distribute:
641 case OMPD_teams_distribute_simd:
642 case OMPD_teams_distribute_parallel_for:
643 case OMPD_teams_distribute_parallel_for_simd:
644 case OMPD_target_update:
645 case OMPD_declare_simd:
646 case OMPD_declare_variant:
647 case OMPD_begin_declare_variant:
648 case OMPD_end_declare_variant:
649 case OMPD_declare_target:
650 case OMPD_end_declare_target:
651 case OMPD_declare_reduction:
652 case OMPD_declare_mapper:
653 case OMPD_taskloop:
654 case OMPD_taskloop_simd:
655 case OMPD_master_taskloop:
656 case OMPD_master_taskloop_simd:
657 case OMPD_parallel_master_taskloop:
658 case OMPD_parallel_master_taskloop_simd:
659 case OMPD_requires:
660 case OMPD_unknown:
661 default:
662 llvm_unreachable("Unexpected directive.");
663 }
664 }
665
666 return false;
667}
668
669static bool supportsSPMDExecutionMode(ASTContext &Ctx,
670 const OMPExecutableDirective &D) {
671 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
672 switch (DirectiveKind) {
673 case OMPD_target:
674 case OMPD_target_teams:
675 return hasNestedSPMDDirective(Ctx, D);
676 case OMPD_target_parallel:
677 case OMPD_target_parallel_for:
678 case OMPD_target_parallel_for_simd:
679 case OMPD_target_teams_distribute_parallel_for:
680 case OMPD_target_teams_distribute_parallel_for_simd:
681 case OMPD_target_simd:
682 case OMPD_target_teams_distribute_simd:
683 return true;
684 case OMPD_target_teams_distribute:
685 return false;
686 case OMPD_parallel:
687 case OMPD_for:
688 case OMPD_parallel_for:
689 case OMPD_parallel_master:
690 case OMPD_parallel_sections:
691 case OMPD_for_simd:
692 case OMPD_parallel_for_simd:
693 case OMPD_cancel:
694 case OMPD_cancellation_point:
695 case OMPD_ordered:
696 case OMPD_threadprivate:
697 case OMPD_allocate:
698 case OMPD_task:
699 case OMPD_simd:
700 case OMPD_sections:
701 case OMPD_section:
702 case OMPD_single:
703 case OMPD_master:
704 case OMPD_critical:
705 case OMPD_taskyield:
706 case OMPD_barrier:
707 case OMPD_taskwait:
708 case OMPD_taskgroup:
709 case OMPD_atomic:
710 case OMPD_flush:
711 case OMPD_depobj:
712 case OMPD_scan:
713 case OMPD_teams:
714 case OMPD_target_data:
715 case OMPD_target_exit_data:
716 case OMPD_target_enter_data:
717 case OMPD_distribute:
718 case OMPD_distribute_simd:
719 case OMPD_distribute_parallel_for:
720 case OMPD_distribute_parallel_for_simd:
721 case OMPD_teams_distribute:
722 case OMPD_teams_distribute_simd:
723 case OMPD_teams_distribute_parallel_for:
724 case OMPD_teams_distribute_parallel_for_simd:
725 case OMPD_target_update:
726 case OMPD_declare_simd:
727 case OMPD_declare_variant:
728 case OMPD_begin_declare_variant:
729 case OMPD_end_declare_variant:
730 case OMPD_declare_target:
731 case OMPD_end_declare_target:
732 case OMPD_declare_reduction:
733 case OMPD_declare_mapper:
734 case OMPD_taskloop:
735 case OMPD_taskloop_simd:
736 case OMPD_master_taskloop:
737 case OMPD_master_taskloop_simd:
738 case OMPD_parallel_master_taskloop:
739 case OMPD_parallel_master_taskloop_simd:
740 case OMPD_requires:
741 case OMPD_unknown:
742 default:
743 break;
744 }
745 llvm_unreachable(
746     "Unknown programming model for OpenMP directive on NVPTX target.");
747}
748
749/// Check if the directive is loops based and has schedule clause at all or has
750/// static scheduling.
751static bool hasStaticScheduling(const OMPExecutableDirective &D) {
752 assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
753        isOpenMPLoopDirective(D.getDirectiveKind()) &&
754        "Expected loop-based directive.");
755 return !D.hasClausesOfKind<OMPOrderedClause>() &&
756 (!D.hasClausesOfKind<OMPScheduleClause>() ||
757 llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
758 [](const OMPScheduleClause *C) {
759 return C->getScheduleKind() == OMPC_SCHEDULE_static;
760 }));
761}
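
Concretely, the following loops illustrate the predicate (the directive kinds are assumed to already satisfy the worksharing/loop assertion above; an ordered clause would likewise disqualify a loop):

void loops(int n, float *a) {
  #pragma omp target teams distribute parallel for            // no schedule
  for (int i = 0; i < n; ++i) a[i] += 1.0f;                   // -> true

  #pragma omp target teams distribute parallel for schedule(static, 4)
  for (int i = 0; i < n; ++i) a[i] += 1.0f;                   // -> true

  #pragma omp target teams distribute parallel for schedule(dynamic)
  for (int i = 0; i < n; ++i) a[i] += 1.0f;                   // -> false
}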
762
763/// Check for inner (nested) lightweight runtime construct, if any
764static bool hasNestedLightweightDirective(ASTContext &Ctx,
765 const OMPExecutableDirective &D) {
766 assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
767 const auto *CS = D.getInnermostCapturedStmt();
768 const auto *Body =
769 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
770 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
771
772 if (const auto *NestedDir =
773 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
774 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
775 switch (D.getDirectiveKind()) {
776 case OMPD_target:
777 if (isOpenMPParallelDirective(DKind) &&
778 isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
779 hasStaticScheduling(*NestedDir))
780 return true;
781 if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
782 return true;
783 if (DKind == OMPD_parallel) {
784 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
785 /*IgnoreCaptured=*/true);
786 if (!Body)
787 return false;
788 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
789 if (const auto *NND =
790 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
791 DKind = NND->getDirectiveKind();
792 if (isOpenMPWorksharingDirective(DKind) &&
793 isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
794 return true;
795 }
796 } else if (DKind == OMPD_teams) {
797 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
798 /*IgnoreCaptured=*/true);
799 if (!Body)
800 return false;
801 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
802 if (const auto *NND =
803 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
804 DKind = NND->getDirectiveKind();
805 if (isOpenMPParallelDirective(DKind) &&
806 isOpenMPWorksharingDirective(DKind) &&
807 isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
808 return true;
809 if (DKind == OMPD_parallel) {
810 Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
811 /*IgnoreCaptured=*/true);
812 if (!Body)
813 return false;
814 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
815 if (const auto *NND =
816 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
817 DKind = NND->getDirectiveKind();
818 if (isOpenMPWorksharingDirective(DKind) &&
819 isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
820 return true;
821 }
822 }
823 }
824 }
825 return false;
826 case OMPD_target_teams:
827 if (isOpenMPParallelDirective(DKind) &&
828 isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
829 hasStaticScheduling(*NestedDir))
830 return true;
831 if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
832 return true;
833 if (DKind == OMPD_parallel) {
834 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
835 /*IgnoreCaptured=*/true);
836 if (!Body)
837 return false;
838 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
839 if (const auto *NND =
840 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
841 DKind = NND->getDirectiveKind();
842 if (isOpenMPWorksharingDirective(DKind) &&
843 isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
844 return true;
845 }
846 }
847 return false;
848 case OMPD_target_parallel:
849 if (DKind == OMPD_simd)
850 return true;
851 return isOpenMPWorksharingDirective(DKind) &&
852 isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
853 case OMPD_target_teams_distribute:
854 case OMPD_target_simd:
855 case OMPD_target_parallel_for:
856 case OMPD_target_parallel_for_simd:
857 case OMPD_target_teams_distribute_simd:
858 case OMPD_target_teams_distribute_parallel_for:
859 case OMPD_target_teams_distribute_parallel_for_simd:
860 case OMPD_parallel:
861 case OMPD_for:
862 case OMPD_parallel_for:
863 case OMPD_parallel_master:
864 case OMPD_parallel_sections:
865 case OMPD_for_simd:
866 case OMPD_parallel_for_simd:
867 case OMPD_cancel:
868 case OMPD_cancellation_point:
869 case OMPD_ordered:
870 case OMPD_threadprivate:
871 case OMPD_allocate:
872 case OMPD_task:
873 case OMPD_simd:
874 case OMPD_sections:
875 case OMPD_section:
876 case OMPD_single:
877 case OMPD_master:
878 case OMPD_critical:
879 case OMPD_taskyield:
880 case OMPD_barrier:
881 case OMPD_taskwait:
882 case OMPD_taskgroup:
883 case OMPD_atomic:
884 case OMPD_flush:
885 case OMPD_depobj:
886 case OMPD_scan:
887 case OMPD_teams:
888 case OMPD_target_data:
889 case OMPD_target_exit_data:
890 case OMPD_target_enter_data:
891 case OMPD_distribute:
892 case OMPD_distribute_simd:
893 case OMPD_distribute_parallel_for:
894 case OMPD_distribute_parallel_for_simd:
895 case OMPD_teams_distribute:
896 case OMPD_teams_distribute_simd:
897 case OMPD_teams_distribute_parallel_for:
898 case OMPD_teams_distribute_parallel_for_simd:
899 case OMPD_target_update:
900 case OMPD_declare_simd:
901 case OMPD_declare_variant:
902 case OMPD_begin_declare_variant:
903 case OMPD_end_declare_variant:
904 case OMPD_declare_target:
905 case OMPD_end_declare_target:
906 case OMPD_declare_reduction:
907 case OMPD_declare_mapper:
908 case OMPD_taskloop:
909 case OMPD_taskloop_simd:
910 case OMPD_master_taskloop:
911 case OMPD_master_taskloop_simd:
912 case OMPD_parallel_master_taskloop:
913 case OMPD_parallel_master_taskloop_simd:
914 case OMPD_requires:
915 case OMPD_unknown:
916 default:
917 llvm_unreachable("Unexpected directive.");
918 }
919 }
920
921 return false;
922}
923
924/// Checks if the construct supports lightweight runtime. It must be SPMD
925/// construct + inner loop-based construct with static scheduling.
926static bool supportsLightweightRuntime(ASTContext &Ctx,
927 const OMPExecutableDirective &D) {
928 if (!supportsSPMDExecutionMode(Ctx, D))
929 return false;
930 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
931 switch (DirectiveKind) {
932 case OMPD_target:
933 case OMPD_target_teams:
934 case OMPD_target_parallel:
935 return hasNestedLightweightDirective(Ctx, D);
936 case OMPD_target_parallel_for:
937 case OMPD_target_parallel_for_simd:
938 case OMPD_target_teams_distribute_parallel_for:
939 case OMPD_target_teams_distribute_parallel_for_simd:
940 // (Last|First)-privates must be shared in parallel region.
941 return hasStaticScheduling(D);
942 case OMPD_target_simd:
943 case OMPD_target_teams_distribute_simd:
944 return true;
945 case OMPD_target_teams_distribute:
946 return false;
947 case OMPD_parallel:
948 case OMPD_for:
949 case OMPD_parallel_for:
950 case OMPD_parallel_master:
951 case OMPD_parallel_sections:
952 case OMPD_for_simd:
953 case OMPD_parallel_for_simd:
954 case OMPD_cancel:
955 case OMPD_cancellation_point:
956 case OMPD_ordered:
957 case OMPD_threadprivate:
958 case OMPD_allocate:
959 case OMPD_task:
960 case OMPD_simd:
961 case OMPD_sections:
962 case OMPD_section:
963 case OMPD_single:
964 case OMPD_master:
965 case OMPD_critical:
966 case OMPD_taskyield:
967 case OMPD_barrier:
968 case OMPD_taskwait:
969 case OMPD_taskgroup:
970 case OMPD_atomic:
971 case OMPD_flush:
972 case OMPD_depobj:
973 case OMPD_scan:
974 case OMPD_teams:
975 case OMPD_target_data:
976 case OMPD_target_exit_data:
977 case OMPD_target_enter_data:
978 case OMPD_distribute:
979 case OMPD_distribute_simd:
980 case OMPD_distribute_parallel_for:
981 case OMPD_distribute_parallel_for_simd:
982 case OMPD_teams_distribute:
983 case OMPD_teams_distribute_simd:
984 case OMPD_teams_distribute_parallel_for:
985 case OMPD_teams_distribute_parallel_for_simd:
986 case OMPD_target_update:
987 case OMPD_declare_simd:
988 case OMPD_declare_variant:
989 case OMPD_begin_declare_variant:
990 case OMPD_end_declare_variant:
991 case OMPD_declare_target:
992 case OMPD_end_declare_target:
993 case OMPD_declare_reduction:
994 case OMPD_declare_mapper:
995 case OMPD_taskloop:
996 case OMPD_taskloop_simd:
997 case OMPD_master_taskloop:
998 case OMPD_master_taskloop_simd:
999 case OMPD_parallel_master_taskloop:
1000 case OMPD_parallel_master_taskloop_simd:
1001 case OMPD_requires:
1002 case OMPD_unknown:
1003 default:
1004 break;
1005 }
1006 llvm_unreachable(
1007     "Unknown programming model for OpenMP directive on NVPTX target.");
1008}
1009
1010void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
1011 StringRef ParentName,
1012 llvm::Function *&OutlinedFn,
1013 llvm::Constant *&OutlinedFnID,
1014 bool IsOffloadEntry,
1015 const RegionCodeGenTy &CodeGen) {
1016 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
1017 EntryFunctionState EST;
1018 WrapperFunctionsMap.clear();
1019
1020 // Emit target region as a standalone region.
1021 class NVPTXPrePostActionTy : public PrePostActionTy {
1022 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
1023
1024 public:
1025 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
1026 : EST(EST) {}
1027 void Enter(CodeGenFunction &CGF) override {
1028 auto &RT =
1029 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1030 RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
1031 // Skip target region initialization.
1032 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1033 }
1034 void Exit(CodeGenFunction &CGF) override {
1035 auto &RT =
1036 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1037 RT.clearLocThreadIdInsertPt(CGF);
1038 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
1039 }
1040 } Action(EST);
1041 CodeGen.setAction(Action);
1042 IsInTTDRegion = true;
1043 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1044 IsOffloadEntry, CodeGen);
1045 IsInTTDRegion = false;
1046}
1047
1048void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF,
1049 EntryFunctionState &EST, bool IsSPMD) {
1050 CGBuilderTy &Bld = CGF.Builder;
1051 Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD, requiresFullRuntime()));
1052 IsInTargetMasterThreadRegion = IsSPMD;
1053 if (!IsSPMD)
1054 emitGenericVarsProlog(CGF, EST.Loc);
1055}
1056
1057void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
1058 EntryFunctionState &EST,
1059 bool IsSPMD) {
1060 if (!IsSPMD)
1061 emitGenericVarsEpilog(CGF);
1062
1063 CGBuilderTy &Bld = CGF.Builder;
1064 OMPBuilder.createTargetDeinit(Bld, IsSPMD, requiresFullRuntime());
1065}
1066
1067void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
1068 StringRef ParentName,
1069 llvm::Function *&OutlinedFn,
1070 llvm::Constant *&OutlinedFnID,
1071 bool IsOffloadEntry,
1072 const RegionCodeGenTy &CodeGen) {
1073 ExecutionRuntimeModesRAII ModeRAII(
1074 CurrentExecutionMode, RequiresFullRuntime,
1075 CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
1076 !supportsLightweightRuntime(CGM.getContext(), D));
1077 EntryFunctionState EST;
1078
1079 // Emit target region as a standalone region.
1080 class NVPTXPrePostActionTy : public PrePostActionTy {
1081 CGOpenMPRuntimeGPU &RT;
1082 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
1083
1084 public:
1085 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
1086 CGOpenMPRuntimeGPU::EntryFunctionState &EST)
1087 : RT(RT), EST(EST) {}
1088 void Enter(CodeGenFunction &CGF) override {
1089 RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
1090 // Skip target region initialization.
1091 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1092 }
1093 void Exit(CodeGenFunction &CGF) override {
1094 RT.clearLocThreadIdInsertPt(CGF);
1095 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
1096 }
1097 } Action(*this, EST);
1098 CodeGen.setAction(Action);
1099 IsInTTDRegion = true;
1100 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1101 IsOffloadEntry, CodeGen);
1102 IsInTTDRegion = false;
1103}
1104
1105// Create a unique global variable to indicate the execution mode of this target
1106// region. The execution mode is either 'generic', or 'spmd' depending on the
1107// target directive. This variable is picked up by the offload library to setup
1108// the device appropriately before kernel launch. If the execution mode is
1109// 'generic', the runtime reserves one warp for the master, otherwise, all
1110// warps participate in parallel work.
1111static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
1112 bool Mode) {
1113 auto *GVMode = new llvm::GlobalVariable(
1114 CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1115 llvm::GlobalValue::WeakAnyLinkage,
1116 llvm::ConstantInt::get(CGM.Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD
1117 : OMP_TGT_EXEC_MODE_GENERIC),
1118 Twine(Name, "_exec_mode"));
1119 CGM.addCompilerUsedGlobal(GVMode);
1120}
1121
1122void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
1123 llvm::Constant *Addr,
1124 uint64_t Size, int32_t,
1125 llvm::GlobalValue::LinkageTypes) {
1126 // TODO: Add support for global variables on the device after declare target
1127 // support.
1128 if (!isa<llvm::Function>(Addr))
1129 return;
1130 llvm::Module &M = CGM.getModule();
1131 llvm::LLVMContext &Ctx = CGM.getLLVMContext();
1132
1133 // Get "nvvm.annotations" metadata node
1134 llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
1135
1136 llvm::Metadata *MDVals[] = {
1137 llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
1138 llvm::ConstantAsMetadata::get(
1139 llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1140 // Append metadata to nvvm.annotations
1141 MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1142}
1143
1144void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
1145 const OMPExecutableDirective &D, StringRef ParentName,
1146 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
1147 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
1148 if (!IsOffloadEntry) // Nothing to do.
1149 return;
1150
1151 assert(!ParentName.empty() && "Invalid target region parent name!");
1152
1153 bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
1154 if (Mode)
1155 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1156 CodeGen);
1157 else
1158 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1159 CodeGen);
1160
1161 setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
1162}
1163
1164namespace {
1165 LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
1166 /// Enum for accessing the reserved_2 field of the ident_t struct.
1167enum ModeFlagsTy : unsigned {
1168 /// Bit set to 1 when in SPMD mode.
1169 KMP_IDENT_SPMD_MODE = 0x01,
1170 /// Bit set to 1 when a simplified runtime is used.
1171 KMP_IDENT_SIMPLE_RT_MODE = 0x02,
1172 LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
1173};
1174
1175/// Special mode Undefined. Is the combination of Non-SPMD mode + SimpleRuntime.
1176static const ModeFlagsTy UndefinedMode =
1177 (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
1178} // anonymous namespace
1179
1180unsigned CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags() const {
1181 switch (getExecutionMode()) {
1182 case EM_SPMD:
1183 if (requiresFullRuntime())
1184 return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
1185 return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
1186 case EM_NonSPMD:
1187 assert(requiresFullRuntime() && "Expected full runtime.");
1188 return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
1189 case EM_Unknown:
1190 return UndefinedMode;
1191 }
1192 llvm_unreachable("Unknown flags are requested.")::llvm::llvm_unreachable_internal("Unknown flags are requested."
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 1192)
;
1193}
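
Worked values for the flags returned above, assuming the usual LLVM_MARK_AS_BITMASK_ENUM semantics where operator~ stays within the declared bits (0x3 here); plain unsigned arithmetic for illustration:

#include <cassert>

int main() {
  const unsigned SPMD = 0x01, SIMPLE = 0x02, MASK = 0x03;
  assert((SPMD & (~SIMPLE & MASK)) == 0x1);              // EM_SPMD, full runtime
  assert((SPMD | SIMPLE) == 0x3);                        // EM_SPMD, lightweight runtime
  assert(((~SPMD & MASK) & (~SIMPLE & MASK)) == 0x0);    // EM_NonSPMD (always full runtime)
  assert(((~SPMD & MASK) & SIMPLE) == 0x2);              // EM_Unknown (UndefinedMode)
  return 0;
}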
1194
1195CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
1196 : CGOpenMPRuntime(CGM, "_", "$") {
1197 if (!CGM.getLangOpts().OpenMPIsDevice)
1198 llvm_unreachable("OpenMP can only handle device code.")::llvm::llvm_unreachable_internal("OpenMP can only handle device code."
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 1198)
;
1199
1200 llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
1201 if (CGM.getLangOpts().OpenMPTargetNewRuntime) {
1202 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
1203 "__omp_rtl_debug_kind");
1204 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
1205 "__omp_rtl_assume_teams_oversubscription");
1206 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
1207 "__omp_rtl_assume_threads_oversubscription");
1208 }
1209}
1210
1211void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
1212 ProcBindKind ProcBind,
1213 SourceLocation Loc) {
1214 // Do nothing in case of SPMD mode and L0 parallel.
1215 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
1216 return;
1217
1218 CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1219}
1220
1221void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
1222 llvm::Value *NumThreads,
1223 SourceLocation Loc) {
1224 // Nothing to do.
1225}
1226
1227void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
1228 const Expr *NumTeams,
1229 const Expr *ThreadLimit,
1230 SourceLocation Loc) {}
1231
1232llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
1233 const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1234 OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1235 // Emit target region as a standalone region.
1236 class NVPTXPrePostActionTy : public PrePostActionTy {
1237 bool &IsInParallelRegion;
1238 bool PrevIsInParallelRegion;
1239
1240 public:
1241 NVPTXPrePostActionTy(bool &IsInParallelRegion)
1242 : IsInParallelRegion(IsInParallelRegion) {}
1243 void Enter(CodeGenFunction &CGF) override {
1244 PrevIsInParallelRegion = IsInParallelRegion;
1245 IsInParallelRegion = true;
1246 }
1247 void Exit(CodeGenFunction &CGF) override {
1248 IsInParallelRegion = PrevIsInParallelRegion;
1249 }
1250 } Action(IsInParallelRegion);
1251 CodeGen.setAction(Action);
1252 bool PrevIsInTTDRegion = IsInTTDRegion;
1253 IsInTTDRegion = false;
1254 bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1255 IsInTargetMasterThreadRegion = false;
1256 auto *OutlinedFun =
1257 cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1258 D, ThreadIDVar, InnermostKind, CodeGen));
1259 IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
1260 IsInTTDRegion = PrevIsInTTDRegion;
1261 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD &&
1262 !IsInParallelRegion) {
1263 llvm::Function *WrapperFun =
1264 createParallelDataSharingWrapper(OutlinedFun, D);
1265 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
1266 }
1267
1268 return OutlinedFun;
1269}
1270
1271/// Get list of lastprivate variables from the teams distribute ... or
1272/// teams {distribute ...} directives.
1273static void
1274getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1275 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1276 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1277        "expected teams directive.");
1278 const OMPExecutableDirective *Dir = &D;
1279 if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
1280 if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
1281 Ctx,
1282 D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
1283 /*IgnoreCaptured=*/true))) {
1284 Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
1285 if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
1286 Dir = nullptr;
1287 }
1288 }
1289 if (!Dir)
1290 return;
1291 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
1292 for (const Expr *E : C->getVarRefs())
1293 Vars.push_back(getPrivateItem(E));
1294 }
1295}
1296
1297/// Get list of reduction variables from the teams ... directives.
1298static void
1299getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1300 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1301 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1302        "expected teams directive.");
1303 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
1304 for (const Expr *E : C->privates())
1305 Vars.push_back(getPrivateItem(E));
1306 }
1307}
1308
1309llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
1310 const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1311 OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1312 SourceLocation Loc = D.getBeginLoc();
1313
1314 const RecordDecl *GlobalizedRD = nullptr;
1315 llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
1316 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
1317 unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
1318 // Globalize team reductions variable unconditionally in all modes.
1319 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1320 getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
1321 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
1322 getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
1323 if (!LastPrivatesReductions.empty()) {
1324 GlobalizedRD = ::buildRecordForGlobalizedVars(
1325 CGM.getContext(), llvm::None, LastPrivatesReductions,
1326 MappedDeclsFields, WarpSize);
1327 }
1328 } else if (!LastPrivatesReductions.empty()) {
1329 assert(!TeamAndReductions.first &&
1330        "Previous team declaration is not expected.");
1331 TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
1332 std::swap(TeamAndReductions.second, LastPrivatesReductions);
1333 }
1334
1335 // Emit target region as a standalone region.
1336 class NVPTXPrePostActionTy : public PrePostActionTy {
1337 SourceLocation &Loc;
1338 const RecordDecl *GlobalizedRD;
1339 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1340 &MappedDeclsFields;
1341
1342 public:
1343 NVPTXPrePostActionTy(
1344 SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1345 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1346 &MappedDeclsFields)
1347 : Loc(Loc), GlobalizedRD(GlobalizedRD),
1348 MappedDeclsFields(MappedDeclsFields) {}
1349 void Enter(CodeGenFunction &CGF) override {
1350 auto &Rt =
1351 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1352 if (GlobalizedRD) {
1353 auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1354 I->getSecond().MappedParams =
1355 std::make_unique<CodeGenFunction::OMPMapVars>();
1356 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1357 for (const auto &Pair : MappedDeclsFields) {
1358 assert(Pair.getFirst()->isCanonicalDecl() &&
1359        "Expected canonical declaration");
1360 Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
1361 }
1362 }
1363 Rt.emitGenericVarsProlog(CGF, Loc);
1364 }
1365 void Exit(CodeGenFunction &CGF) override {
1366 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1367 .emitGenericVarsEpilog(CGF);
1368 }
1369 } Action(Loc, GlobalizedRD, MappedDeclsFields);
1370 CodeGen.setAction(Action);
1371 llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1372 D, ThreadIDVar, InnermostKind, CodeGen);
1373
1374 return OutlinedFun;
1375}
1376
1377void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1378 SourceLocation Loc,
1379 bool WithSPMDCheck) {
1380 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
1381 getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1382 return;
1383
1384 CGBuilderTy &Bld = CGF.Builder;
1385
1386 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1387 if (I == FunctionGlobalizedDecls.end())
1388 return;
1389
1390 for (auto &Rec : I->getSecond().LocalVarData) {
1391 const auto *VD = cast<VarDecl>(Rec.first);
1392 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1393 QualType VarTy = VD->getType();
1394
1395 // Get the local allocation of a firstprivate variable before sharing
1396 llvm::Value *ParValue;
1397 if (EscapedParam) {
1398 LValue ParLVal =
1399 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1400 ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1401 }
1402
1403 // Allocate space for the variable to be globalized
1404 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1405 llvm::CallBase *VoidPtr =
1406 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1407 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1408 AllocArgs, VD->getName());
1409 // FIXME: We should use the variable's actual alignment as an argument.
1410 VoidPtr->addRetAttr(llvm::Attribute::get(
1411 CGM.getLLVMContext(), llvm::Attribute::Alignment,
1412 CGM.getContext().getTargetInfo().getNewAlign() / 8));
1413
1414 // Cast the void pointer and get the address of the globalized variable.
1415 llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
1416 llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1417 VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
1418 LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
1419 Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1420 Rec.second.GlobalizedVal = VoidPtr;
1421
1422 // Assign the local allocation to the newly globalized location.
1423 if (EscapedParam) {
1424 CGF.EmitStoreOfScalar(ParValue, VarAddr);
1425 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
1426 }
1427 if (auto *DI = CGF.getDebugInfo())
1428 VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
1429 }
1430 for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
1431 // Use actual memory size of the VLA object including the padding
1432 // for alignment purposes.
1433 llvm::Value *Size = CGF.getTypeSize(VD->getType());
1434 CharUnits Align = CGM.getContext().getDeclAlign(VD);
1435 Size = Bld.CreateNUWAdd(
1436 Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1437 llvm::Value *AlignVal =
1438 llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1439
1440 Size = Bld.CreateUDiv(Size, AlignVal);
1441 Size = Bld.CreateNUWMul(Size, AlignVal);
1442
1443 // Allocate space for this VLA object to be globalized.
1444 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1445 llvm::CallBase *VoidPtr =
1446 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1447 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1448 AllocArgs, VD->getName());
1449 VoidPtr->addRetAttr(
1450 llvm::Attribute::get(CGM.getLLVMContext(), llvm::Attribute::Alignment,
1451 CGM.getContext().getTargetInfo().getNewAlign()));
1452
1453 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(
1454 std::pair<llvm::Value *, llvm::Value *>(
1455 {VoidPtr, CGF.getTypeSize(VD->getType())}));
1456 LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(),
1457 CGM.getContext().getDeclAlign(VD),
1458 AlignmentSource::Decl);
1459 I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
1460 Base.getAddress(CGF));
1461 }
1462 I->getSecond().MappedParams->apply(CGF);
1463}
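The VLA branch above rounds the requested size up to the declaration's alignment before calling __kmpc_alloc_shared (the CreateNUWAdd/CreateUDiv/CreateNUWMul sequence). A minimal host-side sketch of that arithmetic, with illustrative names only:

    #include <cstdint>
    #include <cstdio>

    // Round Size up to the next multiple of Align, mirroring the emitted
    // add/udiv/mul sequence: Size = ((Size + Align - 1) / Align) * Align.
    static uint64_t roundUpToAlign(uint64_t Size, uint64_t Align) {
      return (Size + Align - 1) / Align * Align;
    }

    int main() {
      // A 10-byte VLA with 8-byte alignment is allocated as 16 bytes.
      std::printf("%llu\n", (unsigned long long)roundUpToAlign(10, 8));
    }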
1464
1465void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
1466 bool WithSPMDCheck) {
1467 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
1468 getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1469 return;
1470
1471 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1472 if (I != FunctionGlobalizedDecls.end()) {
1473 // Deallocate the memory for each globalized VLA object
1474 for (auto AddrSizePair :
1475 llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1476 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1477 CGM.getModule(), OMPRTL___kmpc_free_shared),
1478 {AddrSizePair.first, AddrSizePair.second});
1479 }
1480 // Deallocate the memory for each globalized value
1481 for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
1482 const auto *VD = cast<VarDecl>(Rec.first);
1483 I->getSecond().MappedParams->restore(CGF);
1484
1485 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1486 CGF.getTypeSize(VD->getType())};
1487 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1488 CGM.getModule(), OMPRTL___kmpc_free_shared),
1489 FreeArgs);
1490 }
1491 }
1492}
1493
1494void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1495 const OMPExecutableDirective &D,
1496 SourceLocation Loc,
1497 llvm::Function *OutlinedFn,
1498 ArrayRef<llvm::Value *> CapturedVars) {
1499 if (!CGF.HaveInsertPoint())
1500 return;
1501
1502 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1503 /*Name=*/".zero.addr");
1504 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1505 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1506 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
1507 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1508 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1509 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1510}
1511
1512void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1513 SourceLocation Loc,
1514 llvm::Function *OutlinedFn,
1515 ArrayRef<llvm::Value *> CapturedVars,
1516 const Expr *IfCond,
1517 llvm::Value *NumThreads) {
1518 if (!CGF.HaveInsertPoint())
1519 return;
1520
1521 auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1522 NumThreads](CodeGenFunction &CGF,
1523 PrePostActionTy &Action) {
1524 CGBuilderTy &Bld = CGF.Builder;
1525 llvm::Value *NumThreadsVal = NumThreads;
1526 llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1527 llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
1528 if (WFn)
1529 ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1530 llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
1531
1532 // Create a private scope that will globalize the arguments
1533 // passed from the outside of the target region.
1534 // TODO: Is that needed?
1535 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1536
1537 Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1538 llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
1539 "captured_vars_addrs");
1540 // There's something to share.
1541 if (!CapturedVars.empty()) {
1542 // Prepare for parallel region. Indicate the outlined function.
1543 ASTContext &Ctx = CGF.getContext();
1544 unsigned Idx = 0;
1545 for (llvm::Value *V : CapturedVars) {
1546 Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
1547 llvm::Value *PtrV;
1548 if (V->getType()->isIntegerTy())
1549 PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1550 else
1551 PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
1552 CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1553 Ctx.getPointerType(Ctx.VoidPtrTy));
1554 ++Idx;
1555 }
1556 }
1557
1558 llvm::Value *IfCondVal = nullptr;
1559 if (IfCond)
1560 IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
1561 /* isSigned */ false);
1562 else
1563 IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1564
1565 if (!NumThreadsVal)
1566 NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
1567 else
1568 NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty);
1569
1570 assert(IfCondVal && "Expected a value");
1571 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1572 llvm::Value *Args[] = {
1573 RTLoc,
1574 getThreadID(CGF, Loc),
1575 IfCondVal,
1576 NumThreadsVal,
1577 llvm::ConstantInt::get(CGF.Int32Ty, -1),
1578 FnPtr,
1579 ID,
1580 Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
1581 CGF.VoidPtrPtrTy),
1582 llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1583 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1584 CGM.getModule(), OMPRTL___kmpc_parallel_51),
1585 Args);
1586 };
1587
1588 RegionCodeGenTy RCG(ParallelGen);
1589 RCG(CGF);
1590}
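The __kmpc_parallel_51 call built above receives, in order: the source location, the thread id, the if-condition (a constant 1 when no if clause is present), the requested thread count (-1 when unspecified), a constant -1, the outlined function, the wrapper function when one was registered (a null pointer otherwise), the packed array of captured-variable addresses, and the number of captured variables. A small sketch of the packing rule used for that array follows; the helper name is made up, and only the int-to-pointer versus pointer-cast distinction is taken from the loop above:

    #include <cstdint>
    #include <vector>

    // Pack captured values into void* slots: integers travel by value through
    // the pointer bits (int-to-ptr), everything else is passed as an address.
    static std::vector<void *> packCapturedVars(std::intptr_t IntCapture,
                                                void *PtrCapture) {
      std::vector<void *> Addrs;
      Addrs.push_back(reinterpret_cast<void *>(IntCapture)); // integer capture
      Addrs.push_back(PtrCapture);                           // pointer capture
      return Addrs;
    }

    int main() {
      int X = 42;
      std::vector<void *> Addrs = packCapturedVars(7, &X);
      return Addrs.size() == 2 ? 0 : 1;
    }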
1591
1592void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1593 // Always emit simple barriers!
1594 if (!CGF.HaveInsertPoint())
1595 return;
1596 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1597 // This function does not use parameters, so we can emit just default values.
1598 llvm::Value *Args[] = {
1599 llvm::ConstantPointerNull::get(
1600 cast<llvm::PointerType>(getIdentTyPointerTy())),
1601 llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
1602 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1603 CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
1604 Args);
1605}
1606
1607void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1608 SourceLocation Loc,
1609 OpenMPDirectiveKind Kind, bool,
1610 bool) {
1611 // Always emit simple barriers!
1612 if (!CGF.HaveInsertPoint())
1613 return;
1614 // Build call __kmpc_cancel_barrier(loc, thread_id);
1615 unsigned Flags = getDefaultFlagsForBarriers(Kind);
1616 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1617 getThreadID(CGF, Loc)};
1618
1619 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1620 CGM.getModule(), OMPRTL___kmpc_barrier),
1621 Args);
1622}
1623
1624void CGOpenMPRuntimeGPU::emitCriticalRegion(
1625 CodeGenFunction &CGF, StringRef CriticalName,
1626 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1627 const Expr *Hint) {
1628 llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
1629 llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
1630 llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
1631 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
1632 llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
1633
1634 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1635
1636 // Get the mask of active threads in the warp.
1637 llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1638 CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
1639 // Fetch team-local id of the thread.
1640 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1641
1642 // Get the width of the team.
1643 llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1644
1645 // Initialize the counter variable for the loop.
1646 QualType Int32Ty =
1647 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1648 Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
1649 LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
1650 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
1651 /*isInit=*/true);
1652
1653 // Block checks if loop counter exceeds upper bound.
1654 CGF.EmitBlock(LoopBB);
1655 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1656 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
1657 CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
1658
1659 // Block tests which single thread should execute region, and which threads
1660 // should go straight to synchronisation point.
1661 CGF.EmitBlock(TestBB);
1662 CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1663 llvm::Value *CmpThreadToCounter =
1664 CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
1665 CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
1666
1667 // Block emits the body of the critical region.
1668 CGF.EmitBlock(BodyBB);
1669
1670 // Output the critical statement.
1671 CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1672 Hint);
1673
1674 // After the body surrounded by the critical region, the single executing
1675 // thread will jump to the synchronisation point.
1676 // Block waits for all threads in current team to finish then increments the
1677 // counter variable and returns to the loop.
1678 CGF.EmitBlock(SyncBB);
1679 // Reconverge active threads in the warp.
1680 (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1681 CGM.getModule(), OMPRTL___kmpc_syncwarp),
1682 Mask);
1683
1684 llvm::Value *IncCounterVal =
1685 CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
1686 CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
1687 CGF.EmitBranch(LoopBB);
1688
1689 // Block that is reached when all threads in the team complete the region.
1690 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
1691}
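The control flow emitted above serializes the critical region across the team: each thread loops a counter from 0 to the team width, runs the body only on the iteration whose counter equals its own thread id, and re-converges the warp with __kmpc_syncwarp before the next turn. A serial host-side sketch of that turn-taking order (illustrative; it omits the warp mask and the real synchronization):

    #include <cstdio>

    // Each iteration admits exactly one thread into the critical body, so the
    // body runs once per thread, in thread-id order.
    static void simulateCriticalTurns(int TeamWidth) {
      for (int Counter = 0; Counter < TeamWidth; ++Counter)
        std::printf("thread %d executes the critical body\n", Counter);
    }

    int main() { simulateCriticalTurns(4); }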
1692
1693/// Cast value to the specified type.
1694static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1695 QualType ValTy, QualType CastTy,
1696 SourceLocation Loc) {
1697 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1698        "Cast type must be sized.");
1699 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1700        "Val type must be sized.");
1701 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
1702 if (ValTy == CastTy)
1703 return Val;
1704 if (CGF.getContext().getTypeSizeInChars(ValTy) ==
1705 CGF.getContext().getTypeSizeInChars(CastTy))
1706 return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
1707 if (CastTy->isIntegerType() && ValTy->isIntegerType())
1708 return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
1709 CastTy->hasSignedIntegerRepresentation());
1710 Address CastItem = CGF.CreateMemTemp(CastTy);
1711 Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1712 CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
1713 CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
1714 LValueBaseInfo(AlignmentSource::Type),
1715 TBAAAccessInfo());
1716 return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
1717 LValueBaseInfo(AlignmentSource::Type),
1718 TBAAAccessInfo());
1719}
1720
1721/// This function creates calls to one of two shuffle functions to copy
1722/// variables between lanes in a warp.
1723static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
1724 llvm::Value *Elem,
1725 QualType ElemType,
1726 llvm::Value *Offset,
1727 SourceLocation Loc) {
1728 CodeGenModule &CGM = CGF.CGM;
1729 CGBuilderTy &Bld = CGF.Builder;
1730 CGOpenMPRuntimeGPU &RT =
1731 *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
1732 llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
1733
1734 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1735 assert(Size.getQuantity() <= 8 &&
1736        "Unsupported bitwidth in shuffle instruction.");
1737
1738 RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
1739 ? OMPRTL___kmpc_shuffle_int32
1740 : OMPRTL___kmpc_shuffle_int64;
1741
1742 // Cast all types to 32- or 64-bit values before calling shuffle routines.
1743 QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
1744 Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
1745 llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
1746 llvm::Value *WarpSize =
1747 Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
1748
1749 llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
1750 OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
1751 {ElemCast, Offset, WarpSize});
1752
1753 return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
1754}
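The helper above widens the element to a signed 32- or 64-bit integer and picks between the two runtime shuffle entry points based on the element size. A sketch of that dispatch follows; the extern prototypes are assumptions matching the operand types built at this call site (32/64-bit value, 16-bit lane offset, 16-bit warp size) and resolve in the OpenMP device runtime, so the snippet is not meant to link on a host:

    #include <cstddef>
    #include <cstdint>

    extern "C" int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Offset, int16_t Size);
    extern "C" int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Offset, int16_t Size);

    // Elements of at most 4 bytes go through the 32-bit shuffle, anything up
    // to 8 bytes through the 64-bit one, mirroring the ShuffleFn selection.
    static int64_t shuffleElem(int64_t Widened, size_t ElemSize, int16_t Offset,
                               int16_t WarpSize) {
      return ElemSize <= 4
                 ? __kmpc_shuffle_int32(static_cast<int32_t>(Widened), Offset, WarpSize)
                 : __kmpc_shuffle_int64(Widened, Offset, WarpSize);
    }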
1755
1756static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
1757 Address DestAddr, QualType ElemType,
1758 llvm::Value *Offset, SourceLocation Loc) {
1759 CGBuilderTy &Bld = CGF.Builder;
1760
1761 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1762 // Create the loop over the big sized data.
1763 // ptr = (void*)Elem;
1764 // ptrEnd = (void*) Elem + 1;
1765 // Step = 8;
1766 // while (ptr + Step < ptrEnd)
1767 // shuffle((int64_t)*ptr);
1768 // Step = 4;
1769 // while (ptr + Step < ptrEnd)
1770 // shuffle((int32_t)*ptr);
1771 // ...
1772 Address ElemPtr = DestAddr;
1773 Address Ptr = SrcAddr;
1774 Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
1775 Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy);
1776 for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
1777 if (Size < CharUnits::fromQuantity(IntSize))
1778 continue;
1779 QualType IntType = CGF.getContext().getIntTypeForBitwidth(
1780 CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
1781 /*Signed=*/1);
1782 llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
1783 Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
1784 ElemPtr =
1785 Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
1786 if (Size.getQuantity() / IntSize > 1) {
1787 llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
1788 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
1789 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
1790 llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
1791 CGF.EmitBlock(PreCondBB);
1792 llvm::PHINode *PhiSrc =
1793 Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
1794 PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
1795 llvm::PHINode *PhiDest =
1796 Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
1797 PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
1798 Ptr = Address(PhiSrc, Ptr.getAlignment());
1799 ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
1800 llvm::Value *PtrDiff = Bld.CreatePtrDiff(
1801 PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
1802 Ptr.getPointer(), CGF.VoidPtrTy));
1803 Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
1804 ThenBB, ExitBB);
1805 CGF.EmitBlock(ThenBB);
1806 llvm::Value *Res = createRuntimeShuffleFunction(
1807 CGF,
1808 CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1809 LValueBaseInfo(AlignmentSource::Type),
1810 TBAAAccessInfo()),
1811 IntType, Offset, Loc);
1812 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1813 LValueBaseInfo(AlignmentSource::Type),
1814 TBAAAccessInfo());
1815 Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
1816 Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1817 PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
1818 PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
1819 CGF.EmitBranch(PreCondBB);
1820 CGF.EmitBlock(ExitBB);
1821 } else {
1822 llvm::Value *Res = createRuntimeShuffleFunction(
1823 CGF,
1824 CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1825 LValueBaseInfo(AlignmentSource::Type),
1826 TBAAAccessInfo()),
1827 IntType, Offset, Loc);
1828 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1829 LValueBaseInfo(AlignmentSource::Type),
1830 TBAAAccessInfo());
1831 Ptr = Bld.CreateConstGEP(Ptr, 1);
1832 ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1833 }
1834 Size = Size % IntSize;
1835 }
1836}
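shuffleAndStore moves an element of arbitrary size as a sequence of 8-, 4-, 2- and 1-byte integer shuffles, carrying the remainder (Size % IntSize) into the next smaller width. A small sketch of just the chunking arithmetic, with an illustrative helper name:

    #include <cstdio>
    #include <vector>

    // Return the chunk widths the loop above would shuffle for an element of
    // SizeInBytes bytes, e.g. 13 bytes -> {8, 4, 1}.
    static std::vector<int> shuffleChunks(int SizeInBytes) {
      std::vector<int> Chunks;
      for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
        if (SizeInBytes < IntSize)
          continue;
        for (int N = SizeInBytes / IntSize; N > 0; --N)
          Chunks.push_back(IntSize);
        SizeInBytes %= IntSize;
      }
      return Chunks;
    }

    int main() {
      for (int C : shuffleChunks(13))
        std::printf("%d ", C); // prints: 8 4 1
    }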
1837
1838namespace {
1839enum CopyAction : unsigned {
1840 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1841 // the warp using shuffle instructions.
1842 RemoteLaneToThread,
1843 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1844 ThreadCopy,
1845 // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
1846 ThreadToScratchpad,
1847 // ScratchpadToThread: Copy from a scratchpad array in global memory
1848 // containing team-reduced data to a thread's stack.
1849 ScratchpadToThread,
1850};
1851} // namespace
1852
1853struct CopyOptionsTy {
1854 llvm::Value *RemoteLaneOffset;
1855 llvm::Value *ScratchpadIndex;
1856 llvm::Value *ScratchpadWidth;
1857};
1858
1859/// Emit instructions to copy a Reduce list, which contains partially
1860/// aggregated values, in the specified direction.
1861static void emitReductionListCopy(
1862 CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
1863 ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
1864 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
1865
1866 CodeGenModule &CGM = CGF.CGM;
1867 ASTContext &C = CGM.getContext();
1868 CGBuilderTy &Bld = CGF.Builder;
1869
1870 llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1871 llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
1872 llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
1873
1874 // Iterates, element-by-element, through the source Reduce list and
1875 // makes a copy.
1876 unsigned Idx = 0;
1877 unsigned Size = Privates.size();
1878 for (const Expr *Private : Privates) {
1879 Address SrcElementAddr = Address::invalid();
1880 Address DestElementAddr = Address::invalid();
1881 Address DestElementPtrAddr = Address::invalid();
1882 // Should we shuffle in an element from a remote lane?
1883 bool ShuffleInElement = false;
1884 // Set to true to update the pointer in the dest Reduce list to a
1885 // newly created element.
1886 bool UpdateDestListPtr = false;
1887 // Increment the src or dest pointer to the scratchpad, for each
1888 // new element.
1889 bool IncrScratchpadSrc = false;
1890 bool IncrScratchpadDest = false;
1891
1892 switch (Action) {
1893 case RemoteLaneToThread: {
1894 // Step 1.1: Get the address for the src element in the Reduce list.
1895 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1896 SrcElementAddr = CGF.EmitLoadOfPointer(
1897 SrcElementPtrAddr,
1898 C.getPointerType(Private->getType())->castAs<PointerType>());
1899
1900 // Step 1.2: Create a temporary to store the element in the destination
1901 // Reduce list.
1902 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1903 DestElementAddr =
1904 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1905 ShuffleInElement = true;
1906 UpdateDestListPtr = true;
1907 break;
1908 }
1909 case ThreadCopy: {
1910 // Step 1.1: Get the address for the src element in the Reduce list.
1911 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1912 SrcElementAddr = CGF.EmitLoadOfPointer(
1913 SrcElementPtrAddr,
1914 C.getPointerType(Private->getType())->castAs<PointerType>());
1915
1916 // Step 1.2: Get the address for dest element. The destination
1917 // element has already been created on the thread's stack.
1918 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1919 DestElementAddr = CGF.EmitLoadOfPointer(
1920 DestElementPtrAddr,
1921 C.getPointerType(Private->getType())->castAs<PointerType>());
1922 break;
1923 }
1924 case ThreadToScratchpad: {
1925 // Step 1.1: Get the address for the src element in the Reduce list.
1926 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1927 SrcElementAddr = CGF.EmitLoadOfPointer(
1928 SrcElementPtrAddr,
1929 C.getPointerType(Private->getType())->castAs<PointerType>());
1930
1931 // Step 1.2: Get the address for dest element:
1932 // address = base + index * ElementSizeInChars.
1933 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1934 llvm::Value *CurrentOffset =
1935 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1936 llvm::Value *ScratchPadElemAbsolutePtrVal =
1937 Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
1938 ScratchPadElemAbsolutePtrVal =
1939 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1940 DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
1941 C.getTypeAlignInChars(Private->getType()));
1942 IncrScratchpadDest = true;
1943 break;
1944 }
1945 case ScratchpadToThread: {
1946 // Step 1.1: Get the address for the src element in the scratchpad.
1947 // address = base + index * ElementSizeInChars.
1948 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1949 llvm::Value *CurrentOffset =
1950 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1951 llvm::Value *ScratchPadElemAbsolutePtrVal =
1952 Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
1953 ScratchPadElemAbsolutePtrVal =
1954 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1955 SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
1956 C.getTypeAlignInChars(Private->getType()));
1957 IncrScratchpadSrc = true;
1958
1959 // Step 1.2: Create a temporary to store the element in the destination
1960 // Reduce list.
1961 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1962 DestElementAddr =
1963 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1964 UpdateDestListPtr = true;
1965 break;
1966 }
1967 }
1968
1969 // Regardless of src and dest of copy, we emit the load of src
1970 // element as this is required in all directions
1971 SrcElementAddr = Bld.CreateElementBitCast(
1972 SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
1973 DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
1974 SrcElementAddr.getElementType());
1975
1976 // Now that all active lanes have read the element in the
1977 // Reduce list, shuffle over the value from the remote lane.
1978 if (ShuffleInElement) {
1979 shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
1980 RemoteLaneOffset, Private->getExprLoc());
1981 } else {
1982 switch (CGF.getEvaluationKind(Private->getType())) {
1983 case TEK_Scalar: {
1984 llvm::Value *Elem = CGF.EmitLoadOfScalar(
1985 SrcElementAddr, /*Volatile=*/false, Private->getType(),
1986 Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
1987 TBAAAccessInfo());
1988 // Store the source element value to the dest element address.
1989 CGF.EmitStoreOfScalar(
1990 Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
1991 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
1992 break;
1993 }
1994 case TEK_Complex: {
1995 CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
1996 CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1997 Private->getExprLoc());
1998 CGF.EmitStoreOfComplex(
1999 Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
2000 /*isInit=*/false);
2001 break;
2002 }
2003 case TEK_Aggregate:
2004 CGF.EmitAggregateCopy(
2005 CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
2006 CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
2007 Private->getType(), AggValueSlot::DoesNotOverlap);
2008 break;
2009 }
2010 }
2011
2012 // Step 3.1: Modify reference in dest Reduce list as needed.
2013 // Modifying the reference in Reduce list to point to the newly
2014 // created element. The element is live in the current function
2015 // scope and that of functions it invokes (i.e., reduce_function).
2016 // RemoteReduceData[i] = (void*)&RemoteElem
2017 if (UpdateDestListPtr) {
2018 CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
2019 DestElementAddr.getPointer(), CGF.VoidPtrTy),
2020 DestElementPtrAddr, /*Volatile=*/false,
2021 C.VoidPtrTy);
2022 }
2023
2024 // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
2025 // address of the next element in scratchpad memory, unless we're currently
2026 // processing the last one. Memory alignment is also taken care of here.
2027 if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
2028 llvm::Value *ScratchpadBasePtr =
2029 IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
2030 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2031 ScratchpadBasePtr = Bld.CreateNUWAdd(
2032 ScratchpadBasePtr,
2033 Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
2034
2035 // Take care of global memory alignment for performance
2036 ScratchpadBasePtr = Bld.CreateNUWSub(
2037 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2038 ScratchpadBasePtr = Bld.CreateUDiv(
2039 ScratchpadBasePtr,
2040 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
2041 ScratchpadBasePtr = Bld.CreateNUWAdd(
2042 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2043 ScratchpadBasePtr = Bld.CreateNUWMul(
2044 ScratchpadBasePtr,
2045 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
2046
2047 if (IncrScratchpadDest)
2048 DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
2049 else /* IncrScratchpadSrc = true */
2050 SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
2051 }
2052
2053 ++Idx;
2054 }
2055}
2056
2057/// This function emits a helper that gathers Reduce lists from the first
2058/// lane of every active warp to lanes in the first warp.
2059///
2060/// void inter_warp_copy_func(void* reduce_data, num_warps)
2061/// shared smem[warp_size];
2062/// For all data entries D in reduce_data:
2063/// sync
2064/// If (I am the first lane in each warp)
2065/// Copy my local D to smem[warp_id]
2066/// sync
2067/// if (I am the first warp)
2068/// Copy smem[thread_id] to my local D
2069static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
2070 ArrayRef<const Expr *> Privates,
2071 QualType ReductionArrayTy,
2072 SourceLocation Loc) {
2073 ASTContext &C = CGM.getContext();
2074 llvm::Module &M = CGM.getModule();
2075
2076 // ReduceList: thread local Reduce list.
2077 // At the stage of the computation when this function is called, partially
2078 // aggregated values reside in the first lane of every active warp.
2079 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2080 C.VoidPtrTy, ImplicitParamDecl::Other);
2081 // NumWarps: number of warps active in the parallel region. This could
2082 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2083 ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2084 C.getIntTypeForBitwidth(32, /* Signed */ true),
2085 ImplicitParamDecl::Other);
2086 FunctionArgList Args;
2087 Args.push_back(&ReduceListArg);
2088 Args.push_back(&NumWarpsArg);
2089
2090 const CGFunctionInfo &CGFI =
2091 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2092 auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
2093 llvm::GlobalValue::InternalLinkage,
2094 "_omp_reduction_inter_warp_copy_func", &M);
2095 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2096 Fn->setDoesNotRecurse();
2097 CodeGenFunction CGF(CGM);
2098 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2099
2100 CGBuilderTy &Bld = CGF.Builder;
2101
2102 // This array is used as a medium to transfer, one reduce element at a time,
2103 // the data from the first lane of every warp to lanes in the first warp
2104 // in order to perform the final step of a reduction in a parallel region
2105 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2106 // for reduced latency, as well as to have a distinct copy for concurrently
2107 // executing target regions. The array is declared with common linkage so
2108 // as to be shared across compilation units.
2109 StringRef TransferMediumName =
2110 "__openmp_nvptx_data_transfer_temporary_storage";
2111 llvm::GlobalVariable *TransferMedium =
2112 M.getGlobalVariable(TransferMediumName);
2113 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
2114 if (!TransferMedium) {
11. Assuming 'TransferMedium' is non-null
12. Taking false branch
2115 auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
2116 unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
2117 TransferMedium = new llvm::GlobalVariable(
2118 M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
2119 llvm::UndefValue::get(Ty), TransferMediumName,
2120 /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
2121 SharedAddressSpace);
2122 CGM.addCompilerUsedGlobal(TransferMedium);
2123 }
2124
2125 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2126 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2127 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
2128 // nvptx_lane_id = nvptx_id % warpsize
2129 llvm::Value *LaneID = getNVPTXLaneID(CGF);
13. Calling 'getNVPTXLaneID'
2130 // nvptx_warp_id = nvptx_id / warpsize
2131 llvm::Value *WarpID = getNVPTXWarpID(CGF);
2132
2133 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2134 Address LocalReduceList(
2135 Bld.CreatePointerBitCastOrAddrSpaceCast(
2136 CGF.EmitLoadOfScalar(
2137 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
2138 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
2139 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2140 CGF.getPointerAlign());
2141
2142 unsigned Idx = 0;
2143 for (const Expr *Private : Privates) {
2144 //
2145 // Warp master copies reduce element to transfer medium in __shared__
2146 // memory.
2147 //
2148 unsigned RealTySize =
2149 C.getTypeSizeInChars(Private->getType())
2150 .alignTo(C.getTypeAlignInChars(Private->getType()))
2151 .getQuantity();
2152 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
2153 unsigned NumIters = RealTySize / TySize;
2154 if (NumIters == 0)
2155 continue;
2156 QualType CType = C.getIntTypeForBitwidth(
2157 C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
2158 llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
2159 CharUnits Align = CharUnits::fromQuantity(TySize);
2160 llvm::Value *Cnt = nullptr;
2161 Address CntAddr = Address::invalid();
2162 llvm::BasicBlock *PrecondBB = nullptr;
2163 llvm::BasicBlock *ExitBB = nullptr;
2164 if (NumIters > 1) {
2165 CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
2166 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
2167 /*Volatile=*/false, C.IntTy);
2168 PrecondBB = CGF.createBasicBlock("precond");
2169 ExitBB = CGF.createBasicBlock("exit");
2170 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
2171 // There is no need to emit line number for unconditional branch.
2172 (void)ApplyDebugLocation::CreateEmpty(CGF);
2173 CGF.EmitBlock(PrecondBB);
2174 Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
2175 llvm::Value *Cmp =
2176 Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
2177 Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
2178 CGF.EmitBlock(BodyBB);
2179 }
2180 // kmpc_barrier.
2181 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
2182 /*EmitChecks=*/false,
2183 /*ForceSimpleCall=*/true);
2184 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2185 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2186 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2187
2188 // if (lane_id == 0)
2189 llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
2190 Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2191 CGF.EmitBlock(ThenBB);
2192
2193 // Reduce element = LocalReduceList[i]
2194 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2195 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2196 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2197 // elemptr = ((CopyType*)(elemptrptr)) + I
2198 Address ElemPtr = Address(ElemPtrPtr, Align);
2199 ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
2200 if (NumIters > 1)
2201 ElemPtr = Bld.CreateGEP(ElemPtr, Cnt);
2202
2203 // Get pointer to location in transfer medium.
2204 // MediumPtr = &medium[warp_id]
2205 llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
2206 TransferMedium->getValueType(), TransferMedium,
2207 {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
2208 Address MediumPtr(MediumPtrVal, Align);
2209 // Casting to actual data type.
2210 // MediumPtr = (CopyType*)MediumPtrAddr;
2211 MediumPtr = Bld.CreateElementBitCast(MediumPtr, CopyType);
2212
2213 // elem = *elemptr
2214 //*MediumPtr = elem
2215 llvm::Value *Elem = CGF.EmitLoadOfScalar(
2216 ElemPtr, /*Volatile=*/false, CType, Loc,
2217 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2218 // Store the source element value to the dest element address.
2219 CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
2220 LValueBaseInfo(AlignmentSource::Type),
2221 TBAAAccessInfo());
2222
2223 Bld.CreateBr(MergeBB);
2224
2225 CGF.EmitBlock(ElseBB);
2226 Bld.CreateBr(MergeBB);
2227
2228 CGF.EmitBlock(MergeBB);
2229
2230 // kmpc_barrier.
2231 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
2232 /*EmitChecks=*/false,
2233 /*ForceSimpleCall=*/true);
2234
2235 //
2236 // Warp 0 copies reduce element from transfer medium.
2237 //
2238 llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
2239 llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
2240 llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
2241
2242 Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
2243 llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
2244 AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
2245
2246 // Up to 32 threads in warp 0 are active.
2247 llvm::Value *IsActiveThread =
2248 Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
2249 Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2250
2251 CGF.EmitBlock(W0ThenBB);
2252
2253 // SrcMediumPtr = &medium[tid]
2254 llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
2255 TransferMedium->getValueType(), TransferMedium,
2256 {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
2257 Address SrcMediumPtr(SrcMediumPtrVal, Align);
2258 // SrcMediumVal = *SrcMediumPtr;
2259 SrcMediumPtr = Bld.CreateElementBitCast(SrcMediumPtr, CopyType);
2260
2261 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2262 Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2263 llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
2264 TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
2265 Address TargetElemPtr = Address(TargetElemPtrVal, Align);
2266 TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
2267 if (NumIters > 1)
2268 TargetElemPtr = Bld.CreateGEP(TargetElemPtr, Cnt);
2269
2270 // *TargetElemPtr = SrcMediumVal;
2271 llvm::Value *SrcMediumValue =
2272 CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
2273 CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
2274 CType);
2275 Bld.CreateBr(W0MergeBB);
2276
2277 CGF.EmitBlock(W0ElseBB);
2278 Bld.CreateBr(W0MergeBB);
2279
2280 CGF.EmitBlock(W0MergeBB);
2281
2282 if (NumIters > 1) {
2283 Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
2284 CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
2285 CGF.EmitBranch(PrecondBB);
2286 (void)ApplyDebugLocation::CreateEmpty(CGF);
2287 CGF.EmitBlock(ExitBB);
2288 }
2289 RealTySize %= TySize;
2290 }
2291 ++Idx;
2292 }
2293
2294 CGF.FinishFunction();
2295 return Fn;
2296}
2297
2298/// Emit a helper that reduces data across two OpenMP threads (lanes)
2299/// in the same warp. It uses shuffle instructions to copy over data from
2300/// a remote lane's stack. The reduction algorithm performed is specified
2301/// by the fourth parameter.
2302///
2303/// Algorithm Versions.
2304/// Full Warp Reduce (argument value 0):
2305/// This algorithm assumes that all 32 lanes are active and gathers
2306/// data from these 32 lanes, producing a single resultant value.
2307/// Contiguous Partial Warp Reduce (argument value 1):
2308/// This algorithm assumes that only a *contiguous* subset of lanes
2309/// are active. This happens for the last warp in a parallel region
2310/// when the user specified num_threads is not an integer multiple of
2311/// 32. This contiguous subset always starts with the zeroth lane.
2312/// Partial Warp Reduce (argument value 2):
2313/// This algorithm gathers data from any number of lanes at any position.
2314/// All reduced values are stored in the lowest possible lane. The set
2315/// of problems every algorithm addresses is a super set of those
2316/// addressable by algorithms with a lower version number. Overhead
2317/// increases as algorithm version increases.
2318///
2319/// Terminology
2320/// Reduce element:
2321/// Reduce element refers to the individual data field with primitive
2322/// data types to be combined and reduced across threads.
2323/// Reduce list:
2324/// Reduce list refers to a collection of local, thread-private
2325/// reduce elements.
2326/// Remote Reduce list:
2327/// Remote Reduce list refers to a collection of remote (relative to
2328/// the current thread) reduce elements.
2329///
2330/// We distinguish between three states of threads that are important to
2331/// the implementation of this function.
2332/// Alive threads:
2333/// Threads in a warp executing the SIMT instruction, as distinguished from
2334/// threads that are inactive due to divergent control flow.
2335/// Active threads:
2336/// The minimal set of threads that has to be alive upon entry to this
2337/// function. The computation is correct iff active threads are alive.
2338/// Some threads are alive but they are not active because they do not
2339/// contribute to the computation in any useful manner. Turning them off
2340/// may introduce control flow overheads without any tangible benefits.
2341/// Effective threads:
2342/// In order to comply with the argument requirements of the shuffle
2343/// function, we must keep all lanes holding data alive. But at most
2344/// half of them perform value aggregation; we refer to this half of
2345/// threads as effective. The other half is simply handing off their
2346/// data.
2347///
2348/// Procedure
2349/// Value shuffle:
2350/// In this step active threads transfer data from higher lane positions
2351/// in the warp to lower lane positions, creating Remote Reduce list.
2352/// Value aggregation:
2353/// In this step, effective threads combine their thread local Reduce list
2354/// with Remote Reduce list and store the result in the thread local
2355/// Reduce list.
2356/// Value copy:
2357/// In this step, we deal with the assumption made by algorithm 2
2358/// (i.e. contiguity assumption). When we have an odd number of lanes
2359/// active, say 2k+1, only k threads will be effective and therefore k
2360/// new values will be produced. However, the Reduce list owned by the
2361/// (2k+1)th thread is ignored in the value aggregation. Therefore
2362/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
2363/// that the contiguity assumption still holds.
2364static llvm::Function *emitShuffleAndReduceFunction(
2365 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2366 QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
2367 ASTContext &C = CGM.getContext();
2368
2369 // Thread local Reduce list used to host the values of data to be reduced.
2370 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2371 C.VoidPtrTy, ImplicitParamDecl::Other);
2372 // Current lane id; could be logical.
2373 ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2374 ImplicitParamDecl::Other);
2375 // Offset of the remote source lane relative to the current lane.
2376 ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2377 C.ShortTy, ImplicitParamDecl::Other);
2378 // Algorithm version. This is expected to be known at compile time.
2379 ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2380 C.ShortTy, ImplicitParamDecl::Other);
2381 FunctionArgList Args;
2382 Args.push_back(&ReduceListArg);
2383 Args.push_back(&LaneIDArg);
2384 Args.push_back(&RemoteLaneOffsetArg);
2385 Args.push_back(&AlgoVerArg);
2386
2387 const CGFunctionInfo &CGFI =
2388 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2389 auto *Fn = llvm::Function::Create(
2390 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2391 "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2392 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2393 Fn->setDoesNotRecurse();
2394
2395 CodeGenFunction CGF(CGM);
2396 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2397
2398 CGBuilderTy &Bld = CGF.Builder;
2399
2400 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2401 Address LocalReduceList(
2402 Bld.CreatePointerBitCastOrAddrSpaceCast(
2403 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2404 C.VoidPtrTy, SourceLocation()),
2405 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2406 CGF.getPointerAlign());
2407
2408 Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2409 llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2410 AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2411
2412 Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2413 llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2414 AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2415
2416 Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2417 llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2418 AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2419
2420 // Create a local thread-private variable to host the Reduce list
2421 // from a remote lane.
2422 Address RemoteReduceList =
2423 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2424
2425 // This loop iterates through the list of reduce elements and copies,
2426 // element by element, from a remote lane in the warp to RemoteReduceList,
2427 // hosted on the thread's stack.
2428 emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
2429 LocalReduceList, RemoteReduceList,
2430 {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
2431 /*ScratchpadIndex=*/nullptr,
2432 /*ScratchpadWidth=*/nullptr});
2433
2434 // The actions to be performed on the Remote Reduce list are dependent
2435 // on the algorithm version.
2436 //
2437 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2438 // LaneId % 2 == 0 && Offset > 0):
2439 // do the reduction value aggregation
2440 //
2441 // The thread local variable Reduce list is mutated in place to host the
2442 // reduced data, which is the aggregated value produced from local and
2443 // remote lanes.
2444 //
2445 // Note that AlgoVer is expected to be a constant integer known at compile
2446 // time.
2447 // When AlgoVer==0, the first conjunction evaluates to true, making
2448 // the entire predicate true during compile time.
2449 // When AlgoVer==1, the second conjunction has only the second part to be
2450 // evaluated during runtime. Other conjunctions evaluate to false
2451 // during compile time.
2452 // When AlgoVer==2, the third conjunction has only the second part to be
2453 // evaluated during runtime. Other conjunctions evaluate to false
2454 // during compile time.
2455 llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
2456
2457 llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2458 llvm::Value *CondAlgo1 = Bld.CreateAnd(
2459 Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
2460
2461 llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
2462 llvm::Value *CondAlgo2 = Bld.CreateAnd(
2463 Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
2464 CondAlgo2 = Bld.CreateAnd(
2465 CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
2466
2467 llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
2468 CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
2469
2470 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2471 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2472 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2473 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
2474
2475 CGF.EmitBlock(ThenBB);
2476 // reduce_function(LocalReduceList, RemoteReduceList)
2477 llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2478 LocalReduceList.getPointer(), CGF.VoidPtrTy);
2479 llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2480 RemoteReduceList.getPointer(), CGF.VoidPtrTy);
2481 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2482 CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
2483 Bld.CreateBr(MergeBB);
2484
2485 CGF.EmitBlock(ElseBB);
2486 Bld.CreateBr(MergeBB);
2487
2488 CGF.EmitBlock(MergeBB);
2489
2490 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2491 // Reduce list.
2492 Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2493 llvm::Value *CondCopy = Bld.CreateAnd(
2494 Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
2495
2496 llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
2497 llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
2498 llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
2499 Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2500
2501 CGF.EmitBlock(CpyThenBB);
2502 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
2503 RemoteReduceList, LocalReduceList);
2504 Bld.CreateBr(CpyMergeBB);
2505
2506 CGF.EmitBlock(CpyElseBB);
2507 Bld.CreateBr(CpyMergeBB);
2508
2509 CGF.EmitBlock(CpyMergeBB);
2510
2511 CGF.FinishFunction();
2512 return Fn;
2513}
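The two predicates built above decide, per lane, whether to aggregate the shuffled-in remote Reduce list into the local one and, for algorithm version 1 only, whether to instead copy the remote list over the local one. A sketch of both predicates with a concrete case (illustrative helper names):

    #include <cstdio>

    static bool shouldReduce(int AlgoVer, int LaneId, int Offset) {
      return AlgoVer == 0 ||
             (AlgoVer == 1 && LaneId < Offset) ||
             (AlgoVer == 2 && (LaneId & 1) == 0 && Offset > 0);
    }

    static bool shouldCopy(int AlgoVer, int LaneId, int Offset) {
      return AlgoVer == 1 && LaneId >= Offset;
    }

    int main() {
      // Contiguous partial-warp reduce (version 1), 8 active lanes, offset 4:
      // lanes 0-3 aggregate, lanes 4-7 hand their data down via the copy path.
      for (int Lane = 0; Lane < 8; ++Lane)
        std::printf("lane %d: reduce=%d copy=%d\n", Lane,
                    shouldReduce(1, Lane, 4), shouldCopy(1, Lane, 4));
    }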
2514
2515/// This function emits a helper that copies all the reduction variables from
2516/// the team into the provided global buffer for the reduction variables.
2517///
2518/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
2519/// For all data entries D in reduce_data:
2520/// Copy local D to buffer.D[Idx]
2521static llvm::Value *emitListToGlobalCopyFunction(
2522 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2523 QualType ReductionArrayTy, SourceLocation Loc,
2524 const RecordDecl *TeamReductionRec,
2525 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2526 &VarFieldMap) {
2527 ASTContext &C = CGM.getContext();
2528
2529 // Buffer: global reduction buffer.
2530 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2531 C.VoidPtrTy, ImplicitParamDecl::Other);
2532 // Idx: index of the buffer.
2533 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2534 ImplicitParamDecl::Other);
2535 // ReduceList: thread local Reduce list.
2536 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2537 C.VoidPtrTy, ImplicitParamDecl::Other);
2538 FunctionArgList Args;
2539 Args.push_back(&BufferArg);
2540 Args.push_back(&IdxArg);
2541 Args.push_back(&ReduceListArg);
2542
2543 const CGFunctionInfo &CGFI =
2544 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2545 auto *Fn = llvm::Function::Create(
2546 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2547 "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
2548 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2549 Fn->setDoesNotRecurse();
2550 CodeGenFunction CGF(CGM);
2551 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2552
2553 CGBuilderTy &Bld = CGF.Builder;
2554
2555 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2556 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2557 Address LocalReduceList(
2558 Bld.CreatePointerBitCastOrAddrSpaceCast(
2559 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2560 C.VoidPtrTy, Loc),
2561 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2562 CGF.getPointerAlign());
2563 QualType StaticTy = C.getRecordType(TeamReductionRec);
2564 llvm::Type *LLVMReductionsBufferTy =
2565 CGM.getTypes().ConvertTypeForMem(StaticTy);
2566 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2567 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2568 LLVMReductionsBufferTy->getPointerTo());
2569 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2570 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2571 /*Volatile=*/false, C.IntTy,
2572 Loc)};
2573 unsigned Idx = 0;
2574 for (const Expr *Private : Privates) {
2575 // Reduce element = LocalReduceList[i]
2576 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2577 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2578 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2579 // elemptr = ((CopyType*)(elemptrptr)) + I
2580 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2581 ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
2582 Address ElemPtr =
2583 Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
2584 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2585 // Global = Buffer.VD[Idx];
2586 const FieldDecl *FD = VarFieldMap.lookup(VD);
2587 LValue GlobLVal = CGF.EmitLValueForField(
2588 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2589 Address GlobAddr = GlobLVal.getAddress(CGF);
2590 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2591 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2592 GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
2593 switch (CGF.getEvaluationKind(Private->getType())) {
2594 case TEK_Scalar: {
2595 llvm::Value *V = CGF.EmitLoadOfScalar(
2596 ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
2597 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2598 CGF.EmitStoreOfScalar(V, GlobLVal);
2599 break;
2600 }
2601 case TEK_Complex: {
2602 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
2603 CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
2604 CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
2605 break;
2606 }
2607 case TEK_Aggregate:
2608 CGF.EmitAggregateCopy(GlobLVal,
2609 CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2610 Private->getType(), AggValueSlot::DoesNotOverlap);
2611 break;
2612 }
2613 ++Idx;
2614 }
2615
2616 CGF.FinishFunction();
2617 return Fn;
2618}
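// Roughly, the helper emitted above behaves like the following C-like
// pseudocode. This is an illustrative sketch only: 'ReductionsBufferTy', 'T_i'
// and 'D_i' stand in for the record type and fields built from VarFieldMap and
// are not names used by the actual codegen.
//
//   void _omp_reduction_list_to_global_copy_func(void *buffer, int idx,
//                                                void *reduce_data) {
//     ReductionsBufferTy *buf = (ReductionsBufferTy *)buffer;
//     void **list = (void **)reduce_data;
//     // For every reduction variable i:
//     buf->D_i[idx] = *(T_i *)list[i];   // scalar case; complex and aggregate
//                                        // entries use the matching copy/store
//   }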
2619
2620/// This function emits a helper that reduces all the reduction variables from
2621/// the team into the provided global buffer for the reduction variables.
2622///
2623/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
2624/// void *GlobPtrs[];
2625/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2626/// ...
2627/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2628/// reduce_function(GlobPtrs, reduce_data);
2629static llvm::Value *emitListToGlobalReduceFunction(
2630 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2631 QualType ReductionArrayTy, SourceLocation Loc,
2632 const RecordDecl *TeamReductionRec,
2633 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2634 &VarFieldMap,
2635 llvm::Function *ReduceFn) {
2636 ASTContext &C = CGM.getContext();
2637
2638 // Buffer: global reduction buffer.
2639 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2640 C.VoidPtrTy, ImplicitParamDecl::Other);
2641 // Idx: index of the buffer.
2642 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2643 ImplicitParamDecl::Other);
2644 // ReduceList: thread local Reduce list.
2645 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2646 C.VoidPtrTy, ImplicitParamDecl::Other);
2647 FunctionArgList Args;
2648 Args.push_back(&BufferArg);
2649 Args.push_back(&IdxArg);
2650 Args.push_back(&ReduceListArg);
2651
2652 const CGFunctionInfo &CGFI =
2653 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2654 auto *Fn = llvm::Function::Create(
2655 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2656 "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
2657 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2658 Fn->setDoesNotRecurse();
2659 CodeGenFunction CGF(CGM);
2660 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2661
2662 CGBuilderTy &Bld = CGF.Builder;
2663
2664 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2665 QualType StaticTy = C.getRecordType(TeamReductionRec);
2666 llvm::Type *LLVMReductionsBufferTy =
2667 CGM.getTypes().ConvertTypeForMem(StaticTy);
2668 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2669 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2670 LLVMReductionsBufferTy->getPointerTo());
2671
2672 // 1. Build a list of reduction variables.
2673 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2674 Address ReductionList =
2675 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2676 auto IPriv = Privates.begin();
2677 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2678 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2679 /*Volatile=*/false, C.IntTy,
2680 Loc)};
2681 unsigned Idx = 0;
2682 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2683 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2684 // Global = Buffer.VD[Idx];
2685 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2686 const FieldDecl *FD = VarFieldMap.lookup(VD);
2687 LValue GlobLVal = CGF.EmitLValueForField(
2688 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2689 Address GlobAddr = GlobLVal.getAddress(CGF);
2690 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2691 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2692 llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
2693 CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
2694 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2695 // Store array size.
2696 ++Idx;
2697 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2698 llvm::Value *Size = CGF.Builder.CreateIntCast(
2699 CGF.getVLASize(
2700 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2701 .NumElts,
2702 CGF.SizeTy, /*isSigned=*/false);
2703 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2704 Elem);
2705 }
2706 }
2707
2708 // Call reduce_function(GlobalReduceList, ReduceList)
2709 llvm::Value *GlobalReduceList =
2710 CGF.EmitCastToVoidPtr(ReductionList.getPointer());
2711 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2712 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2713 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2714 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2715 CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
2716 CGF.FinishFunction();
2717 return Fn;
2718}
2719
2720/// This function emits a helper that copies all the reduction variables from
2721/// the provided global buffer back into the team's thread-local reduce list.
2722///
2723/// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
2724/// For all data entries D in reduce_data:
2725/// Copy buffer.D[Idx] to local D;
2726static llvm::Value *emitGlobalToListCopyFunction(
2727 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2728 QualType ReductionArrayTy, SourceLocation Loc,
2729 const RecordDecl *TeamReductionRec,
2730 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2731 &VarFieldMap) {
2732 ASTContext &C = CGM.getContext();
2733
2734 // Buffer: global reduction buffer.
2735 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2736 C.VoidPtrTy, ImplicitParamDecl::Other);
2737 // Idx: index of the buffer.
2738 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2739 ImplicitParamDecl::Other);
2740 // ReduceList: thread local Reduce list.
2741 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2742 C.VoidPtrTy, ImplicitParamDecl::Other);
2743 FunctionArgList Args;
2744 Args.push_back(&BufferArg);
2745 Args.push_back(&IdxArg);
2746 Args.push_back(&ReduceListArg);
2747
2748 const CGFunctionInfo &CGFI =
2749 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2750 auto *Fn = llvm::Function::Create(
2751 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2752 "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
2753 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2754 Fn->setDoesNotRecurse();
2755 CodeGenFunction CGF(CGM);
2756 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2757
2758 CGBuilderTy &Bld = CGF.Builder;
2759
2760 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2761 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2762 Address LocalReduceList(
2763 Bld.CreatePointerBitCastOrAddrSpaceCast(
2764 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2765 C.VoidPtrTy, Loc),
2766 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2767 CGF.getPointerAlign());
2768 QualType StaticTy = C.getRecordType(TeamReductionRec);
2769 llvm::Type *LLVMReductionsBufferTy =
2770 CGM.getTypes().ConvertTypeForMem(StaticTy);
2771 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2772 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2773 LLVMReductionsBufferTy->getPointerTo());
2774
2775 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2776 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2777 /*Volatile=*/false, C.IntTy,
2778 Loc)};
2779 unsigned Idx = 0;
2780 for (const Expr *Private : Privates) {
2781 // Reduce element = LocalReduceList[i]
2782 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2783 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2784 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2785 // elemptr = ((CopyType*)(elemptrptr)) + I
2786 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2787 ElemPtrPtr, CGF.ConvertTypeForMem(Private->getType())->getPointerTo());
2788 Address ElemPtr =
2789 Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
2790 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2791 // Global = Buffer.VD[Idx];
2792 const FieldDecl *FD = VarFieldMap.lookup(VD);
2793 LValue GlobLVal = CGF.EmitLValueForField(
2794 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2795 Address GlobAddr = GlobLVal.getAddress(CGF);
2796 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2797 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2798 GlobLVal.setAddress(Address(BufferPtr, GlobAddr.getAlignment()));
2799 switch (CGF.getEvaluationKind(Private->getType())) {
2800 case TEK_Scalar: {
2801 llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
2802 CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
2803 LValueBaseInfo(AlignmentSource::Type),
2804 TBAAAccessInfo());
2805 break;
2806 }
2807 case TEK_Complex: {
2808 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
2809 CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2810 /*isInit=*/false);
2811 break;
2812 }
2813 case TEK_Aggregate:
2814 CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2815 GlobLVal, Private->getType(),
2816 AggValueSlot::DoesNotOverlap);
2817 break;
2818 }
2819 ++Idx;
2820 }
2821
2822 CGF.FinishFunction();
2823 return Fn;
2824}
2825
2826/// This function emits a helper that reduces the values in the provided
2827/// global buffer into the team's thread-local reduce list.
2828///
2829/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
2830/// void *GlobPtrs[];
2831/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2832/// ...
2833/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2834/// reduce_function(reduce_data, GlobPtrs);
2835static llvm::Value *emitGlobalToListReduceFunction(
2836 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2837 QualType ReductionArrayTy, SourceLocation Loc,
2838 const RecordDecl *TeamReductionRec,
2839 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2840 &VarFieldMap,
2841 llvm::Function *ReduceFn) {
2842 ASTContext &C = CGM.getContext();
2843
2844 // Buffer: global reduction buffer.
2845 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2846 C.VoidPtrTy, ImplicitParamDecl::Other);
2847 // Idx: index of the buffer.
2848 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2849 ImplicitParamDecl::Other);
2850 // ReduceList: thread local Reduce list.
2851 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2852 C.VoidPtrTy, ImplicitParamDecl::Other);
2853 FunctionArgList Args;
2854 Args.push_back(&BufferArg);
2855 Args.push_back(&IdxArg);
2856 Args.push_back(&ReduceListArg);
2857
2858 const CGFunctionInfo &CGFI =
2859 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2860 auto *Fn = llvm::Function::Create(
2861 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2862 "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
2863 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2864 Fn->setDoesNotRecurse();
2865 CodeGenFunction CGF(CGM);
2866 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2867
2868 CGBuilderTy &Bld = CGF.Builder;
2869
2870 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2871 QualType StaticTy = C.getRecordType(TeamReductionRec);
2872 llvm::Type *LLVMReductionsBufferTy =
2873 CGM.getTypes().ConvertTypeForMem(StaticTy);
2874 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2875 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2876 LLVMReductionsBufferTy->getPointerTo());
2877
2878 // 1. Build a list of reduction variables.
2879 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2880 Address ReductionList =
2881 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2882 auto IPriv = Privates.begin();
2883 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2884 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2885 /*Volatile=*/false, C.IntTy,
2886 Loc)};
2887 unsigned Idx = 0;
2888 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2889 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2890 // Global = Buffer.VD[Idx];
2891 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2892 const FieldDecl *FD = VarFieldMap.lookup(VD);
2893 LValue GlobLVal = CGF.EmitLValueForField(
2894 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2895 Address GlobAddr = GlobLVal.getAddress(CGF);
2896 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2897 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2898 llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
2899 CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
2900 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2901 // Store array size.
2902 ++Idx;
2903 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2904 llvm::Value *Size = CGF.Builder.CreateIntCast(
2905 CGF.getVLASize(
2906 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2907 .NumElts,
2908 CGF.SizeTy, /*isSigned=*/false);
2909 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2910 Elem);
2911 }
2912 }
2913
2914 // Call reduce_function(ReduceList, GlobalReduceList)
2915 llvm::Value *GlobalReduceList =
2916 CGF.EmitCastToVoidPtr(ReductionList.getPointer());
2917 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2918 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2919 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2920 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2921 CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
2922 CGF.FinishFunction();
2923 return Fn;
2924}
2925
2926///
2927/// Design of OpenMP reductions on the GPU
2928///
2929/// Consider a typical OpenMP program with one or more reduction
2930/// clauses:
2931///
2932/// float foo;
2933/// double bar;
2934/// #pragma omp target teams distribute parallel for \
2935/// reduction(+:foo) reduction(*:bar)
2936/// for (int i = 0; i < N; i++) {
2937/// foo += A[i]; bar *= B[i];
2938/// }
2939///
2940/// where 'foo' and 'bar' are reduced across all OpenMP threads in
2941/// all teams. In our OpenMP implementation on the NVPTX device an
2942/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
2943/// within a team are mapped to CUDA threads within a threadblock.
2944/// Our goal is to efficiently aggregate values across all OpenMP
2945/// threads such that:
2946///
2947/// - the compiler and runtime are logically concise, and
2948/// - the reduction is performed efficiently in a hierarchical
2949/// manner as follows: within OpenMP threads in the same warp,
2950/// across warps in a threadblock, and finally across teams on
2951/// the NVPTX device.
2952///
2953/// Introduction to Decoupling
2954///
2955/// We would like to decouple the compiler and the runtime so that the
2956/// latter is ignorant of the reduction variables (number, data types)
2957/// and the reduction operators. This allows a simpler interface
2958/// and implementation while still attaining good performance.
2959///
2960/// Pseudocode for the aforementioned OpenMP program generated by the
2961/// compiler is as follows:
2962///
2963/// 1. Create private copies of reduction variables on each OpenMP
2964/// thread: 'foo_private', 'bar_private'
2965/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
2966/// to it and writes the result in 'foo_private' and 'bar_private'
2967/// respectively.
2968/// 3. Call the OpenMP runtime on the GPU to reduce within a team
2969/// and store the result on the team master:
2970///
2971/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
2972/// reduceData, shuffleReduceFn, interWarpCpyFn)
2973///
2974/// where:
2975/// struct ReduceData {
2976/// float *foo;
2977/// double *bar;
2978/// } reduceData
2979/// reduceData.foo = &foo_private
2980/// reduceData.bar = &bar_private
2981///
2982/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
2983/// auxiliary functions generated by the compiler that operate on
2984/// variables of type 'ReduceData'. They help the runtime perform the
2985/// algorithmic steps in a data-agnostic manner.
2986///
2987/// 'shuffleReduceFn' is a pointer to a function that reduces data
2988/// of type 'ReduceData' across two OpenMP threads (lanes) in the
2989/// same warp. It takes the following arguments as input:
2990///
2991/// a. variable of type 'ReduceData' on the calling lane,
2992/// b. its lane_id,
2993/// c. an offset relative to the current lane_id to generate a
2994/// remote_lane_id. The remote lane contains the second
2995/// variable of type 'ReduceData' that is to be reduced.
2996/// d. an algorithm version parameter determining which reduction
2997/// algorithm to use.
2998///
2999/// 'shuffleReduceFn' retrieves data from the remote lane using
3000/// efficient GPU shuffle intrinsics and reduces, using the
3001/// algorithm specified by the 4th parameter, the two operands
3002/// element-wise. The result is written to the first operand.
3003///
3004/// Different reduction algorithms are implemented in different
3005/// runtime functions, all calling 'shuffleReduceFn' to perform
3006/// the essential reduction step. Therefore, based on the 4th
3007/// parameter, this function behaves slightly differently to
3008/// cooperate with the runtime to ensure correctness under
3009/// different circumstances.
3010///
3011/// 'InterWarpCpyFn' is a pointer to a function that transfers
3012/// reduced variables across warps. It tunnels, through CUDA
3013/// shared memory, the thread-private data of type 'ReduceData'
3014/// from lane 0 of each warp to a lane in the first warp.
3015/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
3016/// The last team writes the global reduced value to memory.
3017///
3018/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
3019/// reduceData, shuffleReduceFn, interWarpCpyFn,
3020/// scratchpadCopyFn, loadAndReduceFn)
3021///
3022/// 'scratchpadCopyFn' is a helper that stores reduced
3023/// data from the team master to a scratchpad array in
3024/// global memory.
3025///
3026/// 'loadAndReduceFn' is a helper that loads data from
3027/// the scratchpad array and reduces it with the input
3028/// operand.
3029///
3030/// These compiler generated functions hide address
3031/// calculation and alignment information from the runtime.
3032/// 5. if ret == 1:
3033/// The team master of the last team stores the reduced
3034/// result to the globals in memory.
3035/// foo += reduceData.foo; bar *= reduceData.bar
3036///
3037///
3038/// Warp Reduction Algorithms
3039///
3040/// On the warp level, we have three algorithms implemented in the
3041/// OpenMP runtime depending on the number of active lanes:
3042///
3043/// Full Warp Reduction
3044///
3045/// The reduce algorithm within a warp where all lanes are active
3046/// is implemented in the runtime as follows:
3047///
3048/// full_warp_reduce(void *reduce_data,
3049/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3050/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
3051/// ShuffleReduceFn(reduce_data, 0, offset, 0);
3052/// }
3053///
3054/// The algorithm completes in log(2, WARPSIZE) steps.
3055///
3056/// 'ShuffleReduceFn' is called here with lane_id set to 0 because the
3057/// lane_id is not used by this algorithm; we save instructions by not
3058/// retrieving it from the corresponding special registers. The 4th parameter, which
3059/// represents the version of the algorithm being used, is set to 0 to
3060/// signify full warp reduction.
3061///
3062/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3063///
3064/// #reduce_elem refers to an element in the local lane's data structure
3065/// #remote_elem is retrieved from a remote lane
3066/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3067/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
3068///
3069/// Contiguous Partial Warp Reduction
3070///
3071/// This reduce algorithm is used within a warp where only the first
3072/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
3073/// number of OpenMP threads in a parallel region is not a multiple of
3074/// WARPSIZE. The algorithm is implemented in the runtime as follows:
3075///
3076/// void
3077/// contiguous_partial_reduce(void *reduce_data,
3078/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
3079/// int size, int lane_id) {
3080/// int curr_size;
3081/// int offset;
3082/// curr_size = size;
3083/// offset = curr_size/2;
3084/// while (offset>0) {
3085/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
3086/// curr_size = (curr_size+1)/2;
3087/// offset = curr_size/2;
3088/// }
3089/// }
3090///
3091/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3092///
3093/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3094/// if (lane_id < offset)
3095/// reduce_elem = reduce_elem REDUCE_OP remote_elem
3096/// else
3097/// reduce_elem = remote_elem
3098///
3099/// This algorithm assumes that the data to be reduced are located in a
3100/// contiguous subset of lanes starting from the first. When there is
3101/// an odd number of active lanes, the data in the last lane is not
3102/// aggregated with any other lane's data but is instead copied over.
3103///
3104/// Dispersed Partial Warp Reduction
3105///
3106/// This algorithm is used within a warp when any discontiguous subset of
3107/// lanes are active. It is used to implement the reduction operation
3108/// across lanes in an OpenMP simd region or in a nested parallel region.
3109///
3110/// void
3111/// dispersed_partial_reduce(void *reduce_data,
3112/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3113/// int size, remote_id;
3114/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
3115/// do {
3116/// remote_id = next_active_lane_id_right_after_me();
3117/// # the above function returns 0 if no active lane
3118/// # is present right after the current lane.
3119/// size = number_of_active_lanes_in_this_warp();
3120/// logical_lane_id /= 2;
3121/// ShuffleReduceFn(reduce_data, logical_lane_id,
3122/// remote_id-1-threadIdx.x, 2);
3123/// } while (logical_lane_id % 2 == 0 && size > 1);
3124/// }
3125///
3126/// There is no assumption made about the initial state of the reduction.
3127/// Any number of lanes (>=1) could be active at any position. The reduction
3128/// result is returned in the first active lane.
3129///
3130/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3131///
3132/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3133/// if (lane_id % 2 == 0 && offset > 0)
3134/// reduce_elem = reduce_elem REDUCE_OP remote_elem
3135/// else
3136/// reduce_elem = remote_elem
3137///
3138///
3139/// Intra-Team Reduction
3140///
3141/// This function, as implemented in the runtime call
3142/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
3143/// threads in a team. It first reduces within a warp using the
3144/// aforementioned algorithms. We then proceed to gather all such
3145/// reduced values at the first warp.
3146///
3147/// The runtime makes use of the function 'InterWarpCpyFn', which copies
3148/// data from each of the "warp masters" (the zeroth lane of each warp, where
3149/// the warp-reduced data is held) to the zeroth warp. This step reduces (in
3150/// a mathematical sense) the problem of reduction across warp masters in
3151/// a block to the problem of warp reduction.
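///
/// As a rough sketch (illustrative only, not the exact runtime code), the
/// intra-team step can be pictured as:
///
///   warp_reduce(reduce_data, ShuffleReduceFn);   // one of the three warp
///                                                // algorithms described above
///   InterWarpCpyFn(reduce_data, num_warps);      // lane 0 of each warp stages its
///                                                // result in shared memory; the
///                                                // first warp reads it back
///   if (warp_id == 0)
///     warp_reduce(reduce_data, ShuffleReduceFn); // reduce the gathered values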
3152///
3153///
3154/// Inter-Team Reduction
3155///
3156/// Once a team has reduced its data to a single value, it is stored in
3157/// a global scratchpad array. Since each team has a distinct slot, this
3158/// can be done without locking.
3159///
3160/// The last team to write to the scratchpad array proceeds to reduce the
3161/// scratchpad array. One or more workers in the last team use the helper
3162/// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
3163/// the k'th worker reduces every k'th element.
3164///
3165/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
3166/// reduce across workers and compute a globally reduced value.
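///
/// A rough sketch of the inter-team step (illustrative only; helper names are
/// the ones introduced above):
///
///   scratchpad[team_id] = reduce_data;                    // via scratchpadCopyFn
///   if (this_team_is_last_to_write()) {
///     for (k = thread_id; k < num_teams; k += num_threads)
///       reduce_data = reduce_data REDUCE_OP scratchpad[k]; // loadAndReduceFn
///     // reduce across the workers of the last team:
///     __kmpc_nvptx_parallel_reduce_nowait_v2(..., reduce_data, ...);
///   }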
3167///
3168void CGOpenMPRuntimeGPU::emitReduction(
3169 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
3170 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
3171 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
3172 if (!CGF.HaveInsertPoint())
1
Taking false branch
3173 return;
3174
3175 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
3176#ifndef NDEBUG
3177 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
3178#endif
3179
3180 if (Options.SimpleReduction) {
2
Assuming field 'SimpleReduction' is false
3181 assert(!TeamsReduction && !ParallelReduction &&(static_cast <bool> (!TeamsReduction && !ParallelReduction
&& "Invalid reduction selection in emitReduction.") ?
void (0) : __assert_fail ("!TeamsReduction && !ParallelReduction && \"Invalid reduction selection in emitReduction.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3182, __extension__
__PRETTY_FUNCTION__))
3182 "Invalid reduction selection in emitReduction.")(static_cast <bool> (!TeamsReduction && !ParallelReduction
&& "Invalid reduction selection in emitReduction.") ?
void (0) : __assert_fail ("!TeamsReduction && !ParallelReduction && \"Invalid reduction selection in emitReduction.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3182, __extension__
__PRETTY_FUNCTION__))
;
3183 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
3184 ReductionOps, Options);
3185 return;
3186 }
3187
3188 assert((TeamsReduction || ParallelReduction) &&(static_cast <bool> ((TeamsReduction || ParallelReduction
) && "Invalid reduction selection in emitReduction.")
? void (0) : __assert_fail ("(TeamsReduction || ParallelReduction) && \"Invalid reduction selection in emitReduction.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3189, __extension__
__PRETTY_FUNCTION__))
3
Taking false branch
4
Assuming 'TeamsReduction' is false
5
Assuming 'ParallelReduction' is true
6
'?' condition is true
3189 "Invalid reduction selection in emitReduction.")(static_cast <bool> ((TeamsReduction || ParallelReduction
) && "Invalid reduction selection in emitReduction.")
? void (0) : __assert_fail ("(TeamsReduction || ParallelReduction) && \"Invalid reduction selection in emitReduction.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3189, __extension__
__PRETTY_FUNCTION__))
;
3190
3191 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3192 // RedList, shuffle_reduce_func, interwarp_copy_func);
3193 // or
3194 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3195 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
3196 llvm::Value *ThreadId = getThreadID(CGF, Loc);
3197
3198 llvm::Value *Res;
3199 ASTContext &C = CGM.getContext();
3200 // 1. Build a list of reduction variables.
3201 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3202 auto Size = RHSExprs.size();
3203 for (const Expr *E : Privates) {
7
Assuming '__begin1' is equal to '__end1'
3204 if (E->getType()->isVariablyModifiedType())
3205 // Reserve place for array size.
3206 ++Size;
3207 }
3208 llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
3209 QualType ReductionArrayTy =
3210 C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
3211 /*IndexTypeQuals=*/0);
3212 Address ReductionList =
3213 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3214 auto IPriv = Privates.begin();
3215 unsigned Idx = 0;
3216 for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
8
Assuming 'I' is >= 'E'
9
Loop condition is false. Execution continues on line 3236
3217 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3218 CGF.Builder.CreateStore(
3219 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3220 CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
3221 Elem);
3222 if ((*IPriv)->getType()->isVariablyModifiedType()) {
3223 // Store array size.
3224 ++Idx;
3225 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3226 llvm::Value *Size = CGF.Builder.CreateIntCast(
3227 CGF.getVLASize(
3228 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3229 .NumElts,
3230 CGF.SizeTy, /*isSigned=*/false);
3231 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3232 Elem);
3233 }
3234 }
3235
3236 llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3237 ReductionList.getPointer(), CGF.VoidPtrTy);
3238 llvm::Function *ReductionFn = emitReductionFunction(
3239 Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
3240 LHSExprs, RHSExprs, ReductionOps);
3241 llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
3242 llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
3243 CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
3244 llvm::Value *InterWarpCopyFn =
3245 emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
10
Calling 'emitInterWarpCopyFunction'
3246
3247 if (ParallelReduction) {
3248 llvm::Value *Args[] = {RTLoc,
3249 ThreadId,
3250 CGF.Builder.getInt32(RHSExprs.size()),
3251 ReductionArrayTySize,
3252 RL,
3253 ShuffleAndReduceFn,
3254 InterWarpCopyFn};
3255
3256 Res = CGF.EmitRuntimeCall(
3257 OMPBuilder.getOrCreateRuntimeFunction(
3258 CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
3259 Args);
3260 } else {
3261 assert(TeamsReduction && "expected teams reduction.")(static_cast <bool> (TeamsReduction && "expected teams reduction."
) ? void (0) : __assert_fail ("TeamsReduction && \"expected teams reduction.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3261, __extension__
__PRETTY_FUNCTION__))
;
3262 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
3263 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
3264 int Cnt = 0;
3265 for (const Expr *DRE : Privates) {
3266 PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
3267 ++Cnt;
3268 }
3269 const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
3270 CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
3271 C.getLangOpts().OpenMPCUDAReductionBufNum);
3272 TeamsReductions.push_back(TeamReductionRec);
3273 if (!KernelTeamsReductionPtr) {
3274 KernelTeamsReductionPtr = new llvm::GlobalVariable(
3275 CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
3276 llvm::GlobalValue::InternalLinkage, nullptr,
3277 "_openmp_teams_reductions_buffer_$_$ptr");
3278 }
3279 llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
3280 Address(KernelTeamsReductionPtr, CGM.getPointerAlign()),
3281 /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
3282 llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
3283 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3284 llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
3285 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3286 ReductionFn);
3287 llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
3288 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3289 llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
3290 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3291 ReductionFn);
3292
3293 llvm::Value *Args[] = {
3294 RTLoc,
3295 ThreadId,
3296 GlobalBufferPtr,
3297 CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
3298 RL,
3299 ShuffleAndReduceFn,
3300 InterWarpCopyFn,
3301 GlobalToBufferCpyFn,
3302 GlobalToBufferRedFn,
3303 BufferToGlobalCpyFn,
3304 BufferToGlobalRedFn};
3305
3306 Res = CGF.EmitRuntimeCall(
3307 OMPBuilder.getOrCreateRuntimeFunction(
3308 CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
3309 Args);
3310 }
3311
3312 // 5. Build if (res == 1)
3313 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
3314 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
3315 llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
3316 Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
3317 CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3318
3319 // 6. Build then branch: where we have reduced values in the master
3320 // thread in each team.
3321 // __kmpc_end_reduce{_nowait}(<gtid>);
3322 // break;
3323 CGF.EmitBlock(ThenBB);
3324
3325 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3326 auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
3327 this](CodeGenFunction &CGF, PrePostActionTy &Action) {
3328 auto IPriv = Privates.begin();
3329 auto ILHS = LHSExprs.begin();
3330 auto IRHS = RHSExprs.begin();
3331 for (const Expr *E : ReductionOps) {
3332 emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
3333 cast<DeclRefExpr>(*IRHS));
3334 ++IPriv;
3335 ++ILHS;
3336 ++IRHS;
3337 }
3338 };
3339 llvm::Value *EndArgs[] = {ThreadId};
3340 RegionCodeGenTy RCG(CodeGen);
3341 NVPTXActionTy Action(
3342 nullptr, llvm::None,
3343 OMPBuilder.getOrCreateRuntimeFunction(
3344 CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
3345 EndArgs);
3346 RCG.setAction(Action);
3347 RCG(CGF);
3348 // There is no need to emit line number for unconditional branch.
3349 (void)ApplyDebugLocation::CreateEmpty(CGF);
3350 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
3351}
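// In outline, for a parallel reduction the code emitted by emitReduction for
// the foo/bar example in the design comment above has the following shape.
// This is a sketch for illustration; variable names are descriptive only.
//
//   void *red_list[2] = { &foo_private, &bar_private };
//   int res = __kmpc_nvptx_parallel_reduce_nowait_v2(
//       loc, gtid, /*n=*/2, sizeof(red_list), red_list,
//       shuffle_and_reduce_func, inter_warp_copy_func);
//   if (res == 1) {
//     foo += foo_private;                 // combiners from ReductionOps
//     bar *= bar_private;
//     __kmpc_nvptx_end_reduce_nowait(gtid);
//   }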
3352
3353const VarDecl *
3354CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
3355 const VarDecl *NativeParam) const {
3356 if (!NativeParam->getType()->isReferenceType())
3357 return NativeParam;
3358 QualType ArgType = NativeParam->getType();
3359 QualifierCollector QC;
3360 const Type *NonQualTy = QC.strip(ArgType);
3361 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3362 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
3363 if (Attr->getCaptureKind() == OMPC_map) {
3364 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3365 LangAS::opencl_global);
3366 }
3367 }
3368 ArgType = CGM.getContext().getPointerType(PointeeTy);
3369 QC.addRestrict();
3370 enum { NVPTX_local_addr = 5 };
3371 QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
3372 ArgType = QC.apply(CGM.getContext(), ArgType);
3373 if (isa<ImplicitParamDecl>(NativeParam))
3374 return ImplicitParamDecl::Create(
3375 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3376 NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
3377 return ParmVarDecl::Create(
3378 CGM.getContext(),
3379 const_cast<DeclContext *>(NativeParam->getDeclContext()),
3380 NativeParam->getBeginLoc(), NativeParam->getLocation(),
3381 NativeParam->getIdentifier(), ArgType,
3382 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3383}
3384
3385Address
3386CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
3387 const VarDecl *NativeParam,
3388 const VarDecl *TargetParam) const {
3389 assert(NativeParam != TargetParam &&(static_cast <bool> (NativeParam != TargetParam &&
NativeParam->getType()->isReferenceType() && "Native arg must not be the same as target arg."
) ? void (0) : __assert_fail ("NativeParam != TargetParam && NativeParam->getType()->isReferenceType() && \"Native arg must not be the same as target arg.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3391, __extension__
__PRETTY_FUNCTION__))
3390 NativeParam->getType()->isReferenceType() &&(static_cast <bool> (NativeParam != TargetParam &&
NativeParam->getType()->isReferenceType() && "Native arg must not be the same as target arg."
) ? void (0) : __assert_fail ("NativeParam != TargetParam && NativeParam->getType()->isReferenceType() && \"Native arg must not be the same as target arg.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3391, __extension__
__PRETTY_FUNCTION__))
3391 "Native arg must not be the same as target arg.")(static_cast <bool> (NativeParam != TargetParam &&
NativeParam->getType()->isReferenceType() && "Native arg must not be the same as target arg."
) ? void (0) : __assert_fail ("NativeParam != TargetParam && NativeParam->getType()->isReferenceType() && \"Native arg must not be the same as target arg.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3391, __extension__
__PRETTY_FUNCTION__))
;
3392 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
3393 QualType NativeParamType = NativeParam->getType();
3394 QualifierCollector QC;
3395 const Type *NonQualTy = QC.strip(NativeParamType);
3396 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3397 unsigned NativePointeeAddrSpace =
3398 CGF.getContext().getTargetAddressSpace(NativePointeeTy);
3399 QualType TargetTy = TargetParam->getType();
3400 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
3401 LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
3402 // First cast to generic.
3403 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3404 TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
3405 /*AddrSpace=*/0));
3406 // Cast from generic to native address space.
3407 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3408 TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
3409 NativePointeeAddrSpace));
3410 Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3411 CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
3412 NativeParamType);
3413 return NativeParamAddr;
3414}
3415
3416void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3417 CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3418 ArrayRef<llvm::Value *> Args) const {
3419 SmallVector<llvm::Value *, 4> TargetArgs;
3420 TargetArgs.reserve(Args.size());
3421 auto *FnType = OutlinedFn.getFunctionType();
3422 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3423 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3424 TargetArgs.append(std::next(Args.begin(), I), Args.end());
3425 break;
3426 }
3427 llvm::Type *TargetType = FnType->getParamType(I);
3428 llvm::Value *NativeArg = Args[I];
3429 if (!TargetType->isPointerTy()) {
3430 TargetArgs.emplace_back(NativeArg);
3431 continue;
3432 }
3433 llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3434 NativeArg,
3435 NativeArg->getType()->getPointerElementType()->getPointerTo());
3436 TargetArgs.emplace_back(
3437 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
3438 }
3439 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
3440}
3441
3442/// Emit a function which wraps the outlined parallel region
3443/// and controls the arguments which are passed to this function.
3444/// The wrapper ensures that the outlined function is called
3445/// with the correct arguments when data is shared.
3446llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3447 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
3448 ASTContext &Ctx = CGM.getContext();
3449 const auto &CS = *D.getCapturedStmt(OMPD_parallel);
3450
3451 // Create a function that takes as argument the source thread.
3452 FunctionArgList WrapperArgs;
3453 QualType Int16QTy =
3454 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
3455 QualType Int32QTy =
3456 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
3457 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3458 /*Id=*/nullptr, Int16QTy,
3459 ImplicitParamDecl::Other);
3460 ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3461 /*Id=*/nullptr, Int32QTy,
3462 ImplicitParamDecl::Other);
3463 WrapperArgs.emplace_back(&ParallelLevelArg);
3464 WrapperArgs.emplace_back(&WrapperArg);
3465
3466 const CGFunctionInfo &CGFI =
3467 CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
3468
3469 auto *Fn = llvm::Function::Create(
3470 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3471 Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
3472
3473 // Ensure we do not inline the function. This is trivially true for the ones
3474 // passed to __kmpc_fork_call but the ones called in serialized regions
3475 // could be inlined. This is not perfect, but it is closer to the invariant
3476 // we want, namely, every data environment starts with a new function.
3477 // TODO: We should pass the if condition to the runtime function and do the
3478 // handling there. Much cleaner code.
3479 Fn->addFnAttr(llvm::Attribute::NoInline);
3480
3481 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3482 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
3483 Fn->setDoesNotRecurse();
3484
3485 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
3486 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
3487 D.getBeginLoc(), D.getBeginLoc());
3488
3489 const auto *RD = CS.getCapturedRecordDecl();
3490 auto CurField = RD->field_begin();
3491
3492 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
3493 /*Name=*/".zero.addr");
3494 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
3495 // Get the array of arguments.
3496 SmallVector<llvm::Value *, 8> Args;
3497
3498 Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
3499 Args.emplace_back(ZeroAddr.getPointer());
3500
3501 CGBuilderTy &Bld = CGF.Builder;
3502 auto CI = CS.capture_begin();
3503
3504 // Use global memory for data sharing.
3505 // Handle passing of global args to workers.
3506 Address GlobalArgs =
3507 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
3508 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
3509 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3510 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3511 CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
3512 DataSharingArgs);
3513
3514 // Retrieve the shared variables from the list of references returned
3515 // by the runtime. Pass the variables to the outlined function.
3516 Address SharedArgListAddress = Address::invalid();
3517 if (CS.capture_size() > 0 ||
3518 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3519 SharedArgListAddress = CGF.EmitLoadOfPointer(
3520 GlobalArgs, CGF.getContext()
3521 .getPointerType(CGF.getContext().getPointerType(
3522 CGF.getContext().VoidPtrTy))
3523 .castAs<PointerType>());
3524 }
3525 unsigned Idx = 0;
3526 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3527 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3528 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3529 Src, CGF.SizeTy->getPointerTo());
3530 llvm::Value *LB = CGF.EmitLoadOfScalar(
3531 TypedAddress,
3532 /*Volatile=*/false,
3533 CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
3534 cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
3535 Args.emplace_back(LB);
3536 ++Idx;
3537 Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3538 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3539 Src, CGF.SizeTy->getPointerTo());
3540 llvm::Value *UB = CGF.EmitLoadOfScalar(
3541 TypedAddress,
3542 /*Volatile=*/false,
3543 CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
3544 cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
3545 Args.emplace_back(UB);
3546 ++Idx;
3547 }
3548 if (CS.capture_size() > 0) {
3549 ASTContext &CGFContext = CGF.getContext();
3550 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3551 QualType ElemTy = CurField->getType();
3552 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
3553 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3554 Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
3555 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3556 /*Volatile=*/false,
3557 CGFContext.getPointerType(ElemTy),
3558 CI->getLocation());
3559 if (CI->capturesVariableByCopy() &&
3560 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
3561 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3562 CI->getLocation());
3563 }
3564 Args.emplace_back(Arg);
3565 }
3566 }
3567
3568 emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
3569 CGF.FinishFunction();
3570 return Fn;
3571}
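// The wrapper emitted here corresponds roughly to the following C-like
// pseudocode. This is an illustrative sketch; parameter and variable names are
// descriptive and not the ones used in the generated code.
//
//   void outlined_parallel_fn_wrapper(uint16_t parallel_level, uint32_t tid) {
//     int32_t zero = 0;
//     void **shared_args;
//     __kmpc_get_shared_variables(&shared_args);
//     // Loop bounds (for loop-bound-sharing directives) and the captured
//     // variables are read out of shared_args, cast to their expected types,
//     // and forwarded to the outlined parallel function.
//     outlined_parallel_fn(&tid, &zero, /* shared_args[0..k], cast */ ...);
//   }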
3572
3573void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
3574 const Decl *D) {
3575 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3576 return;
3577
3578 assert(D && "Expected function or captured|block decl.")(static_cast <bool> (D && "Expected function or captured|block decl."
) ? void (0) : __assert_fail ("D && \"Expected function or captured|block decl.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3578, __extension__
__PRETTY_FUNCTION__))
;
3579 assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&(static_cast <bool> (FunctionGlobalizedDecls.count(CGF.
CurFn) == 0 && "Function is registered already.") ? void
(0) : __assert_fail ("FunctionGlobalizedDecls.count(CGF.CurFn) == 0 && \"Function is registered already.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3580, __extension__
__PRETTY_FUNCTION__))
3580 "Function is registered already.")(static_cast <bool> (FunctionGlobalizedDecls.count(CGF.
CurFn) == 0 && "Function is registered already.") ? void
(0) : __assert_fail ("FunctionGlobalizedDecls.count(CGF.CurFn) == 0 && \"Function is registered already.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3580, __extension__
__PRETTY_FUNCTION__))
;
3581 assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&(static_cast <bool> ((!TeamAndReductions.first || TeamAndReductions
.first == D) && "Team is set but not processed.") ? void
(0) : __assert_fail ("(!TeamAndReductions.first || TeamAndReductions.first == D) && \"Team is set but not processed.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3582, __extension__
__PRETTY_FUNCTION__))
3582 "Team is set but not processed.")(static_cast <bool> ((!TeamAndReductions.first || TeamAndReductions
.first == D) && "Team is set but not processed.") ? void
(0) : __assert_fail ("(!TeamAndReductions.first || TeamAndReductions.first == D) && \"Team is set but not processed.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3582, __extension__
__PRETTY_FUNCTION__))
;
3583 const Stmt *Body = nullptr;
3584 bool NeedToDelayGlobalization = false;
3585 if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
3586 Body = FD->getBody();
3587 } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
3588 Body = BD->getBody();
3589 } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
3590 Body = CD->getBody();
3591 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
3592 if (NeedToDelayGlobalization &&
3593 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
3594 return;
3595 }
3596 if (!Body)
3597 return;
3598 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
3599 VarChecker.Visit(Body);
3600 const RecordDecl *GlobalizedVarsRecord =
3601 VarChecker.getGlobalizedRecord(IsInTTDRegion);
3602 TeamAndReductions.first = nullptr;
3603 TeamAndReductions.second.clear();
3604 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
3605 VarChecker.getEscapedVariableLengthDecls();
3606 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
3607 return;
3608 auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
3609 I->getSecond().MappedParams =
3610 std::make_unique<CodeGenFunction::OMPMapVars>();
3611 I->getSecond().EscapedParameters.insert(
3612 VarChecker.getEscapedParameters().begin(),
3613 VarChecker.getEscapedParameters().end());
3614 I->getSecond().EscapedVariableLengthDecls.append(
3615 EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
3616 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3617 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3618 assert(VD->isCanonicalDecl() && "Expected canonical declaration")(static_cast <bool> (VD->isCanonicalDecl() &&
"Expected canonical declaration") ? void (0) : __assert_fail
("VD->isCanonicalDecl() && \"Expected canonical declaration\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3618, __extension__
__PRETTY_FUNCTION__))
;
3619 Data.insert(std::make_pair(VD, MappedVarData()));
3620 }
3621 if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
3622 CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
3623 VarChecker.Visit(Body);
3624 I->getSecond().SecondaryLocalVarData.emplace();
3625 DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
3626 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3627 assert(VD->isCanonicalDecl() && "Expected canonical declaration")(static_cast <bool> (VD->isCanonicalDecl() &&
"Expected canonical declaration") ? void (0) : __assert_fail
("VD->isCanonicalDecl() && \"Expected canonical declaration\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3627, __extension__
__PRETTY_FUNCTION__))
;
3628 Data.insert(std::make_pair(VD, MappedVarData()));
3629 }
3630 }
3631 if (!NeedToDelayGlobalization) {
3632 emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
3633 struct GlobalizationScope final : EHScopeStack::Cleanup {
3634 GlobalizationScope() = default;
3635
3636 void Emit(CodeGenFunction &CGF, Flags flags) override {
3637 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
3638 .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
3639 }
3640 };
3641 CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
3642 }
3643}
3644
3645Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
3646 const VarDecl *VD) {
3647 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
3648 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3649 auto AS = LangAS::Default;
3650 switch (A->getAllocatorType()) {
3651 // Use the default allocator here as by default local vars are
3652 // threadlocal.
3653 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3654 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3655 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3656 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3657 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3658 // Follow the user decision - use default allocation.
3659 return Address::invalid();
3660 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3661 // TODO: implement support for user-defined allocators.
3662 return Address::invalid();
3663 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3664 AS = LangAS::cuda_constant;
3665 break;
3666 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3667 AS = LangAS::cuda_shared;
3668 break;
3669 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3670 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3671 break;
3672 }
3673 llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
3674 auto *GV = new llvm::GlobalVariable(
3675 CGM.getModule(), VarTy, /*isConstant=*/false,
3676 llvm::GlobalValue::InternalLinkage, llvm::Constant::getNullValue(VarTy),
3677 VD->getName(),
3678 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
3679 CGM.getContext().getTargetAddressSpace(AS));
3680 CharUnits Align = CGM.getContext().getDeclAlign(VD);
3681 GV->setAlignment(Align.getAsAlign());
3682 return Address(
3683 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3684 GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
3685 VD->getType().getAddressSpace()))),
3686 Align);
3687 }
3688
3689 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3690 return Address::invalid();
3691
3692 VD = VD->getCanonicalDecl();
3693 auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
3694 if (I == FunctionGlobalizedDecls.end())
3695 return Address::invalid();
3696 auto VDI = I->getSecond().LocalVarData.find(VD);
3697 if (VDI != I->getSecond().LocalVarData.end())
3698 return VDI->second.PrivateAddr;
3699 if (VD->hasAttrs()) {
3700 for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
3701 E(VD->attr_end());
3702 IT != E; ++IT) {
3703 auto VDI = I->getSecond().LocalVarData.find(
3704 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3705 ->getCanonicalDecl());
3706 if (VDI != I->getSecond().LocalVarData.end())
3707 return VDI->second.PrivateAddr;
3708 }
3709 }
3710
3711 return Address::invalid();
3712}
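// For example (a sketch of the effect, not emitted verbatim): a local variable
// declared with
//   int x;
//   #pragma omp allocate(x) allocator(omp_pteam_mem_alloc)
// takes the OMPPTeamMemAlloc path above and is lowered to an internal,
// zero-initialized global in CUDA shared memory, whose address is then
// addrspacecast back to the address space expected for 'x'.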
3713
3714void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
3715 FunctionGlobalizedDecls.erase(CGF.CurFn);
3716 CGOpenMPRuntime::functionFinished(CGF);
3717}
3718
3719void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
3720 CodeGenFunction &CGF, const OMPLoopDirective &S,
3721 OpenMPDistScheduleClauseKind &ScheduleKind,
3722 llvm::Value *&Chunk) const {
3723 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
3724 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
3725 ScheduleKind = OMPC_DIST_SCHEDULE_static;
3726 Chunk = CGF.EmitScalarConversion(
3727 RT.getGPUNumThreads(CGF),
3728 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3729 S.getIterationVariable()->getType(), S.getBeginLoc());
3730 return;
3731 }
3732 CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
3733 CGF, S, ScheduleKind, Chunk);
3734}
3735
3736void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
3737 CodeGenFunction &CGF, const OMPLoopDirective &S,
3738 OpenMPScheduleClauseKind &ScheduleKind,
3739 const Expr *&ChunkExpr) const {
3740 ScheduleKind = OMPC_SCHEDULE_static;
3741 // Chunk size is 1 in this case.
3742 llvm::APInt ChunkSize(32, 1);
3743 ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
3744 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3745 SourceLocation());
3746}
3747
3748void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
3749 CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
3750 assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
3751 " Expected target-based directive.");
3752 const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
3753 for (const CapturedStmt::Capture &C : CS->captures()) {
3754 // Capture variables captured by reference in lambdas for target-based
3755 // directives.
3756 if (!C.capturesVariable())
3757 continue;
3758 const VarDecl *VD = C.getCapturedVar();
3759 const auto *RD = VD->getType()
3760 .getCanonicalType()
3761 .getNonReferenceType()
3762 ->getAsCXXRecordDecl();
3763 if (!RD || !RD->isLambda())
3764 continue;
3765 Address VDAddr = CGF.GetAddrOfLocalVar(VD);
3766 LValue VDLVal;
3767 if (VD->getType().getCanonicalType()->isReferenceType())
3768 VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
3769 else
3770 VDLVal = CGF.MakeAddrLValue(
3771 VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
3772 llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
3773 FieldDecl *ThisCapture = nullptr;
3774 RD->getCaptureFields(Captures, ThisCapture);
3775 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
3776 LValue ThisLVal =
3777 CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
3778 llvm::Value *CXXThis = CGF.LoadCXXThis();
3779 CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
3780 }
3781 for (const LambdaCapture &LC : RD->captures()) {
3782 if (LC.getCaptureKind() != LCK_ByRef)
3783 continue;
3784 const VarDecl *VD = LC.getCapturedVar();
3785 if (!CS->capturesVariable(VD))
3786 continue;
3787 auto It = Captures.find(VD);
3788 assert(It != Captures.end() && "Found lambda capture without field.");
3789 LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
3790 Address VDAddr = CGF.GetAddrOfLocalVar(VD);
3791 if (VD->getType().getCanonicalType()->isReferenceType())
3792 VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
3793 VD->getType().getCanonicalType())
3794 .getAddress(CGF);
3795 CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
3796 }
3797 }
3798}
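
The loop above exists so that lambdas used in target regions still see the device copies of anything they capture by reference: for each by-reference capture, the corresponding field of the lambda object is overwritten with the address of the mapped variable. A hedged example of the kind of user code that needs this fix-up; the names below are illustrative:

// lambda_byref.cpp -- hypothetical offload code; 'scale' and 'data' are
// captured by reference, so their fields inside the lambda object must be
// repointed to the device instances before 'kernel' runs on the device.
void run(int n, int *data) {
  int scale = 2;
  auto kernel = [&](int i) { data[i] *= scale; };
  #pragma omp target parallel for map(tofrom : data[0 : n])
  for (int i = 0; i < n; ++i)
    kernel(i);
}
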
3799
3800bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
3801 LangAS &AS) {
3802 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
3803 return false;
3804 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3805 switch (A->getAllocatorType()) {
3806 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3807 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3808 // Not supported, fall back to the default mem space.
3809 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3810 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3811 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3812 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3813 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3814 AS = LangAS::Default;
3815 return true;
3816 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3817 AS = LangAS::cuda_constant;
3818 return true;
3819 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3820 AS = LangAS::cuda_shared;
3821 return true;
3822 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3823 llvm_unreachable("Expected predefined allocator for the variables with the "
3824 "static storage.");
3825 }
3826 return false;
3827}
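
The switch above maps the predefined OpenMP allocators to GPU address spaces: omp_const_mem_alloc goes to constant memory, omp_pteam_mem_alloc to shared memory, the remaining predefined allocators to the default (global) space, and a user-defined allocator is rejected for static storage. A minimal sketch of declarations that would exercise this mapping, assuming an offloading-enabled compile; the variable names are invented:

// allocators.cpp -- hypothetical static variables with predefined allocators.
#include <omp.h>

#pragma omp declare target
static float team_scratch[256];        // expected to land in shared memory
#pragma omp allocate(team_scratch) allocator(omp_pteam_mem_alloc)

static float lut[64];                  // expected to land in constant memory
#pragma omp allocate(lut) allocator(omp_const_mem_alloc)
#pragma omp end declare target
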
3828
3829// Get the current CudaArch, ignoring any unknown values.
3830static CudaArch getCudaArch(CodeGenModule &CGM) {
3831 if (!CGM.getTarget().hasFeature("ptx"))
3832 return CudaArch::UNKNOWN;
3833 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
3834 if (Feature.getValue()) {
3835 CudaArch Arch = StringToCudaArch(Feature.getKey());
3836 if (Arch != CudaArch::UNKNOWN)
3837 return Arch;
3838 }
3839 }
3840 return CudaArch::UNKNOWN;
3841}
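
getCudaArch above simply scans the target feature map and returns the first enabled feature that names a known GPU architecture. A standalone sketch of the same lookup pattern, using made-up names (Arch, toArch, pickArch) as stand-ins for CudaArch, StringToCudaArch, and getCudaArch:

// arch_lookup.cpp -- illustrative only, not the real clang types.
#include <map>
#include <string>

enum class Arch { Unknown, SM_35, SM_70 };

static Arch toArch(const std::string &Name) {
  if (Name == "sm_35") return Arch::SM_35;
  if (Name == "sm_70") return Arch::SM_70;
  return Arch::Unknown;
}

static Arch pickArch(const std::map<std::string, bool> &FeatureMap) {
  for (const auto &F : FeatureMap) {
    if (!F.second)                    // skip features that are disabled
      continue;
    Arch A = toArch(F.first);
    if (A != Arch::Unknown)           // first recognised arch wins
      return A;
  }
  return Arch::Unknown;
}
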
3842
3843/// Check whether the target architecture supports unified addressing, which
3844/// is required by the OpenMP "requires" clause "unified_shared_memory".
3845void CGOpenMPRuntimeGPU::processRequiresDirective(
3846 const OMPRequiresDecl *D) {
3847 for (const OMPClause *Clause : D->clauselists()) {
3848 if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
3849 CudaArch Arch = getCudaArch(CGM);
3850 switch (Arch) {
3851 case CudaArch::SM_20:
3852 case CudaArch::SM_21:
3853 case CudaArch::SM_30:
3854 case CudaArch::SM_32:
3855 case CudaArch::SM_35:
3856 case CudaArch::SM_37:
3857 case CudaArch::SM_50:
3858 case CudaArch::SM_52:
3859 case CudaArch::SM_53: {
3860 SmallString<256> Buffer;
3861 llvm::raw_svector_ostream Out(Buffer);
3862 Out << "Target architecture " << CudaArchToString(Arch)
3863 << " does not support unified addressing";
3864 CGM.Error(Clause->getBeginLoc(), Out.str());
3865 return;
3866 }
3867 case CudaArch::SM_60:
3868 case CudaArch::SM_61:
3869 case CudaArch::SM_62:
3870 case CudaArch::SM_70:
3871 case CudaArch::SM_72:
3872 case CudaArch::SM_75:
3873 case CudaArch::SM_80:
3874 case CudaArch::SM_86:
3875 case CudaArch::GFX600:
3876 case CudaArch::GFX601:
3877 case CudaArch::GFX602:
3878 case CudaArch::GFX700:
3879 case CudaArch::GFX701:
3880 case CudaArch::GFX702:
3881 case CudaArch::GFX703:
3882 case CudaArch::GFX704:
3883 case CudaArch::GFX705:
3884 case CudaArch::GFX801:
3885 case CudaArch::GFX802:
3886 case CudaArch::GFX803:
3887 case CudaArch::GFX805:
3888 case CudaArch::GFX810:
3889 case CudaArch::GFX900:
3890 case CudaArch::GFX902:
3891 case CudaArch::GFX904:
3892 case CudaArch::GFX906:
3893 case CudaArch::GFX908:
3894 case CudaArch::GFX909:
3895 case CudaArch::GFX90a:
3896 case CudaArch::GFX90c:
3897 case CudaArch::GFX1010:
3898 case CudaArch::GFX1011:
3899 case CudaArch::GFX1012:
3900 case CudaArch::GFX1013:
3901 case CudaArch::GFX1030:
3902 case CudaArch::GFX1031:
3903 case CudaArch::GFX1032:
3904 case CudaArch::GFX1033:
3905 case CudaArch::GFX1034:
3906 case CudaArch::GFX1035:
3907 case CudaArch::Generic:
3908 case CudaArch::UNUSED:
3909 case CudaArch::UNKNOWN:
3910 break;
3911 case CudaArch::LAST:
3912 llvm_unreachable("Unexpected Cuda arch.");
3913 }
3914 }
3915 }
3916 CGOpenMPRuntime::processRequiresDirective(D);
3917}
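
processRequiresDirective rejects "unified_shared_memory" on architectures older than sm_60 and accepts it everywhere else, including the listed AMD GFX targets. A hedged example of a translation unit that triggers the check: compiling it for an sm_35 device should produce the "does not support unified addressing" error, while sm_60 and newer compile cleanly.

// requires_usm.cpp -- hypothetical test input for the check above.
#pragma omp requires unified_shared_memory

void touch(int *p, int n) {
  // Relies on unified shared memory: 'p' is dereferenced on the device
  // without an explicit map clause.
  #pragma omp target teams distribute parallel for
  for (int i = 0; i < n; ++i)
    p[i] = i;
}
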
3918
3919void CGOpenMPRuntimeGPU::clear() {
3920
3921 if (!TeamsReductions.empty()) {
3922 ASTContext &C = CGM.getContext();
3923 RecordDecl *StaticRD = C.buildImplicitRecord(
3924 "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
3925 StaticRD->startDefinition();
3926 for (const RecordDecl *TeamReductionRec : TeamsReductions) {
3927 QualType RecTy = C.getRecordType(TeamReductionRec);
3928 auto *Field = FieldDecl::Create(
3929 C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
3930 C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
3931 /*BW=*/nullptr, /*Mutable=*/false,
3932 /*InitStyle=*/ICIS_NoInit);
3933 Field->setAccess(AS_public);
3934 StaticRD->addDecl(Field);
3935 }
3936 StaticRD->completeDefinition();
3937 QualType StaticTy = C.getRecordType(StaticRD);
3938 llvm::Type *LLVMReductionsBufferTy =
3939 CGM.getTypes().ConvertTypeForMem(StaticTy);
3940 // FIXME: nvlink does not handle weak linkage correctly (objects with
3941 // different sizes are reported as erroneous).
3942 // Restore CommonLinkage as soon as nvlink is fixed.
3943 auto *GV = new llvm::GlobalVariable(
3944 CGM.getModule(), LLVMReductionsBufferTy,
3945 /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
3946 llvm::Constant::getNullValue(LLVMReductionsBufferTy),
3947 "_openmp_teams_reductions_buffer_$_");
3948 KernelTeamsReductionPtr->setInitializer(
3949 llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
3950 CGM.VoidPtrTy));
3951 }
3952 CGOpenMPRuntime::clear();
3953}
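
clear() above wraps every collected teams-reduction record type in a single union and emits one internal, zero-initialized global of that union type, so all kernels can share the same reduction scratch buffer. A rough C++ picture of the layout it builds; the struct names are invented, and the quoted identifiers are the ones used in the strings in the code above:

// Stand-ins for two globalized teams-reduction records gathered during codegen.
struct TeamsRed1 { int sum; };
struct TeamsRed2 { double vmin, vmax; };

// Stand-in for "_openmp_teams_reduction_type_$_": wide enough for any record.
union TeamsReductionTy {
  TeamsRed1 R1;
  TeamsRed2 R2;
};

// Stand-in for "_openmp_teams_reductions_buffer_$_": one zero-initialized
// internal global shared by all kernels.
static TeamsReductionTy TeamsReductionBuffer = {};
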
3954
3955llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
3956 CGBuilderTy &Bld = CGF.Builder;
3957 llvm::Module *M = &CGF.CGM.getModule();
3958 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
3959 llvm::Function *F = M->getFunction(LocSize);
3960 if (!F) {
3961 F = llvm::Function::Create(
3962 llvm::FunctionType::get(CGF.Int32Ty, llvm::None, false),
3963 llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
3964 }
3965 return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
3966}
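
getGPUNumThreads above looks the runtime entry point up in the module and declares it only when it is missing. The same declare-or-reuse step can be written more compactly with Module::getOrInsertFunction; a minimal sketch under that assumption (the helper name is made up, the runtime function name is the one used above):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Module.h"

// Illustrative helper, not part of the source: returns a callee for the
// runtime function, declaring it in the module if it is not there yet.
static llvm::FunctionCallee getOrDeclareNumThreads(llvm::Module &M) {
  llvm::FunctionType *FTy = llvm::FunctionType::get(
      llvm::Type::getInt32Ty(M.getContext()), /*isVarArg=*/false);
  return M.getOrInsertFunction("__kmpc_get_hardware_num_threads_in_block",
                               FTy);
}
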
3967
3968llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
3969 ArrayRef<llvm::Value *> Args{};
3970 return CGF.EmitRuntimeCall(
3971 OMPBuilder.getOrCreateRuntimeFunction(
3972 CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
3973 Args);
3974}
3975
3976llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
3977 ArrayRef<llvm::Value *> Args{};
3978 return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3979 CGM.getModule(), OMPRTL___kmpc_get_warp_size),
3980 Args);
3981}