Bug Summary

File: build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
Warning: line 548, column 29
The result of the right shift is undefined due to shifting by '33', which is greater or equal to the width of type 'unsigned int'

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name CGOpenMPRuntimeGPU.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -relaxed-aliasing -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm -resource-dir /usr/lib/llvm-16/lib/clang/16.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I tools/clang/lib/CodeGen -I /build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/clang/lib/CodeGen -I /build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/clang/include -I tools/clang/include -I include -I /build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-16/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm=build-llvm -fmacro-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm=build-llvm -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/build-llvm=build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/= -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-09-04-125545-48738-1 -x c++ 
/build/llvm-toolchain-snapshot-16~++20220904122748+c444af1c20b3/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
1//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This provides a generalized class for OpenMP runtime code generation
10// specialized by GPU targets NVPTX and AMDGCN.
11//
12//===----------------------------------------------------------------------===//
13
14#include "CGOpenMPRuntimeGPU.h"
15#include "CodeGenFunction.h"
16#include "clang/AST/Attr.h"
17#include "clang/AST/DeclOpenMP.h"
18#include "clang/AST/StmtOpenMP.h"
19#include "clang/AST/StmtVisitor.h"
20#include "clang/Basic/Cuda.h"
21#include "llvm/ADT/SmallPtrSet.h"
22#include "llvm/Frontend/OpenMP/OMPGridValues.h"
23#include "llvm/Support/MathExtras.h"
24
25using namespace clang;
26using namespace CodeGen;
27using namespace llvm::omp;
28
29namespace {
30/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
31class NVPTXActionTy final : public PrePostActionTy {
32 llvm::FunctionCallee EnterCallee = nullptr;
33 ArrayRef<llvm::Value *> EnterArgs;
34 llvm::FunctionCallee ExitCallee = nullptr;
35 ArrayRef<llvm::Value *> ExitArgs;
36 bool Conditional = false;
37 llvm::BasicBlock *ContBlock = nullptr;
38
39public:
40 NVPTXActionTy(llvm::FunctionCallee EnterCallee,
41 ArrayRef<llvm::Value *> EnterArgs,
42 llvm::FunctionCallee ExitCallee,
43 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
44 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
45 ExitArgs(ExitArgs), Conditional(Conditional) {}
46 void Enter(CodeGenFunction &CGF) override {
47 llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
48 if (Conditional) {
49 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
50 auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
51 ContBlock = CGF.createBasicBlock("omp_if.end");
52 // Generate the branch (If-stmt)
53 CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
54 CGF.EmitBlock(ThenBlock);
55 }
56 }
57 void Done(CodeGenFunction &CGF) {
58 // Emit the rest of blocks/branches
59 CGF.EmitBranch(ContBlock);
60 CGF.EmitBlock(ContBlock, true);
61 }
62 void Exit(CodeGenFunction &CGF) override {
63 CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
64 }
65};
66
67/// A class to track the execution mode when codegening directives within
68/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
69/// to the target region and used by containing directives such as 'parallel'
70/// to emit optimized code.
71class ExecutionRuntimeModesRAII {
72private:
73 CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
74 CGOpenMPRuntimeGPU::EM_Unknown;
75 CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
76 bool SavedRuntimeMode = false;
77 bool *RuntimeMode = nullptr;
78
79public:
80 /// Constructor for Non-SPMD mode.
81 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode)
82 : ExecMode(ExecMode) {
83 SavedExecMode = ExecMode;
84 ExecMode = CGOpenMPRuntimeGPU::EM_NonSPMD;
85 }
86 /// Constructor for SPMD mode.
87 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
88 bool &RuntimeMode, bool FullRuntimeMode)
89 : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
90 SavedExecMode = ExecMode;
91 SavedRuntimeMode = RuntimeMode;
92 ExecMode = CGOpenMPRuntimeGPU::EM_SPMD;
93 RuntimeMode = FullRuntimeMode;
94 }
95 ~ExecutionRuntimeModesRAII() {
96 ExecMode = SavedExecMode;
97 if (RuntimeMode)
98 *RuntimeMode = SavedRuntimeMode;
99 }
100};
101
102/// GPU Configuration: This information can be derived from cuda registers,
103/// however, providing compile time constants helps generate more efficient
104/// code. For all practical purposes this is fine because the configuration
105/// is the same for all known NVPTX architectures.
106enum MachineConfiguration : unsigned {
107 /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
108 /// specific Grid Values like GV_Warp_Size, GV_Slot_Size
109
110 /// Global memory alignment for performance.
111 GlobalMemoryAlignment = 128,
112
113 /// Maximal size of the shared memory buffer.
114 SharedMemorySize = 128,
115};
116
117static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
118 RefExpr = RefExpr->IgnoreParens();
119 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
120 const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
121 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
122 Base = TempASE->getBase()->IgnoreParenImpCasts();
123 RefExpr = Base;
124 } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
125 const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
126 while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
127 Base = TempOASE->getBase()->IgnoreParenImpCasts();
128 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
129 Base = TempASE->getBase()->IgnoreParenImpCasts();
130 RefExpr = Base;
131 }
132 RefExpr = RefExpr->IgnoreParenImpCasts();
133 if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
134 return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
135 const auto *ME = cast<MemberExpr>(RefExpr);
136 return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
137}
138
139
140static RecordDecl *buildRecordForGlobalizedVars(
141 ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
142 ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
143 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
144 &MappedDeclsFields, int BufSize) {
145 using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
146 if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
147 return nullptr;
148 SmallVector<VarsDataTy, 4> GlobalizedVars;
149 for (const ValueDecl *D : EscapedDecls)
150 GlobalizedVars.emplace_back(
151 CharUnits::fromQuantity(std::max(
152 C.getDeclAlign(D).getQuantity(),
153 static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
154 D);
155 for (const ValueDecl *D : EscapedDeclsForTeams)
156 GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
157 llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
158 return L.first > R.first;
159 });
160
161 // Build struct _globalized_locals_ty {
162 // /* globalized vars */[WarSize] align (max(decl_align,
163 // GlobalMemoryAlignment))
164 // /* globalized vars */ for EscapedDeclsForTeams
165 // };
166 RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
167 GlobalizedRD->startDefinition();
168 llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
169 EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
170 for (const auto &Pair : GlobalizedVars) {
171 const ValueDecl *VD = Pair.second;
172 QualType Type = VD->getType();
173 if (Type->isLValueReferenceType())
174 Type = C.getPointerType(Type.getNonReferenceType());
175 else
176 Type = Type.getNonReferenceType();
177 SourceLocation Loc = VD->getLocation();
178 FieldDecl *Field;
179 if (SingleEscaped.count(VD)) {
180 Field = FieldDecl::Create(
181 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
182 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
183 /*BW=*/nullptr, /*Mutable=*/false,
184 /*InitStyle=*/ICIS_NoInit);
185 Field->setAccess(AS_public);
186 if (VD->hasAttrs()) {
187 for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
188 E(VD->getAttrs().end());
189 I != E; ++I)
190 Field->addAttr(*I);
191 }
192 } else {
193 llvm::APInt ArraySize(32, BufSize);
194 Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
195 0);
196 Field = FieldDecl::Create(
197 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
198 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
199 /*BW=*/nullptr, /*Mutable=*/false,
200 /*InitStyle=*/ICIS_NoInit);
201 Field->setAccess(AS_public);
202 llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
203 static_cast<CharUnits::QuantityType>(
204 GlobalMemoryAlignment)));
205 Field->addAttr(AlignedAttr::CreateImplicit(
206 C, /*IsAlignmentExpr=*/true,
207 IntegerLiteral::Create(C, Align,
208 C.getIntTypeForBitwidth(32, /*Signed=*/0),
209 SourceLocation()),
210 {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
211 }
212 GlobalizedRD->addDecl(Field);
213 MappedDeclsFields.try_emplace(VD, Field);
214 }
215 GlobalizedRD->completeDefinition();
216 return GlobalizedRD;
217}
218
219/// Get the list of variables that can escape their declaration context.
220class CheckVarsEscapingDeclContext final
221 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
222 CodeGenFunction &CGF;
223 llvm::SetVector<const ValueDecl *> EscapedDecls;
224 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
225 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
226 RecordDecl *GlobalizedRD = nullptr;
227 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
228 bool AllEscaped = false;
229 bool IsForCombinedParallelRegion = false;
230
231 void markAsEscaped(const ValueDecl *VD) {
232 // Do not globalize declare target variables.
233 if (!isa<VarDecl>(VD) ||
234 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
235 return;
236 VD = cast<ValueDecl>(VD->getCanonicalDecl());
237 // Use user-specified allocation.
238 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
239 return;
240 // Variables captured by value must be globalized.
241 if (auto *CSI = CGF.CapturedStmtInfo) {
242 if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
243 // Check if need to capture the variable that was already captured by
244 // value in the outer region.
245 if (!IsForCombinedParallelRegion) {
246 if (!FD->hasAttrs())
247 return;
248 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
249 if (!Attr)
250 return;
251 if (((Attr->getCaptureKind() != OMPC_map) &&
252 !isOpenMPPrivate(Attr->getCaptureKind())) ||
253 ((Attr->getCaptureKind() == OMPC_map) &&
254 !FD->getType()->isAnyPointerType()))
255 return;
256 }
257 if (!FD->getType()->isReferenceType()) {
258 assert(!VD->getType()->isVariablyModifiedType() &&
259 "Parameter captured by value with variably modified type");
260 EscapedParameters.insert(VD);
261 } else if (!IsForCombinedParallelRegion) {
262 return;
263 }
264 }
265 }
266 if ((!CGF.CapturedStmtInfo ||
267 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
268 VD->getType()->isReferenceType())
269 // Do not globalize variables with reference type.
270 return;
271 if (VD->getType()->isVariablyModifiedType())
272 EscapedVariableLengthDecls.insert(VD);
273 else
274 EscapedDecls.insert(VD);
275 }
276
277 void VisitValueDecl(const ValueDecl *VD) {
278 if (VD->getType()->isLValueReferenceType())
279 markAsEscaped(VD);
280 if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
281 if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
282 const bool SavedAllEscaped = AllEscaped;
283 AllEscaped = VD->getType()->isLValueReferenceType();
284 Visit(VarD->getInit());
285 AllEscaped = SavedAllEscaped;
286 }
287 }
288 }
289 void VisitOpenMPCapturedStmt(const CapturedStmt *S,
290 ArrayRef<OMPClause *> Clauses,
291 bool IsCombinedParallelRegion) {
292 if (!S)
293 return;
294 for (const CapturedStmt::Capture &C : S->captures()) {
295 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
296 const ValueDecl *VD = C.getCapturedVar();
297 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
298 if (IsCombinedParallelRegion) {
299 // Check if the variable is privatized in the combined construct and
300 // those private copies must be shared in the inner parallel
301 // directive.
302 IsForCombinedParallelRegion = false;
303 for (const OMPClause *C : Clauses) {
304 if (!isOpenMPPrivate(C->getClauseKind()) ||
305 C->getClauseKind() == OMPC_reduction ||
306 C->getClauseKind() == OMPC_linear ||
307 C->getClauseKind() == OMPC_private)
308 continue;
309 ArrayRef<const Expr *> Vars;
310 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
311 Vars = PC->getVarRefs();
312 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
313 Vars = PC->getVarRefs();
314 else
315 llvm_unreachable("Unexpected clause.");
316 for (const auto *E : Vars) {
317 const Decl *D =
318 cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
319 if (D == VD->getCanonicalDecl()) {
320 IsForCombinedParallelRegion = true;
321 break;
322 }
323 }
324 if (IsForCombinedParallelRegion)
325 break;
326 }
327 }
328 markAsEscaped(VD);
329 if (isa<OMPCapturedExprDecl>(VD))
330 VisitValueDecl(VD);
331 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
332 }
333 }
334 }
335
336 void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
337 assert(!GlobalizedRD &&
338 "Record for globalized variables is built already.");
339 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
340 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
341 if (IsInTTDRegion)
342 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
343 else
344 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
345 GlobalizedRD = ::buildRecordForGlobalizedVars(
346 CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
347 MappedDeclsFields, WarpSize);
348 }
349
350public:
351 CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
352 ArrayRef<const ValueDecl *> TeamsReductions)
353 : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
354 }
355 virtual ~CheckVarsEscapingDeclContext() = default;
356 void VisitDeclStmt(const DeclStmt *S) {
357 if (!S)
358 return;
359 for (const Decl *D : S->decls())
360 if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
361 VisitValueDecl(VD);
362 }
363 void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
364 if (!D)
365 return;
366 if (!D->hasAssociatedStmt())
367 return;
368 if (const auto *S =
369 dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
370 // Do not analyze directives that do not actually require capturing,
371 // like `omp for` or `omp simd` directives.
372 llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
373 getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
374 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
375 VisitStmt(S->getCapturedStmt());
376 return;
377 }
378 VisitOpenMPCapturedStmt(
379 S, D->clauses(),
380 CaptureRegions.back() == OMPD_parallel &&
381 isOpenMPDistributeDirective(D->getDirectiveKind()));
382 }
383 }
384 void VisitCapturedStmt(const CapturedStmt *S) {
385 if (!S)
386 return;
387 for (const CapturedStmt::Capture &C : S->captures()) {
388 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
389 const ValueDecl *VD = C.getCapturedVar();
390 markAsEscaped(VD);
391 if (isa<OMPCapturedExprDecl>(VD))
392 VisitValueDecl(VD);
393 }
394 }
395 }
396 void VisitLambdaExpr(const LambdaExpr *E) {
397 if (!E)
398 return;
399 for (const LambdaCapture &C : E->captures()) {
400 if (C.capturesVariable()) {
401 if (C.getCaptureKind() == LCK_ByRef) {
402 const ValueDecl *VD = C.getCapturedVar();
403 markAsEscaped(VD);
404 if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
405 VisitValueDecl(VD);
406 }
407 }
408 }
409 }
410 void VisitBlockExpr(const BlockExpr *E) {
411 if (!E)
412 return;
413 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
414 if (C.isByRef()) {
415 const VarDecl *VD = C.getVariable();
416 markAsEscaped(VD);
417 if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
418 VisitValueDecl(VD);
419 }
420 }
421 }
422 void VisitCallExpr(const CallExpr *E) {
423 if (!E)
424 return;
425 for (const Expr *Arg : E->arguments()) {
426 if (!Arg)
427 continue;
428 if (Arg->isLValue()) {
429 const bool SavedAllEscaped = AllEscaped;
430 AllEscaped = true;
431 Visit(Arg);
432 AllEscaped = SavedAllEscaped;
433 } else {
434 Visit(Arg);
435 }
436 }
437 Visit(E->getCallee());
438 }
439 void VisitDeclRefExpr(const DeclRefExpr *E) {
440 if (!E)
441 return;
442 const ValueDecl *VD = E->getDecl();
443 if (AllEscaped)
444 markAsEscaped(VD);
445 if (isa<OMPCapturedExprDecl>(VD))
446 VisitValueDecl(VD);
447 else if (VD->isInitCapture())
448 VisitValueDecl(VD);
449 }
450 void VisitUnaryOperator(const UnaryOperator *E) {
451 if (!E)
452 return;
453 if (E->getOpcode() == UO_AddrOf) {
454 const bool SavedAllEscaped = AllEscaped;
455 AllEscaped = true;
456 Visit(E->getSubExpr());
457 AllEscaped = SavedAllEscaped;
458 } else {
459 Visit(E->getSubExpr());
460 }
461 }
462 void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
463 if (!E)
464 return;
465 if (E->getCastKind() == CK_ArrayToPointerDecay) {
466 const bool SavedAllEscaped = AllEscaped;
467 AllEscaped = true;
468 Visit(E->getSubExpr());
469 AllEscaped = SavedAllEscaped;
470 } else {
471 Visit(E->getSubExpr());
472 }
473 }
474 void VisitExpr(const Expr *E) {
475 if (!E)
476 return;
477 bool SavedAllEscaped = AllEscaped;
478 if (!E->isLValue())
479 AllEscaped = false;
480 for (const Stmt *Child : E->children())
481 if (Child)
482 Visit(Child);
483 AllEscaped = SavedAllEscaped;
484 }
485 void VisitStmt(const Stmt *S) {
486 if (!S)
487 return;
488 for (const Stmt *Child : S->children())
489 if (Child)
490 Visit(Child);
491 }
492
493 /// Returns the record that handles all the escaped local variables and used
494 /// instead of their original storage.
495 const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
496 if (!GlobalizedRD)
497 buildRecordForGlobalizedVars(IsInTTDRegion);
498 return GlobalizedRD;
499 }
500
501 /// Returns the field in the globalized record for the escaped variable.
502 const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
503 assert(GlobalizedRD &&
504 "Record for globalized variables must be generated already.");
505 auto I = MappedDeclsFields.find(VD);
506 if (I == MappedDeclsFields.end())
507 return nullptr;
508 return I->getSecond();
509 }
510
511 /// Returns the list of the escaped local variables/parameters.
512 ArrayRef<const ValueDecl *> getEscapedDecls() const {
513 return EscapedDecls.getArrayRef();
514 }
515
516 /// Checks if the escaped local variable is actually a parameter passed by
517 /// value.
518 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
519 return EscapedParameters;
520 }
521
522 /// Returns the list of the escaped variables with the variably modified
523 /// types.
524 ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
525 return EscapedVariableLengthDecls.getArrayRef();
526 }
527};
528} // anonymous namespace
529
530/// Get the id of the warp in the block.
531/// We assume that the warp size is 32, which is always the case
532/// on the NVPTX device, to generate more efficient code.
533static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
534 CGBuilderTy &Bld = CGF.Builder;
535 unsigned LaneIDBits =
536 llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
537 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
538 return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
539}
540
541/// Get the id of the current lane in the Warp.
542/// We assume that the warp size is 32, which is always the case
543/// on the NVPTX device, to generate more efficient code.
544static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
545 CGBuilderTy &Bld = CGF.Builder;
546 unsigned LaneIDBits =
547 llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
548 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
14. The result of the right shift is undefined due to shifting by '33', which is greater or equal to the width of type 'unsigned int'
549 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
550 return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
551 "nvptx_lane_id");
552}
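
The path the analyzer reports corresponds to a grid value with GV_Warp_Size == 0: llvm::Log2_32 is documented to return -1 (as an unsigned value, 0xFFFFFFFF) for a zero input, so 32u - LaneIDBits wraps around to 33 and the shift at line 548, column 29 is undefined. A minimal standalone sketch of that arithmetic, with log2_32 and countLeadingZeros32 as hypothetical stand-ins for the LLVM helpers, is:

    #include <cstdint>
    #include <cstdio>

    // Hypothetical stand-in for llvm::countLeadingZeros: returns 32 for V == 0.
    static unsigned countLeadingZeros32(uint32_t V) {
      unsigned N = 0;
      for (uint32_t Bit = 1u << 31; Bit != 0 && (V & Bit) == 0; Bit >>= 1)
        ++N;
      return N;
    }

    // Hypothetical stand-in for llvm::Log2_32: 31 - countLeadingZeros(V),
    // which wraps to (unsigned)-1 when V == 0.
    static unsigned log2_32(uint32_t V) { return 31 - countLeadingZeros32(V); }

    int main() {
      unsigned WarpSize = 0;                   // value assumed along the reported path
      unsigned LaneIDBits = log2_32(WarpSize); // 0xFFFFFFFF
      unsigned ShiftAmount = 32u - LaneIDBits; // wraps around to 33
      std::printf("LaneIDBits = %u, shift amount = %u\n", LaneIDBits, ShiftAmount);
      // unsigned LaneIDMask = ~0u >> ShiftAmount;  // undefined: shift >= 32
      return 0;
    }

With the usual warp size of 32 the same expression gives LaneIDBits == 5, a shift of 27 and a lane mask of 0x1f, so the warning only concerns the zero warp-size value assumed along this path.
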
553
554CGOpenMPRuntimeGPU::ExecutionMode
555CGOpenMPRuntimeGPU::getExecutionMode() const {
556 return CurrentExecutionMode;
557}
558
559static CGOpenMPRuntimeGPU::DataSharingMode
560getDataSharingMode(CodeGenModule &CGM) {
561 return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
562 : CGOpenMPRuntimeGPU::Generic;
563}
564
565/// Check for inner (nested) SPMD construct, if any
566static bool hasNestedSPMDDirective(ASTContext &Ctx,
567 const OMPExecutableDirective &D) {
568 const auto *CS = D.getInnermostCapturedStmt();
569 const auto *Body =
570 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
571 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
572
573 if (const auto *NestedDir =
574 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
575 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
576 switch (D.getDirectiveKind()) {
577 case OMPD_target:
578 if (isOpenMPParallelDirective(DKind))
579 return true;
580 if (DKind == OMPD_teams) {
581 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
582 /*IgnoreCaptured=*/true);
583 if (!Body)
584 return false;
585 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
586 if (const auto *NND =
587 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
588 DKind = NND->getDirectiveKind();
589 if (isOpenMPParallelDirective(DKind))
590 return true;
591 }
592 }
593 return false;
594 case OMPD_target_teams:
595 return isOpenMPParallelDirective(DKind);
596 case OMPD_target_simd:
597 case OMPD_target_parallel:
598 case OMPD_target_parallel_for:
599 case OMPD_target_parallel_for_simd:
600 case OMPD_target_teams_distribute:
601 case OMPD_target_teams_distribute_simd:
602 case OMPD_target_teams_distribute_parallel_for:
603 case OMPD_target_teams_distribute_parallel_for_simd:
604 case OMPD_parallel:
605 case OMPD_for:
606 case OMPD_parallel_for:
607 case OMPD_parallel_master:
608 case OMPD_parallel_sections:
609 case OMPD_for_simd:
610 case OMPD_parallel_for_simd:
611 case OMPD_cancel:
612 case OMPD_cancellation_point:
613 case OMPD_ordered:
614 case OMPD_threadprivate:
615 case OMPD_allocate:
616 case OMPD_task:
617 case OMPD_simd:
618 case OMPD_sections:
619 case OMPD_section:
620 case OMPD_single:
621 case OMPD_master:
622 case OMPD_critical:
623 case OMPD_taskyield:
624 case OMPD_barrier:
625 case OMPD_taskwait:
626 case OMPD_taskgroup:
627 case OMPD_atomic:
628 case OMPD_flush:
629 case OMPD_depobj:
630 case OMPD_scan:
631 case OMPD_teams:
632 case OMPD_target_data:
633 case OMPD_target_exit_data:
634 case OMPD_target_enter_data:
635 case OMPD_distribute:
636 case OMPD_distribute_simd:
637 case OMPD_distribute_parallel_for:
638 case OMPD_distribute_parallel_for_simd:
639 case OMPD_teams_distribute:
640 case OMPD_teams_distribute_simd:
641 case OMPD_teams_distribute_parallel_for:
642 case OMPD_teams_distribute_parallel_for_simd:
643 case OMPD_target_update:
644 case OMPD_declare_simd:
645 case OMPD_declare_variant:
646 case OMPD_begin_declare_variant:
647 case OMPD_end_declare_variant:
648 case OMPD_declare_target:
649 case OMPD_end_declare_target:
650 case OMPD_declare_reduction:
651 case OMPD_declare_mapper:
652 case OMPD_taskloop:
653 case OMPD_taskloop_simd:
654 case OMPD_master_taskloop:
655 case OMPD_master_taskloop_simd:
656 case OMPD_parallel_master_taskloop:
657 case OMPD_parallel_master_taskloop_simd:
658 case OMPD_requires:
659 case OMPD_unknown:
660 default:
661 llvm_unreachable("Unexpected directive.");
662 }
663 }
664
665 return false;
666}
667
668static bool supportsSPMDExecutionMode(ASTContext &Ctx,
669 const OMPExecutableDirective &D) {
670 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
671 switch (DirectiveKind) {
672 case OMPD_target:
673 case OMPD_target_teams:
674 return hasNestedSPMDDirective(Ctx, D);
675 case OMPD_target_parallel:
676 case OMPD_target_parallel_for:
677 case OMPD_target_parallel_for_simd:
678 case OMPD_target_teams_distribute_parallel_for:
679 case OMPD_target_teams_distribute_parallel_for_simd:
680 case OMPD_target_simd:
681 case OMPD_target_teams_distribute_simd:
682 return true;
683 case OMPD_target_teams_distribute:
684 return false;
685 case OMPD_parallel:
686 case OMPD_for:
687 case OMPD_parallel_for:
688 case OMPD_parallel_master:
689 case OMPD_parallel_sections:
690 case OMPD_for_simd:
691 case OMPD_parallel_for_simd:
692 case OMPD_cancel:
693 case OMPD_cancellation_point:
694 case OMPD_ordered:
695 case OMPD_threadprivate:
696 case OMPD_allocate:
697 case OMPD_task:
698 case OMPD_simd:
699 case OMPD_sections:
700 case OMPD_section:
701 case OMPD_single:
702 case OMPD_master:
703 case OMPD_critical:
704 case OMPD_taskyield:
705 case OMPD_barrier:
706 case OMPD_taskwait:
707 case OMPD_taskgroup:
708 case OMPD_atomic:
709 case OMPD_flush:
710 case OMPD_depobj:
711 case OMPD_scan:
712 case OMPD_teams:
713 case OMPD_target_data:
714 case OMPD_target_exit_data:
715 case OMPD_target_enter_data:
716 case OMPD_distribute:
717 case OMPD_distribute_simd:
718 case OMPD_distribute_parallel_for:
719 case OMPD_distribute_parallel_for_simd:
720 case OMPD_teams_distribute:
721 case OMPD_teams_distribute_simd:
722 case OMPD_teams_distribute_parallel_for:
723 case OMPD_teams_distribute_parallel_for_simd:
724 case OMPD_target_update:
725 case OMPD_declare_simd:
726 case OMPD_declare_variant:
727 case OMPD_begin_declare_variant:
728 case OMPD_end_declare_variant:
729 case OMPD_declare_target:
730 case OMPD_end_declare_target:
731 case OMPD_declare_reduction:
732 case OMPD_declare_mapper:
733 case OMPD_taskloop:
734 case OMPD_taskloop_simd:
735 case OMPD_master_taskloop:
736 case OMPD_master_taskloop_simd:
737 case OMPD_parallel_master_taskloop:
738 case OMPD_parallel_master_taskloop_simd:
739 case OMPD_requires:
740 case OMPD_unknown:
741 default:
742 break;
743 }
744 llvm_unreachable(
745 "Unknown programming model for OpenMP directive on NVPTX target.");
746}
747
748/// Check if the directive is loops based and has schedule clause at all or has
749/// static scheduling.
750static bool hasStaticScheduling(const OMPExecutableDirective &D) {
751 assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
752 isOpenMPLoopDirective(D.getDirectiveKind()) &&
753 "Expected loop-based directive.");
754 return !D.hasClausesOfKind<OMPOrderedClause>() &&
755 (!D.hasClausesOfKind<OMPScheduleClause>() ||
756 llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
757 [](const OMPScheduleClause *C) {
758 return C->getScheduleKind() == OMPC_SCHEDULE_static;
759 }));
760}
761
762/// Check for inner (nested) lightweight runtime construct, if any
763static bool hasNestedLightweightDirective(ASTContext &Ctx,
764 const OMPExecutableDirective &D) {
765 assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
766 const auto *CS = D.getInnermostCapturedStmt();
767 const auto *Body =
768 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
769 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
770
771 if (const auto *NestedDir =
772 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
773 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
774 switch (D.getDirectiveKind()) {
775 case OMPD_target:
776 if (isOpenMPParallelDirective(DKind) &&
777 isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
778 hasStaticScheduling(*NestedDir))
779 return true;
780 if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
781 return true;
782 if (DKind == OMPD_parallel) {
783 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
784 /*IgnoreCaptured=*/true);
785 if (!Body)
786 return false;
787 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
788 if (const auto *NND =
789 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
790 DKind = NND->getDirectiveKind();
791 if (isOpenMPWorksharingDirective(DKind) &&
792 isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
793 return true;
794 }
795 } else if (DKind == OMPD_teams) {
796 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
797 /*IgnoreCaptured=*/true);
798 if (!Body)
799 return false;
800 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
801 if (const auto *NND =
802 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
803 DKind = NND->getDirectiveKind();
804 if (isOpenMPParallelDirective(DKind) &&
805 isOpenMPWorksharingDirective(DKind) &&
806 isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
807 return true;
808 if (DKind == OMPD_parallel) {
809 Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
810 /*IgnoreCaptured=*/true);
811 if (!Body)
812 return false;
813 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
814 if (const auto *NND =
815 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
816 DKind = NND->getDirectiveKind();
817 if (isOpenMPWorksharingDirective(DKind) &&
818 isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
819 return true;
820 }
821 }
822 }
823 }
824 return false;
825 case OMPD_target_teams:
826 if (isOpenMPParallelDirective(DKind) &&
827 isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
828 hasStaticScheduling(*NestedDir))
829 return true;
830 if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
831 return true;
832 if (DKind == OMPD_parallel) {
833 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
834 /*IgnoreCaptured=*/true);
835 if (!Body)
836 return false;
837 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
838 if (const auto *NND =
839 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
840 DKind = NND->getDirectiveKind();
841 if (isOpenMPWorksharingDirective(DKind) &&
842 isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
843 return true;
844 }
845 }
846 return false;
847 case OMPD_target_parallel:
848 if (DKind == OMPD_simd)
849 return true;
850 return isOpenMPWorksharingDirective(DKind) &&
851 isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
852 case OMPD_target_teams_distribute:
853 case OMPD_target_simd:
854 case OMPD_target_parallel_for:
855 case OMPD_target_parallel_for_simd:
856 case OMPD_target_teams_distribute_simd:
857 case OMPD_target_teams_distribute_parallel_for:
858 case OMPD_target_teams_distribute_parallel_for_simd:
859 case OMPD_parallel:
860 case OMPD_for:
861 case OMPD_parallel_for:
862 case OMPD_parallel_master:
863 case OMPD_parallel_sections:
864 case OMPD_for_simd:
865 case OMPD_parallel_for_simd:
866 case OMPD_cancel:
867 case OMPD_cancellation_point:
868 case OMPD_ordered:
869 case OMPD_threadprivate:
870 case OMPD_allocate:
871 case OMPD_task:
872 case OMPD_simd:
873 case OMPD_sections:
874 case OMPD_section:
875 case OMPD_single:
876 case OMPD_master:
877 case OMPD_critical:
878 case OMPD_taskyield:
879 case OMPD_barrier:
880 case OMPD_taskwait:
881 case OMPD_taskgroup:
882 case OMPD_atomic:
883 case OMPD_flush:
884 case OMPD_depobj:
885 case OMPD_scan:
886 case OMPD_teams:
887 case OMPD_target_data:
888 case OMPD_target_exit_data:
889 case OMPD_target_enter_data:
890 case OMPD_distribute:
891 case OMPD_distribute_simd:
892 case OMPD_distribute_parallel_for:
893 case OMPD_distribute_parallel_for_simd:
894 case OMPD_teams_distribute:
895 case OMPD_teams_distribute_simd:
896 case OMPD_teams_distribute_parallel_for:
897 case OMPD_teams_distribute_parallel_for_simd:
898 case OMPD_target_update:
899 case OMPD_declare_simd:
900 case OMPD_declare_variant:
901 case OMPD_begin_declare_variant:
902 case OMPD_end_declare_variant:
903 case OMPD_declare_target:
904 case OMPD_end_declare_target:
905 case OMPD_declare_reduction:
906 case OMPD_declare_mapper:
907 case OMPD_taskloop:
908 case OMPD_taskloop_simd:
909 case OMPD_master_taskloop:
910 case OMPD_master_taskloop_simd:
911 case OMPD_parallel_master_taskloop:
912 case OMPD_parallel_master_taskloop_simd:
913 case OMPD_requires:
914 case OMPD_unknown:
915 default:
916 llvm_unreachable("Unexpected directive.");
917 }
918 }
919
920 return false;
921}
922
923/// Checks if the construct supports lightweight runtime. It must be SPMD
924/// construct + inner loop-based construct with static scheduling.
925static bool supportsLightweightRuntime(ASTContext &Ctx,
926 const OMPExecutableDirective &D) {
927 if (!supportsSPMDExecutionMode(Ctx, D))
928 return false;
929 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
930 switch (DirectiveKind) {
931 case OMPD_target:
932 case OMPD_target_teams:
933 case OMPD_target_parallel:
934 return hasNestedLightweightDirective(Ctx, D);
935 case OMPD_target_parallel_for:
936 case OMPD_target_parallel_for_simd:
937 case OMPD_target_teams_distribute_parallel_for:
938 case OMPD_target_teams_distribute_parallel_for_simd:
939 // (Last|First)-privates must be shared in parallel region.
940 return hasStaticScheduling(D);
941 case OMPD_target_simd:
942 case OMPD_target_teams_distribute_simd:
943 return true;
944 case OMPD_target_teams_distribute:
945 return false;
946 case OMPD_parallel:
947 case OMPD_for:
948 case OMPD_parallel_for:
949 case OMPD_parallel_master:
950 case OMPD_parallel_sections:
951 case OMPD_for_simd:
952 case OMPD_parallel_for_simd:
953 case OMPD_cancel:
954 case OMPD_cancellation_point:
955 case OMPD_ordered:
956 case OMPD_threadprivate:
957 case OMPD_allocate:
958 case OMPD_task:
959 case OMPD_simd:
960 case OMPD_sections:
961 case OMPD_section:
962 case OMPD_single:
963 case OMPD_master:
964 case OMPD_critical:
965 case OMPD_taskyield:
966 case OMPD_barrier:
967 case OMPD_taskwait:
968 case OMPD_taskgroup:
969 case OMPD_atomic:
970 case OMPD_flush:
971 case OMPD_depobj:
972 case OMPD_scan:
973 case OMPD_teams:
974 case OMPD_target_data:
975 case OMPD_target_exit_data:
976 case OMPD_target_enter_data:
977 case OMPD_distribute:
978 case OMPD_distribute_simd:
979 case OMPD_distribute_parallel_for:
980 case OMPD_distribute_parallel_for_simd:
981 case OMPD_teams_distribute:
982 case OMPD_teams_distribute_simd:
983 case OMPD_teams_distribute_parallel_for:
984 case OMPD_teams_distribute_parallel_for_simd:
985 case OMPD_target_update:
986 case OMPD_declare_simd:
987 case OMPD_declare_variant:
988 case OMPD_begin_declare_variant:
989 case OMPD_end_declare_variant:
990 case OMPD_declare_target:
991 case OMPD_end_declare_target:
992 case OMPD_declare_reduction:
993 case OMPD_declare_mapper:
994 case OMPD_taskloop:
995 case OMPD_taskloop_simd:
996 case OMPD_master_taskloop:
997 case OMPD_master_taskloop_simd:
998 case OMPD_parallel_master_taskloop:
999 case OMPD_parallel_master_taskloop_simd:
1000 case OMPD_requires:
1001 case OMPD_unknown:
1002 default:
1003 break;
1004 }
1005 llvm_unreachable(
1006 "Unknown programming model for OpenMP directive on NVPTX target.");
1007}
1008
1009void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
1010 StringRef ParentName,
1011 llvm::Function *&OutlinedFn,
1012 llvm::Constant *&OutlinedFnID,
1013 bool IsOffloadEntry,
1014 const RegionCodeGenTy &CodeGen) {
1015 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
1016 EntryFunctionState EST;
1017 WrapperFunctionsMap.clear();
1018
1019 // Emit target region as a standalone region.
1020 class NVPTXPrePostActionTy : public PrePostActionTy {
1021 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
1022
1023 public:
1024 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
1025 : EST(EST) {}
1026 void Enter(CodeGenFunction &CGF) override {
1027 auto &RT =
1028 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1029 RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
1030 // Skip target region initialization.
1031 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1032 }
1033 void Exit(CodeGenFunction &CGF) override {
1034 auto &RT =
1035 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1036 RT.clearLocThreadIdInsertPt(CGF);
1037 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
1038 }
1039 } Action(EST);
1040 CodeGen.setAction(Action);
1041 IsInTTDRegion = true;
1042 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1043 IsOffloadEntry, CodeGen);
1044 IsInTTDRegion = false;
1045}
1046
1047void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF,
1048 EntryFunctionState &EST, bool IsSPMD) {
1049 CGBuilderTy &Bld = CGF.Builder;
1050 Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD, requiresFullRuntime()));
1051 IsInTargetMasterThreadRegion = IsSPMD;
1052 if (!IsSPMD)
1053 emitGenericVarsProlog(CGF, EST.Loc);
1054}
1055
1056void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
1057 EntryFunctionState &EST,
1058 bool IsSPMD) {
1059 if (!IsSPMD)
1060 emitGenericVarsEpilog(CGF);
1061
1062 CGBuilderTy &Bld = CGF.Builder;
1063 OMPBuilder.createTargetDeinit(Bld, IsSPMD, requiresFullRuntime());
1064}
1065
1066void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
1067 StringRef ParentName,
1068 llvm::Function *&OutlinedFn,
1069 llvm::Constant *&OutlinedFnID,
1070 bool IsOffloadEntry,
1071 const RegionCodeGenTy &CodeGen) {
1072 ExecutionRuntimeModesRAII ModeRAII(
1073 CurrentExecutionMode, RequiresFullRuntime,
1074 CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
1075 !supportsLightweightRuntime(CGM.getContext(), D));
1076 EntryFunctionState EST;
1077
1078 // Emit target region as a standalone region.
1079 class NVPTXPrePostActionTy : public PrePostActionTy {
1080 CGOpenMPRuntimeGPU &RT;
1081 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
1082
1083 public:
1084 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
1085 CGOpenMPRuntimeGPU::EntryFunctionState &EST)
1086 : RT(RT), EST(EST) {}
1087 void Enter(CodeGenFunction &CGF) override {
1088 RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
1089 // Skip target region initialization.
1090 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1091 }
1092 void Exit(CodeGenFunction &CGF) override {
1093 RT.clearLocThreadIdInsertPt(CGF);
1094 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
1095 }
1096 } Action(*this, EST);
1097 CodeGen.setAction(Action);
1098 IsInTTDRegion = true;
1099 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1100 IsOffloadEntry, CodeGen);
1101 IsInTTDRegion = false;
1102}
1103
1104// Create a unique global variable to indicate the execution mode of this target
1105// region. The execution mode is either 'generic', or 'spmd' depending on the
1106// target directive. This variable is picked up by the offload library to setup
1107// the device appropriately before kernel launch. If the execution mode is
1108// 'generic', the runtime reserves one warp for the master, otherwise, all
1109// warps participate in parallel work.
1110static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
1111 bool Mode) {
1112 auto *GVMode = new llvm::GlobalVariable(
1113 CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1114 llvm::GlobalValue::WeakAnyLinkage,
1115 llvm::ConstantInt::get(CGM.Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD
1116 : OMP_TGT_EXEC_MODE_GENERIC),
1117 Twine(Name, "_exec_mode"));
1118 CGM.addCompilerUsedGlobal(GVMode);
1119}
1120
1121void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
1122 llvm::Constant *Addr,
1123 uint64_t Size, int32_t,
1124 llvm::GlobalValue::LinkageTypes) {
1125 // TODO: Add support for global variables on the device after declare target
1126 // support.
1127 llvm::Function *Fn = dyn_cast<llvm::Function>(Addr);
1128 if (!Fn)
1129 return;
1130
1131 llvm::Module &M = CGM.getModule();
1132 llvm::LLVMContext &Ctx = CGM.getLLVMContext();
1133
1134 // Get "nvvm.annotations" metadata node.
1135 llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
1136
1137 llvm::Metadata *MDVals[] = {
1138 llvm::ConstantAsMetadata::get(Fn), llvm::MDString::get(Ctx, "kernel"),
1139 llvm::ConstantAsMetadata::get(
1140 llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1141 // Append metadata to nvvm.annotations.
1142 MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1143
1144 // Add a function attribute for the kernel.
1145 Fn->addFnAttr(llvm::Attribute::get(Ctx, "kernel"));
1146}
1147
1148void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
1149 const OMPExecutableDirective &D, StringRef ParentName,
1150 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
1151 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
1152 if (!IsOffloadEntry) // Nothing to do.
1153 return;
1154
1155 assert(!ParentName.empty() && "Invalid target region parent name!");
1156
1157 bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
1158 if (Mode)
1159 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1160 CodeGen);
1161 else
1162 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1163 CodeGen);
1164
1165 setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
1166}
1167
1168namespace {
1169LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
1170/// Enum for accessing the reserved_2 field of the ident_t struct.
1171enum ModeFlagsTy : unsigned {
1172 /// Bit set to 1 when in SPMD mode.
1173 KMP_IDENT_SPMD_MODE = 0x01,
1174 /// Bit set to 1 when a simplified runtime is used.
1175 KMP_IDENT_SIMPLE_RT_MODE = 0x02,
1176 LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
1177};
1178
1179/// Special mode Undefined. Is the combination of Non-SPMD mode + SimpleRuntime.
1180static const ModeFlagsTy UndefinedMode =
1181 (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
1182} // anonymous namespace
1183
1184unsigned CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags() const {
1185 switch (getExecutionMode()) {
1186 case EM_SPMD:
1187 if (requiresFullRuntime())
1188 return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
1189 return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
1190 case EM_NonSPMD:
1191 assert(requiresFullRuntime() && "Expected full runtime.");
1192 return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
1193 case EM_Unknown:
1194 return UndefinedMode;
1195 }
1196 llvm_unreachable("Unknown flags are requested.");
1197}
1198
1199CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
1200 : CGOpenMPRuntime(CGM, "_", "$") {
1201 if (!CGM.getLangOpts().OpenMPIsDevice)
1202 llvm_unreachable("OpenMP can only handle device code.");
1203
1204 llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
1205 if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
1206 return;
1207
1208 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
1209 "__omp_rtl_debug_kind");
1210 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
1211 "__omp_rtl_assume_teams_oversubscription");
1212 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
1213 "__omp_rtl_assume_threads_oversubscription");
1214 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
1215 "__omp_rtl_assume_no_thread_state");
1216 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
1217 "__omp_rtl_assume_no_nested_parallelism");
1218}
1219
1220void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
1221 ProcBindKind ProcBind,
1222 SourceLocation Loc) {
1223 // Do nothing in case of SPMD mode and L0 parallel.
1224 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
1225 return;
1226
1227 CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1228}
1229
1230void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
1231 llvm::Value *NumThreads,
1232 SourceLocation Loc) {
1233 // Nothing to do.
1234}
1235
1236void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
1237 const Expr *NumTeams,
1238 const Expr *ThreadLimit,
1239 SourceLocation Loc) {}
1240
1241llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
1242 const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1243 OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1244 // Emit target region as a standalone region.
1245 class NVPTXPrePostActionTy : public PrePostActionTy {
1246 bool &IsInParallelRegion;
1247 bool PrevIsInParallelRegion;
1248
1249 public:
1250 NVPTXPrePostActionTy(bool &IsInParallelRegion)
1251 : IsInParallelRegion(IsInParallelRegion) {}
1252 void Enter(CodeGenFunction &CGF) override {
1253 PrevIsInParallelRegion = IsInParallelRegion;
1254 IsInParallelRegion = true;
1255 }
1256 void Exit(CodeGenFunction &CGF) override {
1257 IsInParallelRegion = PrevIsInParallelRegion;
1258 }
1259 } Action(IsInParallelRegion);
1260 CodeGen.setAction(Action);
1261 bool PrevIsInTTDRegion = IsInTTDRegion;
1262 IsInTTDRegion = false;
1263 bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1264 IsInTargetMasterThreadRegion = false;
1265 auto *OutlinedFun =
1266 cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1267 D, ThreadIDVar, InnermostKind, CodeGen));
1268 IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
1269 IsInTTDRegion = PrevIsInTTDRegion;
1270 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD &&
1271 !IsInParallelRegion) {
1272 llvm::Function *WrapperFun =
1273 createParallelDataSharingWrapper(OutlinedFun, D);
1274 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
1275 }
1276
1277 return OutlinedFun;
1278}
1279
1280/// Get list of lastprivate variables from the teams distribute ... or
1281/// teams {distribute ...} directives.
1282static void
1283getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1284 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1285 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1286 "expected teams directive.");
1287 const OMPExecutableDirective *Dir = &D;
1288 if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
1289 if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
1290 Ctx,
1291 D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
1292 /*IgnoreCaptured=*/true))) {
1293 Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
1294 if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
1295 Dir = nullptr;
1296 }
1297 }
1298 if (!Dir)
1299 return;
1300 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
1301 for (const Expr *E : C->getVarRefs())
1302 Vars.push_back(getPrivateItem(E));
1303 }
1304}
1305
1306/// Get list of reduction variables from the teams ... directives.
1307static void
1308getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1309 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1310 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1311 "expected teams directive.");
1312 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
1313 for (const Expr *E : C->privates())
1314 Vars.push_back(getPrivateItem(E));
1315 }
1316}
1317
1318llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
1319 const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1320 OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1321 SourceLocation Loc = D.getBeginLoc();
1322
1323 const RecordDecl *GlobalizedRD = nullptr;
1324 llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
1325 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
1326 unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
1327 // Globalize team reductions variable unconditionally in all modes.
1328 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1329 getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
1330 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
1331 getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
1332 if (!LastPrivatesReductions.empty()) {
1333 GlobalizedRD = ::buildRecordForGlobalizedVars(
1334 CGM.getContext(), llvm::None, LastPrivatesReductions,
1335 MappedDeclsFields, WarpSize);
1336 }
1337 } else if (!LastPrivatesReductions.empty()) {
1338 assert(!TeamAndReductions.first &&
1339        "Previous team declaration is not expected.");
1340 TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
1341 std::swap(TeamAndReductions.second, LastPrivatesReductions);
1342 }
1343
1344 // Emit target region as a standalone region.
1345 class NVPTXPrePostActionTy : public PrePostActionTy {
1346 SourceLocation &Loc;
1347 const RecordDecl *GlobalizedRD;
1348 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1349 &MappedDeclsFields;
1350
1351 public:
1352 NVPTXPrePostActionTy(
1353 SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1354 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1355 &MappedDeclsFields)
1356 : Loc(Loc), GlobalizedRD(GlobalizedRD),
1357 MappedDeclsFields(MappedDeclsFields) {}
1358 void Enter(CodeGenFunction &CGF) override {
1359 auto &Rt =
1360 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1361 if (GlobalizedRD) {
1362 auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1363 I->getSecond().MappedParams =
1364 std::make_unique<CodeGenFunction::OMPMapVars>();
1365 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1366 for (const auto &Pair : MappedDeclsFields) {
1367 assert(Pair.getFirst()->isCanonicalDecl() &&
1368        "Expected canonical declaration");
1369 Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
1370 }
1371 }
1372 Rt.emitGenericVarsProlog(CGF, Loc);
1373 }
1374 void Exit(CodeGenFunction &CGF) override {
1375 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1376 .emitGenericVarsEpilog(CGF);
1377 }
1378 } Action(Loc, GlobalizedRD, MappedDeclsFields);
1379 CodeGen.setAction(Action);
1380 llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1381 D, ThreadIDVar, InnermostKind, CodeGen);
1382
1383 return OutlinedFun;
1384}
1385
1386void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1387 SourceLocation Loc,
1388 bool WithSPMDCheck) {
1389 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
1390 getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1391 return;
1392
1393 CGBuilderTy &Bld = CGF.Builder;
1394
1395 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1396 if (I == FunctionGlobalizedDecls.end())
1397 return;
1398
1399 for (auto &Rec : I->getSecond().LocalVarData) {
1400 const auto *VD = cast<VarDecl>(Rec.first);
1401 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1402 QualType VarTy = VD->getType();
1403
1404 // Get the local allocation of a firstprivate variable before sharing
1405 llvm::Value *ParValue;
1406 if (EscapedParam) {
1407 LValue ParLVal =
1408 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1409 ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1410 }
1411
1412 // Allocate space for the variable to be globalized
1413 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1414 llvm::CallBase *VoidPtr =
1415 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1416 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1417 AllocArgs, VD->getName());
1418 // FIXME: We should use the variables actual alignment as an argument.
1419 VoidPtr->addRetAttr(llvm::Attribute::get(
1420 CGM.getLLVMContext(), llvm::Attribute::Alignment,
1421 CGM.getContext().getTargetInfo().getNewAlign() / 8));
1422
1423 // Cast the void pointer and get the address of the globalized variable.
1424 llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
1425 llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1426 VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
1427 LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
1428 Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1429 Rec.second.GlobalizedVal = VoidPtr;
1430
1431 // Assign the local allocation to the newly globalized location.
1432 if (EscapedParam) {
1433 CGF.EmitStoreOfScalar(ParValue, VarAddr);
1434 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
1435 }
1436 if (auto *DI = CGF.getDebugInfo())
1437 VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
1438 }
1439 for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
1440 // Use actual memory size of the VLA object including the padding
1441 // for alignment purposes.
1442 llvm::Value *Size = CGF.getTypeSize(VD->getType());
1443 CharUnits Align = CGM.getContext().getDeclAlign(VD);
1444 Size = Bld.CreateNUWAdd(
1445 Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1446 llvm::Value *AlignVal =
1447 llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1448
1449 Size = Bld.CreateUDiv(Size, AlignVal);
1450 Size = Bld.CreateNUWMul(Size, AlignVal);
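// Explanatory comment added for this report (not in the source): the add,
// udiv, and mul above round Size up to a multiple of the declared alignment,
// i.e. Size = ((Size + Align - 1) / Align) * Align. For example, a 13-byte
// VLA with 8-byte alignment is padded to (13 + 7) / 8 * 8 = 16 bytes.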
1451
1452 // Allocate space for this VLA object to be globalized.
1453 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1454 llvm::CallBase *VoidPtr =
1455 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1456 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1457 AllocArgs, VD->getName());
1458 VoidPtr->addRetAttr(
1459 llvm::Attribute::get(CGM.getLLVMContext(), llvm::Attribute::Alignment,
1460 CGM.getContext().getTargetInfo().getNewAlign()));
1461
1462 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(
1463 std::pair<llvm::Value *, llvm::Value *>(
1464 {VoidPtr, CGF.getTypeSize(VD->getType())}));
1465 LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(),
1466 CGM.getContext().getDeclAlign(VD),
1467 AlignmentSource::Decl);
1468 I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
1469 Base.getAddress(CGF));
1470 }
1471 I->getSecond().MappedParams->apply(CGF);
1472}
1473
1474void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
1475 bool WithSPMDCheck) {
1476 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
1477 getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1478 return;
1479
1480 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1481 if (I != FunctionGlobalizedDecls.end()) {
1482 // Deallocate the memory for each globalized VLA object
1483 for (auto AddrSizePair :
1484 llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1485 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1486 CGM.getModule(), OMPRTL___kmpc_free_shared),
1487 {AddrSizePair.first, AddrSizePair.second});
1488 }
1489 // Deallocate the memory for each globalized value
1490 for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
1491 const auto *VD = cast<VarDecl>(Rec.first);
1492 I->getSecond().MappedParams->restore(CGF);
1493
1494 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1495 CGF.getTypeSize(VD->getType())};
1496 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1497 CGM.getModule(), OMPRTL___kmpc_free_shared),
1498 FreeArgs);
1499 }
1500 }
1501}
1502
1503void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1504 const OMPExecutableDirective &D,
1505 SourceLocation Loc,
1506 llvm::Function *OutlinedFn,
1507 ArrayRef<llvm::Value *> CapturedVars) {
1508 if (!CGF.HaveInsertPoint())
1509 return;
1510
1511 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1512 /*Name=*/".zero.addr");
1513 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1514 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1515 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
1516 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1517 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1518 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1519}
1520
1521void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1522 SourceLocation Loc,
1523 llvm::Function *OutlinedFn,
1524 ArrayRef<llvm::Value *> CapturedVars,
1525 const Expr *IfCond,
1526 llvm::Value *NumThreads) {
1527 if (!CGF.HaveInsertPoint())
1528 return;
1529
1530 auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1531 NumThreads](CodeGenFunction &CGF,
1532 PrePostActionTy &Action) {
1533 CGBuilderTy &Bld = CGF.Builder;
1534 llvm::Value *NumThreadsVal = NumThreads;
1535 llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1536 llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
1537 if (WFn)
1538 ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1539 llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
1540
1541 // Create a private scope that will globalize the arguments
1542 // passed from the outside of the target region.
1543 // TODO: Is that needed?
1544 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1545
1546 Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1547 llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
1548 "captured_vars_addrs");
1549 // There's something to share.
1550 if (!CapturedVars.empty()) {
1551 // Prepare for parallel region. Indicate the outlined function.
1552 ASTContext &Ctx = CGF.getContext();
1553 unsigned Idx = 0;
1554 for (llvm::Value *V : CapturedVars) {
1555 Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
1556 llvm::Value *PtrV;
1557 if (V->getType()->isIntegerTy())
1558 PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1559 else
1560 PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
1561 CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1562 Ctx.getPointerType(Ctx.VoidPtrTy));
1563 ++Idx;
1564 }
1565 }
1566
1567 llvm::Value *IfCondVal = nullptr;
1568 if (IfCond)
1569 IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
1570 /* isSigned */ false);
1571 else
1572 IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1573
1574 if (!NumThreadsVal)
1575 NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
1576 else
1577 NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty);
1578
1579 assert(IfCondVal && "Expected a value");
1580 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1581 llvm::Value *Args[] = {
1582 RTLoc,
1583 getThreadID(CGF, Loc),
1584 IfCondVal,
1585 NumThreadsVal,
1586 llvm::ConstantInt::get(CGF.Int32Ty, -1),
1587 FnPtr,
1588 ID,
1589 Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
1590 CGF.VoidPtrPtrTy),
1591 llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1592 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1593 CGM.getModule(), OMPRTL___kmpc_parallel_51),
1594 Args);
1595 };
1596
1597 RegionCodeGenTy RCG(ParallelGen);
1598 RCG(CGF);
1599}
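// Illustrative sketch, not taken from the source or the runtime headers: the
// Args array built in ParallelGen maps onto the __kmpc_parallel_51 entry point
// roughly as below. The parameter names are assumptions inferred from the
// argument order used above.
//
//   void __kmpc_parallel_51(ident_t *loc, int32_t gtid, int32_t if_expr,
//                           int32_t num_threads,   // -1 when not specified
//                           int32_t proc_bind,     // -1 is passed above
//                           void *outlined_fn, void *wrapper_fn,
//                           void **captured_vars_addrs, size_t nargs);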
1600
1601void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1602 // Always emit simple barriers!
1603 if (!CGF.HaveInsertPoint())
1604 return;
1605 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1606 // This function does not use parameters, so we can emit just default values.
1607 llvm::Value *Args[] = {
1608 llvm::ConstantPointerNull::get(
1609 cast<llvm::PointerType>(getIdentTyPointerTy())),
1610 llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
1611 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1612 CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
1613 Args);
1614}
1615
1616void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1617 SourceLocation Loc,
1618 OpenMPDirectiveKind Kind, bool,
1619 bool) {
1620 // Always emit simple barriers!
1621 if (!CGF.HaveInsertPoint())
1622 return;
1623 // Build call __kmpc_barrier(loc, thread_id);
1624 unsigned Flags = getDefaultFlagsForBarriers(Kind);
1625 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1626 getThreadID(CGF, Loc)};
1627
1628 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1629 CGM.getModule(), OMPRTL___kmpc_barrier),
1630 Args);
1631}
1632
1633void CGOpenMPRuntimeGPU::emitCriticalRegion(
1634 CodeGenFunction &CGF, StringRef CriticalName,
1635 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1636 const Expr *Hint) {
1637 llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
1638 llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
1639 llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
1640 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
1641 llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
1642
1643 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1644
1645 // Get the mask of active threads in the warp.
1646 llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1647 CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
1648 // Fetch team-local id of the thread.
1649 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1650
1651 // Get the width of the team.
1652 llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1653
1654 // Initialize the counter variable for the loop.
1655 QualType Int32Ty =
1656 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1657 Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
1658 LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
1659 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
1660 /*isInit=*/true);
1661
1662 // Block checks if loop counter exceeds upper bound.
1663 CGF.EmitBlock(LoopBB);
1664 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1665 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
1666 CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
1667
1668 // Block tests which single thread should execute region, and which threads
1669 // should go straight to synchronisation point.
1670 CGF.EmitBlock(TestBB);
1671 CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1672 llvm::Value *CmpThreadToCounter =
1673 CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
1674 CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
1675
1676 // Block emits the body of the critical region.
1677 CGF.EmitBlock(BodyBB);
1678
1679 // Output the critical statement.
1680 CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1681 Hint);
1682
1683 // After the body surrounded by the critical region, the single executing
1684 // thread will jump to the synchronisation point.
1685 // Block waits for all threads in current team to finish then increments the
1686 // counter variable and returns to the loop.
1687 CGF.EmitBlock(SyncBB);
1688 // Reconverge active threads in the warp.
1689 (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1690 CGM.getModule(), OMPRTL___kmpc_syncwarp),
1691 Mask);
1692
1693 llvm::Value *IncCounterVal =
1694 CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
1695 CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
1696 CGF.EmitBranch(LoopBB);
1697
1698 // Block that is reached when all threads in the team complete the region.
1699 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
1700}
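// Illustrative sketch (not part of the source): the blocks emitted above
// serialise the critical region across the team. In rough device-side
// pseudocode, each thread executes approximately:
//
//   mask = __kmpc_warp_active_thread_mask();
//   for (counter = 0; counter < team_width; ++counter) {
//     if (thread_id == counter)
//       /* critical region body */;
//     __kmpc_syncwarp(mask);   // reconverge before the next thread's turn
//   }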
1701
1702/// Cast value to the specified type.
1703static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1704 QualType ValTy, QualType CastTy,
1705 SourceLocation Loc) {
1706 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1707        "Cast type must sized.");
1708 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1709        "Val type must sized.");
1710 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
1711 if (ValTy == CastTy)
1712 return Val;
1713 if (CGF.getContext().getTypeSizeInChars(ValTy) ==
1714 CGF.getContext().getTypeSizeInChars(CastTy))
1715 return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
1716 if (CastTy->isIntegerType() && ValTy->isIntegerType())
1717 return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
1718 CastTy->hasSignedIntegerRepresentation());
1719 Address CastItem = CGF.CreateMemTemp(CastTy);
1720 Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1721 CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()),
1722 Val->getType());
1723 CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
1724 LValueBaseInfo(AlignmentSource::Type),
1725 TBAAAccessInfo());
1726 return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
1727 LValueBaseInfo(AlignmentSource::Type),
1728 TBAAAccessInfo());
1729}
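// Note (added for this report, not in the source): castValueToType picks the
// cheapest legal conversion. Widening a 2-byte integer to the 4-byte shuffle
// type goes through CreateIntCast; reinterpreting a 4-byte float as a 4-byte
// integer uses CreateBitCast because the sizes match; only mismatched-size,
// non-integer cases take the store/load-through-memory fallback at the end.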
1730
1731/// This function creates calls to one of two shuffle functions to copy
1732/// variables between lanes in a warp.
1733static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
1734 llvm::Value *Elem,
1735 QualType ElemType,
1736 llvm::Value *Offset,
1737 SourceLocation Loc) {
1738 CodeGenModule &CGM = CGF.CGM;
1739 CGBuilderTy &Bld = CGF.Builder;
1740 CGOpenMPRuntimeGPU &RT =
1741 *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
1742 llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
1743
1744 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1745 assert(Size.getQuantity() <= 8 &&
1746        "Unsupported bitwidth in shuffle instruction.");
1747
1748 RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
1749 ? OMPRTL___kmpc_shuffle_int32
1750 : OMPRTL___kmpc_shuffle_int64;
1751
1752 // Cast all types to 32- or 64-bit values before calling shuffle routines.
1753 QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
1754 Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
1755 llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
1756 llvm::Value *WarpSize =
1757 Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
1758
1759 llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
1760 OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
1761 {ElemCast, Offset, WarpSize});
1762
1763 return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
1764}
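// Note (added for this report, not in the source): elements of up to 4 bytes
// are widened to a signed 32-bit value and moved with __kmpc_shuffle_int32;
// 5- to 8-byte elements use __kmpc_shuffle_int64. Anything larger is first
// sliced into shuffle-sized pieces by shuffleAndStore below.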
1765
1766static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
1767 Address DestAddr, QualType ElemType,
1768 llvm::Value *Offset, SourceLocation Loc) {
1769 CGBuilderTy &Bld = CGF.Builder;
1770
1771 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1772 // Create the loop over the big sized data.
1773 // ptr = (void*)Elem;
1774 // ptrEnd = (void*) Elem + 1;
1775 // Step = 8;
1776 // while (ptr + Step < ptrEnd)
1777 // shuffle((int64_t)*ptr);
1778 // Step = 4;
1779 // while (ptr + Step < ptrEnd)
1780 // shuffle((int32_t)*ptr);
1781 // ...
1782 Address ElemPtr = DestAddr;
1783 Address Ptr = SrcAddr;
1784 Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
1785 Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty);
1786 for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
1787 if (Size < CharUnits::fromQuantity(IntSize))
1788 continue;
1789 QualType IntType = CGF.getContext().getIntTypeForBitwidth(
1790 CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
1791 /*Signed=*/1);
1792 llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
1793 Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(),
1794 IntTy);
1795 ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1796 ElemPtr, IntTy->getPointerTo(), IntTy);
1797 if (Size.getQuantity() / IntSize > 1) {
1798 llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
1799 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
1800 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
1801 llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
1802 CGF.EmitBlock(PreCondBB);
1803 llvm::PHINode *PhiSrc =
1804 Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
1805 PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
1806 llvm::PHINode *PhiDest =
1807 Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
1808 PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
1809 Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
1810 ElemPtr =
1811 Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
1812 llvm::Value *PtrDiff = Bld.CreatePtrDiff(
1813 CGF.Int8Ty, PtrEnd.getPointer(),
1814 Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr.getPointer(),
1815 CGF.VoidPtrTy));
1816 Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
1817 ThenBB, ExitBB);
1818 CGF.EmitBlock(ThenBB);
1819 llvm::Value *Res = createRuntimeShuffleFunction(
1820 CGF,
1821 CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1822 LValueBaseInfo(AlignmentSource::Type),
1823 TBAAAccessInfo()),
1824 IntType, Offset, Loc);
1825 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1826 LValueBaseInfo(AlignmentSource::Type),
1827 TBAAAccessInfo());
1828 Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
1829 Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1830 PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
1831 PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
1832 CGF.EmitBranch(PreCondBB);
1833 CGF.EmitBlock(ExitBB);
1834 } else {
1835 llvm::Value *Res = createRuntimeShuffleFunction(
1836 CGF,
1837 CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1838 LValueBaseInfo(AlignmentSource::Type),
1839 TBAAAccessInfo()),
1840 IntType, Offset, Loc);
1841 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1842 LValueBaseInfo(AlignmentSource::Type),
1843 TBAAAccessInfo());
1844 Ptr = Bld.CreateConstGEP(Ptr, 1);
1845 ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1846 }
1847 Size = Size % IntSize;
1848 }
1849}
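// Worked example added for this report (not in the source) of the slicing loop
// above: for a 13-byte element, the IntSize = 8 pass shuffles one 8-byte chunk
// (13 / 8 == 1, so no runtime loop is emitted), leaving Size = 5; the
// IntSize = 4 pass shuffles one 4-byte chunk, leaving Size = 1; IntSize = 2 is
// skipped, and the final IntSize = 1 pass moves the remaining byte.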
1850
1851namespace {
1852enum CopyAction : unsigned {
1853 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1854 // the warp using shuffle instructions.
1855 RemoteLaneToThread,
1856 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1857 ThreadCopy,
1858 // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
1859 ThreadToScratchpad,
1860 // ScratchpadToThread: Copy from a scratchpad array in global memory
1861 // containing team-reduced data to a thread's stack.
1862 ScratchpadToThread,
1863};
1864} // namespace
1865
1866struct CopyOptionsTy {
1867 llvm::Value *RemoteLaneOffset;
1868 llvm::Value *ScratchpadIndex;
1869 llvm::Value *ScratchpadWidth;
1870};
1871
1872/// Emit instructions to copy a Reduce list, which contains partially
1873/// aggregated values, in the specified direction.
1874static void emitReductionListCopy(
1875 CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
1876 ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
1877 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
1878
1879 CodeGenModule &CGM = CGF.CGM;
1880 ASTContext &C = CGM.getContext();
1881 CGBuilderTy &Bld = CGF.Builder;
1882
1883 llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1884 llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
1885 llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
1886
1887 // Iterates, element by element, through the source Reduce list and
1888 // makes a copy of each element.
1889 unsigned Idx = 0;
1890 unsigned Size = Privates.size();
1891 for (const Expr *Private : Privates) {
1892 Address SrcElementAddr = Address::invalid();
1893 Address DestElementAddr = Address::invalid();
1894 Address DestElementPtrAddr = Address::invalid();
1895 // Should we shuffle in an element from a remote lane?
1896 bool ShuffleInElement = false;
1897 // Set to true to update the pointer in the dest Reduce list to a
1898 // newly created element.
1899 bool UpdateDestListPtr = false;
1900 // Increment the src or dest pointer to the scratchpad, for each
1901 // new element.
1902 bool IncrScratchpadSrc = false;
1903 bool IncrScratchpadDest = false;
1904 QualType PrivatePtrType = C.getPointerType(Private->getType());
1905 llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType);
1906
1907 switch (Action) {
1908 case RemoteLaneToThread: {
1909 // Step 1.1: Get the address for the src element in the Reduce list.
1910 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1911 SrcElementAddr =
1912 CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1913 SrcElementPtrAddr, PrivateLlvmPtrType),
1914 PrivatePtrType->castAs<PointerType>());
1915
1916 // Step 1.2: Create a temporary to store the element in the destination
1917 // Reduce list.
1918 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1919 DestElementAddr =
1920 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1921 ShuffleInElement = true;
1922 UpdateDestListPtr = true;
1923 break;
1924 }
1925 case ThreadCopy: {
1926 // Step 1.1: Get the address for the src element in the Reduce list.
1927 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1928 SrcElementAddr =
1929 CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1930 SrcElementPtrAddr, PrivateLlvmPtrType),
1931 PrivatePtrType->castAs<PointerType>());
1932
1933 // Step 1.2: Get the address for dest element. The destination
1934 // element has already been created on the thread's stack.
1935 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1936 DestElementAddr =
1937 CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1938 DestElementPtrAddr, PrivateLlvmPtrType),
1939 PrivatePtrType->castAs<PointerType>());
1940 break;
1941 }
1942 case ThreadToScratchpad: {
1943 // Step 1.1: Get the address for the src element in the Reduce list.
1944 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1945 SrcElementAddr =
1946 CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1947 SrcElementPtrAddr, PrivateLlvmPtrType),
1948 PrivatePtrType->castAs<PointerType>());
1949
1950 // Step 1.2: Get the address for dest element:
1951 // address = base + index * ElementSizeInChars.
1952 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1953 llvm::Value *CurrentOffset =
1954 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1955 llvm::Value *ScratchPadElemAbsolutePtrVal =
1956 Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
1957 ScratchPadElemAbsolutePtrVal =
1958 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1959 DestElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty,
1960 C.getTypeAlignInChars(Private->getType()));
1961 IncrScratchpadDest = true;
1962 break;
1963 }
1964 case ScratchpadToThread: {
1965 // Step 1.1: Get the address for the src element in the scratchpad.
1966 // address = base + index * ElementSizeInChars.
1967 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1968 llvm::Value *CurrentOffset =
1969 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1970 llvm::Value *ScratchPadElemAbsolutePtrVal =
1971 Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
1972 ScratchPadElemAbsolutePtrVal =
1973 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1974 SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty,
1975 C.getTypeAlignInChars(Private->getType()));
1976 IncrScratchpadSrc = true;
1977
1978 // Step 1.2: Create a temporary to store the element in the destination
1979 // Reduce list.
1980 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1981 DestElementAddr =
1982 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1983 UpdateDestListPtr = true;
1984 break;
1985 }
1986 }
1987
1988 // Regardless of src and dest of copy, we emit the load of src
1989 // element as this is required in all directions
1990 SrcElementAddr = Bld.CreateElementBitCast(
1991 SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
1992 DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
1993 SrcElementAddr.getElementType());
1994
1995 // Now that all active lanes have read the element in the
1996 // Reduce list, shuffle over the value from the remote lane.
1997 if (ShuffleInElement) {
1998 shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
1999 RemoteLaneOffset, Private->getExprLoc());
2000 } else {
2001 switch (CGF.getEvaluationKind(Private->getType())) {
2002 case TEK_Scalar: {
2003 llvm::Value *Elem = CGF.EmitLoadOfScalar(
2004 SrcElementAddr, /*Volatile=*/false, Private->getType(),
2005 Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
2006 TBAAAccessInfo());
2007 // Store the source element value to the dest element address.
2008 CGF.EmitStoreOfScalar(
2009 Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
2010 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2011 break;
2012 }
2013 case TEK_Complex: {
2014 CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
2015 CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
2016 Private->getExprLoc());
2017 CGF.EmitStoreOfComplex(
2018 Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
2019 /*isInit=*/false);
2020 break;
2021 }
2022 case TEK_Aggregate:
2023 CGF.EmitAggregateCopy(
2024 CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
2025 CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
2026 Private->getType(), AggValueSlot::DoesNotOverlap);
2027 break;
2028 }
2029 }
2030
2031 // Step 3.1: Modify reference in dest Reduce list as needed.
2032 // Modifying the reference in Reduce list to point to the newly
2033 // created element. The element is live in the current function
2034 // scope and that of functions it invokes (i.e., reduce_function).
2035 // RemoteReduceData[i] = (void*)&RemoteElem
2036 if (UpdateDestListPtr) {
2037 CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
2038 DestElementAddr.getPointer(), CGF.VoidPtrTy),
2039 DestElementPtrAddr, /*Volatile=*/false,
2040 C.VoidPtrTy);
2041 }
2042
2043 // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
2044 // address of the next element in scratchpad memory, unless we're currently
2045 // processing the last one. Memory alignment is also taken care of here.
2046 if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
2047 // FIXME: This code doesn't make any sense, it's trying to perform
2048 // integer arithmetic on pointers.
2049 llvm::Value *ScratchpadBasePtr =
2050 IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
2051 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2052 ScratchpadBasePtr = Bld.CreateNUWAdd(
2053 ScratchpadBasePtr,
2054 Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
2055
2056 // Take care of global memory alignment for performance
2057 ScratchpadBasePtr = Bld.CreateNUWSub(
2058 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2059 ScratchpadBasePtr = Bld.CreateUDiv(
2060 ScratchpadBasePtr,
2061 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
2062 ScratchpadBasePtr = Bld.CreateNUWAdd(
2063 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2064 ScratchpadBasePtr = Bld.CreateNUWMul(
2065 ScratchpadBasePtr,
2066 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
2067
2068 if (IncrScratchpadDest)
2069 DestBase =
2070 Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign());
2071 else /* IncrScratchpadSrc = true */
2072 SrcBase =
2073 Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign());
2074 }
2075
2076 ++Idx;
2077 }
2078}
2079
2080/// This function emits a helper that gathers Reduce lists from the first
2081/// lane of every active warp to lanes in the first warp.
2082///
2083/// void inter_warp_copy_func(void* reduce_data, num_warps)
2084/// shared smem[warp_size];
2085/// For all data entries D in reduce_data:
2086/// sync
2087/// If (I am the first lane in each warp)
2088/// Copy my local D to smem[warp_id]
2089/// sync
2090/// if (I am the first warp)
2091/// Copy smem[thread_id] to my local D
2092static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
2093 ArrayRef<const Expr *> Privates,
2094 QualType ReductionArrayTy,
2095 SourceLocation Loc) {
2096 ASTContext &C = CGM.getContext();
2097 llvm::Module &M = CGM.getModule();
2098
2099 // ReduceList: thread local Reduce list.
2100 // At the stage of the computation when this function is called, partially
2101 // aggregated values reside in the first lane of every active warp.
2102 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2103 C.VoidPtrTy, ImplicitParamDecl::Other);
2104 // NumWarps: number of warps active in the parallel region. This could
2105 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2106 ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2107 C.getIntTypeForBitwidth(32, /* Signed */ true),
2108 ImplicitParamDecl::Other);
2109 FunctionArgList Args;
2110 Args.push_back(&ReduceListArg);
2111 Args.push_back(&NumWarpsArg);
2112
2113 const CGFunctionInfo &CGFI =
2114 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2115 auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
2116 llvm::GlobalValue::InternalLinkage,
2117 "_omp_reduction_inter_warp_copy_func", &M);
2118 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2119 Fn->setDoesNotRecurse();
2120 CodeGenFunction CGF(CGM);
2121 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2122
2123 CGBuilderTy &Bld = CGF.Builder;
2124
2125 // This array is used as a medium to transfer, one reduce element at a time,
2126 // the data from the first lane of every warp to lanes in the first warp
2127 // in order to perform the final step of a reduction in a parallel region
2128 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2129 // for reduced latency, as well as to have a distinct copy for concurrently
2130 // executing target regions. The array is declared with common linkage so
2131 // as to be shared across compilation units.
2132 StringRef TransferMediumName =
2133 "__openmp_nvptx_data_transfer_temporary_storage";
2134 llvm::GlobalVariable *TransferMedium =
2135 M.getGlobalVariable(TransferMediumName);
2136 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
2137 if (!TransferMedium) {
Step 11: Assuming 'TransferMedium' is non-null
Step 12: Taking false branch
2138 auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
2139 unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
2140 TransferMedium = new llvm::GlobalVariable(
2141 M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
2142 llvm::UndefValue::get(Ty), TransferMediumName,
2143 /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
2144 SharedAddressSpace);
2145 CGM.addCompilerUsedGlobal(TransferMedium);
2146 }
2147
2148 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2149 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2150 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
2151 // nvptx_lane_id = nvptx_id % warpsize
2152 llvm::Value *LaneID = getNVPTXLaneID(CGF);
Step 13: Calling 'getNVPTXLaneID'
2153 // nvptx_warp_id = nvptx_id / warpsize
2154 llvm::Value *WarpID = getNVPTXWarpID(CGF);
2155
2156 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2157 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2158 Address LocalReduceList(
2159 Bld.CreatePointerBitCastOrAddrSpaceCast(
2160 CGF.EmitLoadOfScalar(
2161 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
2162 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
2163 ElemTy->getPointerTo()),
2164 ElemTy, CGF.getPointerAlign());
2165
2166 unsigned Idx = 0;
2167 for (const Expr *Private : Privates) {
2168 //
2169 // Warp master copies reduce element to transfer medium in __shared__
2170 // memory.
2171 //
2172 unsigned RealTySize =
2173 C.getTypeSizeInChars(Private->getType())
2174 .alignTo(C.getTypeAlignInChars(Private->getType()))
2175 .getQuantity();
2176 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
2177 unsigned NumIters = RealTySize / TySize;
2178 if (NumIters == 0)
2179 continue;
2180 QualType CType = C.getIntTypeForBitwidth(
2181 C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
2182 llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
2183 CharUnits Align = CharUnits::fromQuantity(TySize);
2184 llvm::Value *Cnt = nullptr;
2185 Address CntAddr = Address::invalid();
2186 llvm::BasicBlock *PrecondBB = nullptr;
2187 llvm::BasicBlock *ExitBB = nullptr;
2188 if (NumIters > 1) {
2189 CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
2190 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
2191 /*Volatile=*/false, C.IntTy);
2192 PrecondBB = CGF.createBasicBlock("precond");
2193 ExitBB = CGF.createBasicBlock("exit");
2194 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
2195 // There is no need to emit line number for unconditional branch.
2196 (void)ApplyDebugLocation::CreateEmpty(CGF);
2197 CGF.EmitBlock(PrecondBB);
2198 Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
2199 llvm::Value *Cmp =
2200 Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
2201 Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
2202 CGF.EmitBlock(BodyBB);
2203 }
2204 // kmpc_barrier.
2205 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
2206 /*EmitChecks=*/false,
2207 /*ForceSimpleCall=*/true);
2208 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2209 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2210 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2211
2212 // if (lane_id == 0)
2213 llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
2214 Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2215 CGF.EmitBlock(ThenBB);
2216
2217 // Reduce element = LocalReduceList[i]
2218 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2219 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2220 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2221 // elemptr = ((CopyType*)(elemptrptr)) + I
2222 Address ElemPtr(ElemPtrPtr, CGF.Int8Ty, Align);
2223 ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
2224 if (NumIters > 1)
2225 ElemPtr = Bld.CreateGEP(ElemPtr, Cnt);
2226
2227 // Get pointer to location in transfer medium.
2228 // MediumPtr = &medium[warp_id]
2229 llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
2230 TransferMedium->getValueType(), TransferMedium,
2231 {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
2232 // Casting to actual data type.
2233 // MediumPtr = (CopyType*)MediumPtrAddr;
2234 Address MediumPtr(
2235 Bld.CreateBitCast(
2236 MediumPtrVal,
2237 CopyType->getPointerTo(
2238 MediumPtrVal->getType()->getPointerAddressSpace())),
2239 CopyType, Align);
2240
2241 // elem = *elemptr
2242 //*MediumPtr = elem
2243 llvm::Value *Elem = CGF.EmitLoadOfScalar(
2244 ElemPtr, /*Volatile=*/false, CType, Loc,
2245 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2246 // Store the source element value to the dest element address.
2247 CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
2248 LValueBaseInfo(AlignmentSource::Type),
2249 TBAAAccessInfo());
2250
2251 Bld.CreateBr(MergeBB);
2252
2253 CGF.EmitBlock(ElseBB);
2254 Bld.CreateBr(MergeBB);
2255
2256 CGF.EmitBlock(MergeBB);
2257
2258 // kmpc_barrier.
2259 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
2260 /*EmitChecks=*/false,
2261 /*ForceSimpleCall=*/true);
2262
2263 //
2264 // Warp 0 copies reduce element from transfer medium.
2265 //
2266 llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
2267 llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
2268 llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
2269
2270 Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
2271 llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
2272 AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
2273
2274 // Up to 32 threads in warp 0 are active.
2275 llvm::Value *IsActiveThread =
2276 Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
2277 Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2278
2279 CGF.EmitBlock(W0ThenBB);
2280
2281 // SrcMediumPtr = &medium[tid]
2282 llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
2283 TransferMedium->getValueType(), TransferMedium,
2284 {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
2285 // SrcMediumVal = *SrcMediumPtr;
2286 Address SrcMediumPtr(
2287 Bld.CreateBitCast(
2288 SrcMediumPtrVal,
2289 CopyType->getPointerTo(
2290 SrcMediumPtrVal->getType()->getPointerAddressSpace())),
2291 CopyType, Align);
2292
2293 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2294 Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2295 llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
2296 TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
2297 Address TargetElemPtr(TargetElemPtrVal, CGF.Int8Ty, Align);
2298 TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
2299 if (NumIters > 1)
2300 TargetElemPtr = Bld.CreateGEP(TargetElemPtr, Cnt);
2301
2302 // *TargetElemPtr = SrcMediumVal;
2303 llvm::Value *SrcMediumValue =
2304 CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
2305 CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
2306 CType);
2307 Bld.CreateBr(W0MergeBB);
2308
2309 CGF.EmitBlock(W0ElseBB);
2310 Bld.CreateBr(W0MergeBB);
2311
2312 CGF.EmitBlock(W0MergeBB);
2313
2314 if (NumIters > 1) {
2315 Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
2316 CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
2317 CGF.EmitBranch(PrecondBB);
2318 (void)ApplyDebugLocation::CreateEmpty(CGF);
2319 CGF.EmitBlock(ExitBB);
2320 }
2321 RealTySize %= TySize;
2322 }
2323 ++Idx;
2324 }
2325
2326 CGF.FinishFunction();
2327 return Fn;
2328}
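// Worked example added for this report (not in the source) of the per-element
// chunking loop above: for a reduce element whose aligned size RealTySize is
// 6 bytes, the TySize = 4 pass copies one 4-byte chunk through the
// shared-memory transfer medium (warp masters store, warp 0 reloads), leaving
// RealTySize = 2; the TySize = 2 pass copies the remaining 2 bytes, and the
// TySize = 1 pass is skipped. Each pass is bracketed by the two kmpc_barrier
// calls emitted above.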
2329
2330/// Emit a helper that reduces data across two OpenMP threads (lanes)
2331/// in the same warp. It uses shuffle instructions to copy over data from
2332/// a remote lane's stack. The reduction algorithm performed is specified
2333/// by the fourth parameter.
2334///
2335/// Algorithm Versions.
2336/// Full Warp Reduce (argument value 0):
2337/// This algorithm assumes that all 32 lanes are active and gathers
2338/// data from these 32 lanes, producing a single resultant value.
2339/// Contiguous Partial Warp Reduce (argument value 1):
2340/// This algorithm assumes that only a *contiguous* subset of lanes
2341/// are active. This happens for the last warp in a parallel region
2342/// when the user specified num_threads is not an integer multiple of
2343/// 32. This contiguous subset always starts with the zeroth lane.
2344/// Partial Warp Reduce (argument value 2):
2345/// This algorithm gathers data from any number of lanes at any position.
2346/// All reduced values are stored in the lowest possible lane. The set
2347/// of problems every algorithm addresses is a super set of those
2348/// addressable by algorithms with a lower version number. Overhead
2349/// increases as algorithm version increases.
2350///
2351/// Terminology
2352/// Reduce element:
2353/// Reduce element refers to the individual data field with primitive
2354/// data types to be combined and reduced across threads.
2355/// Reduce list:
2356/// Reduce list refers to a collection of local, thread-private
2357/// reduce elements.
2358/// Remote Reduce list:
2359/// Remote Reduce list refers to a collection of remote (relative to
2360/// the current thread) reduce elements.
2361///
2362/// We distinguish between three states of threads that are important to
2363/// the implementation of this function.
2364/// Alive threads:
2365/// Threads in a warp executing the SIMT instruction, as distinguished from
2366/// threads that are inactive due to divergent control flow.
2367/// Active threads:
2368/// The minimal set of threads that has to be alive upon entry to this
2369/// function. The computation is correct iff active threads are alive.
2370/// Some threads are alive but they are not active because they do not
2371/// contribute to the computation in any useful manner. Turning them off
2372/// may introduce control flow overheads without any tangible benefits.
2373/// Effective threads:
2374/// In order to comply with the argument requirements of the shuffle
2375/// function, we must keep all lanes holding data alive. But at most
2376/// half of them perform value aggregation; we refer to this half of
2377/// threads as effective. The other half is simply handing off their
2378/// data.
2379///
2380/// Procedure
2381/// Value shuffle:
2382/// In this step active threads transfer data from higher lane positions
2383/// in the warp to lower lane positions, creating Remote Reduce list.
2384/// Value aggregation:
2385/// In this step, effective threads combine their thread local Reduce list
2386/// with Remote Reduce list and store the result in the thread local
2387/// Reduce list.
2388/// Value copy:
2389/// In this step, we deal with the assumption made by algorithm 2
2390/// (i.e. contiguity assumption). When we have an odd number of lanes
2391/// active, say 2k+1, only k threads will be effective and therefore k
2392/// new values will be produced. However, the Reduce list owned by the
2393/// (2k+1)th thread is ignored in the value aggregation. Therefore
2394/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
2395/// that the contiguity assumption still holds.
2396static llvm::Function *emitShuffleAndReduceFunction(
2397 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2398 QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
2399 ASTContext &C = CGM.getContext();
2400
2401 // Thread local Reduce list used to host the values of data to be reduced.
2402 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2403 C.VoidPtrTy, ImplicitParamDecl::Other);
2404 // Current lane id; could be logical.
2405 ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2406 ImplicitParamDecl::Other);
2407 // Offset of the remote source lane relative to the current lane.
2408 ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2409 C.ShortTy, ImplicitParamDecl::Other);
2410 // Algorithm version. This is expected to be known at compile time.
2411 ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2412 C.ShortTy, ImplicitParamDecl::Other);
2413 FunctionArgList Args;
2414 Args.push_back(&ReduceListArg);
2415 Args.push_back(&LaneIDArg);
2416 Args.push_back(&RemoteLaneOffsetArg);
2417 Args.push_back(&AlgoVerArg);
2418
2419 const CGFunctionInfo &CGFI =
2420 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2421 auto *Fn = llvm::Function::Create(
2422 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2423 "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2424 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2425 Fn->setDoesNotRecurse();
2426
2427 CodeGenFunction CGF(CGM);
2428 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2429
2430 CGBuilderTy &Bld = CGF.Builder;
2431
2432 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2433 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2434 Address LocalReduceList(
2435 Bld.CreatePointerBitCastOrAddrSpaceCast(
2436 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2437 C.VoidPtrTy, SourceLocation()),
2438 ElemTy->getPointerTo()),
2439 ElemTy, CGF.getPointerAlign());
2440
2441 Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2442 llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2443 AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2444
2445 Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2446 llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2447 AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2448
2449 Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2450 llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2451 AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2452
2453 // Create a local thread-private variable to host the Reduce list
2454 // from a remote lane.
2455 Address RemoteReduceList =
2456 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2457
2458 // This loop iterates through the list of reduce elements and copies,
2459 // element by element, from a remote lane in the warp to RemoteReduceList,
2460 // hosted on the thread's stack.
2461 emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
2462 LocalReduceList, RemoteReduceList,
2463 {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
2464 /*ScratchpadIndex=*/nullptr,
2465 /*ScratchpadWidth=*/nullptr});
2466
2467 // The actions to be performed on the Remote Reduce list are dependent
2468 // on the algorithm version.
2469 //
2470 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2471 // LaneId % 2 == 0 && Offset > 0):
2472 // do the reduction value aggregation
2473 //
2474 // The thread local variable Reduce list is mutated in place to host the
2475 // reduced data, which is the aggregated value produced from local and
2476 // remote lanes.
2477 //
2478 // Note that AlgoVer is expected to be a constant integer known at compile
2479 // time.
2480 // When AlgoVer==0, the first conjunction evaluates to true, making
2481 // the entire predicate true during compile time.
2482 // When AlgoVer==1, only the second part of the second conjunction is
2483 // evaluated at runtime. The other conjunctions evaluate to false
2484 // at compile time.
2485 // When AlgoVer==2, only the second part of the third conjunction is
2486 // evaluated at runtime. The other conjunctions evaluate to false
2487 // at compile time.
2488 llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
2489
2490 llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2491 llvm::Value *CondAlgo1 = Bld.CreateAnd(
2492 Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
2493
2494 llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
2495 llvm::Value *CondAlgo2 = Bld.CreateAnd(
2496 Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
2497 CondAlgo2 = Bld.CreateAnd(
2498 CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
2499
2500 llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
2501 CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
2502
2503 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2504 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2505 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2506 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
2507
2508 CGF.EmitBlock(ThenBB);
2509 // reduce_function(LocalReduceList, RemoteReduceList)
2510 llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2511 LocalReduceList.getPointer(), CGF.VoidPtrTy);
2512 llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2513 RemoteReduceList.getPointer(), CGF.VoidPtrTy);
2514 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2515 CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
2516 Bld.CreateBr(MergeBB);
2517
2518 CGF.EmitBlock(ElseBB);
2519 Bld.CreateBr(MergeBB);
2520
2521 CGF.EmitBlock(MergeBB);
2522
2523 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2524 // Reduce list.
2525 Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2526 llvm::Value *CondCopy = Bld.CreateAnd(
2527 Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
2528
2529 llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
2530 llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
2531 llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
2532 Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2533
2534 CGF.EmitBlock(CpyThenBB);
2535 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
2536 RemoteReduceList, LocalReduceList);
2537 Bld.CreateBr(CpyMergeBB);
2538
2539 CGF.EmitBlock(CpyElseBB);
2540 Bld.CreateBr(CpyMergeBB);
2541
2542 CGF.EmitBlock(CpyMergeBB);
2543
2544 CGF.FinishFunction();
2545 return Fn;
2546}
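// Illustrative sketch (not part of the source): the reduce and copy predicates
// built with CreateAnd/CreateOr above correspond to the following helper,
// written here only to make the algorithm-version logic easier to read.
//
//   static bool shouldReduce(uint16_t AlgoVer, uint16_t LaneId,
//                            int16_t Offset) {
//     return AlgoVer == 0 ||                                // full-warp reduce
//            (AlgoVer == 1 && LaneId < (uint16_t)Offset) || // contiguous subset
//            (AlgoVer == 2 && (LaneId & 1) == 0 && Offset > 0);
//   }
//   static bool shouldCopyRemote(uint16_t AlgoVer, uint16_t LaneId,
//                                int16_t Offset) {
//     return AlgoVer == 1 && LaneId >= (uint16_t)Offset;    // keep contiguity
//   }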
2547
2548/// This function emits a helper that copies all the reduction variables from
2549/// the team into the provided global buffer for the reduction variables.
2550///
2551/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
2552/// For all data entries D in reduce_data:
2553/// Copy local D to buffer.D[Idx]
2554static llvm::Value *emitListToGlobalCopyFunction(
2555 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2556 QualType ReductionArrayTy, SourceLocation Loc,
2557 const RecordDecl *TeamReductionRec,
2558 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2559 &VarFieldMap) {
2560 ASTContext &C = CGM.getContext();
2561
2562 // Buffer: global reduction buffer.
2563 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2564 C.VoidPtrTy, ImplicitParamDecl::Other);
2565 // Idx: index of the buffer.
2566 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2567 ImplicitParamDecl::Other);
2568 // ReduceList: thread local Reduce list.
2569 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2570 C.VoidPtrTy, ImplicitParamDecl::Other);
2571 FunctionArgList Args;
2572 Args.push_back(&BufferArg);
2573 Args.push_back(&IdxArg);
2574 Args.push_back(&ReduceListArg);
2575
2576 const CGFunctionInfo &CGFI =
2577 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2578 auto *Fn = llvm::Function::Create(
2579 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2580 "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
2581 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2582 Fn->setDoesNotRecurse();
2583 CodeGenFunction CGF(CGM);
2584 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2585
2586 CGBuilderTy &Bld = CGF.Builder;
2587
2588 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2589 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2590 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2591 Address LocalReduceList(
2592 Bld.CreatePointerBitCastOrAddrSpaceCast(
2593 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2594 C.VoidPtrTy, Loc),
2595 ElemTy->getPointerTo()),
2596 ElemTy, CGF.getPointerAlign());
2597 QualType StaticTy = C.getRecordType(TeamReductionRec);
2598 llvm::Type *LLVMReductionsBufferTy =
2599 CGM.getTypes().ConvertTypeForMem(StaticTy);
2600 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2601 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2602 LLVMReductionsBufferTy->getPointerTo());
2603 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2604 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2605 /*Volatile=*/false, C.IntTy,
2606 Loc)};
2607 unsigned Idx = 0;
2608 for (const Expr *Private : Privates) {
2609 // Reduce element = LocalReduceList[i]
2610 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2611 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2612 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2613 // elemptr = ((CopyType*)(elemptrptr)) + I
2614 ElemTy = CGF.ConvertTypeForMem(Private->getType());
2615 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2616 ElemPtrPtr, ElemTy->getPointerTo());
2617 Address ElemPtr =
2618 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2619 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2620 // Global = Buffer.VD[Idx];
2621 const FieldDecl *FD = VarFieldMap.lookup(VD);
2622 LValue GlobLVal = CGF.EmitLValueForField(
2623 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2624 Address GlobAddr = GlobLVal.getAddress(CGF);
2625 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(),
2626 GlobAddr.getPointer(), Idxs);
2627 GlobLVal.setAddress(Address(BufferPtr,
2628 CGF.ConvertTypeForMem(Private->getType()),
2629 GlobAddr.getAlignment()));
2630 switch (CGF.getEvaluationKind(Private->getType())) {
2631 case TEK_Scalar: {
2632 llvm::Value *V = CGF.EmitLoadOfScalar(
2633 ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
2634 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2635 CGF.EmitStoreOfScalar(V, GlobLVal);
2636 break;
2637 }
2638 case TEK_Complex: {
2639 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
2640 CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
2641 CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
2642 break;
2643 }
2644 case TEK_Aggregate:
2645 CGF.EmitAggregateCopy(GlobLVal,
2646 CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2647 Private->getType(), AggValueSlot::DoesNotOverlap);
2648 break;
2649 }
2650 ++Idx;
2651 }
2652
2653 CGF.FinishFunction();
2654 return Fn;
2655}
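
For a concrete picture of what this emitted helper does at run time, a hand-written equivalent for the two-variable reduce_data used in the design comment below might look as follows. The TeamBuffer layout, the element types, and the 1024 slot count are illustrative assumptions, not the actual record built by buildRecordForGlobalizedVars.

// Illustrative global buffer layout: one slot per team for each variable.
struct TeamBuffer {
  float foo[1024];    // 1024 stands in for OpenMPCUDAReductionBufNum
  double bar[1024];
};

// Rough equivalent of the emitted _omp_reduction_list_to_global_copy_func:
// copy each thread-local entry D of reduce_data into buffer.D[Idx].
extern "C" void list_to_global_copy(void *buffer, int Idx, void *reduce_data) {
  auto *Buf = static_cast<TeamBuffer *>(buffer);
  void **Data = static_cast<void **>(reduce_data);  // array of element pointers
  Buf->foo[Idx] = *static_cast<float *>(Data[0]);
  Buf->bar[Idx] = *static_cast<double *>(Data[1]);
}
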
2656
2657/// This function emits a helper that reduces all the reduction variables from
2658/// the team into the provided global buffer for the reduction variables.
2659///
2660/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
2661/// void *GlobPtrs[];
2662/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2663/// ...
2664/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2665/// reduce_function(GlobPtrs, reduce_data);
2666static llvm::Value *emitListToGlobalReduceFunction(
2667 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2668 QualType ReductionArrayTy, SourceLocation Loc,
2669 const RecordDecl *TeamReductionRec,
2670 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2671 &VarFieldMap,
2672 llvm::Function *ReduceFn) {
2673 ASTContext &C = CGM.getContext();
2674
2675 // Buffer: global reduction buffer.
2676 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2677 C.VoidPtrTy, ImplicitParamDecl::Other);
2678 // Idx: index of the buffer.
2679 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2680 ImplicitParamDecl::Other);
2681 // ReduceList: thread local Reduce list.
2682 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2683 C.VoidPtrTy, ImplicitParamDecl::Other);
2684 FunctionArgList Args;
2685 Args.push_back(&BufferArg);
2686 Args.push_back(&IdxArg);
2687 Args.push_back(&ReduceListArg);
2688
2689 const CGFunctionInfo &CGFI =
2690 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2691 auto *Fn = llvm::Function::Create(
2692 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2693 "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
2694 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2695 Fn->setDoesNotRecurse();
2696 CodeGenFunction CGF(CGM);
2697 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2698
2699 CGBuilderTy &Bld = CGF.Builder;
2700
2701 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2702 QualType StaticTy = C.getRecordType(TeamReductionRec);
2703 llvm::Type *LLVMReductionsBufferTy =
2704 CGM.getTypes().ConvertTypeForMem(StaticTy);
2705 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2706 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2707 LLVMReductionsBufferTy->getPointerTo());
2708
2709 // 1. Build a list of reduction variables.
2710 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2711 Address ReductionList =
2712 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2713 auto IPriv = Privates.begin();
2714 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2715 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2716 /*Volatile=*/false, C.IntTy,
2717 Loc)};
2718 unsigned Idx = 0;
2719 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2720 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2721 // Global = Buffer.VD[Idx];
2722 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2723 const FieldDecl *FD = VarFieldMap.lookup(VD);
2724 LValue GlobLVal = CGF.EmitLValueForField(
2725 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2726 Address GlobAddr = GlobLVal.getAddress(CGF);
2727 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2728 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2729 llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
2730 CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
2731 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2732 // Store array size.
2733 ++Idx;
2734 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2735 llvm::Value *Size = CGF.Builder.CreateIntCast(
2736 CGF.getVLASize(
2737 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2738 .NumElts,
2739 CGF.SizeTy, /*isSigned=*/false);
2740 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2741 Elem);
2742 }
2743 }
2744
2745 // Call reduce_function(GlobalReduceList, ReduceList)
2746 llvm::Value *GlobalReduceList =
2747 CGF.EmitCastToVoidPtr(ReductionList.getPointer());
2748 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2749 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2750 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2751 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2752 CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
2753 CGF.FinishFunction();
2754 return Fn;
2755}
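
The reduce variant emitted here does not copy; it builds a second reduce list that points into the global buffer slot and hands both lists to the generated reduce_function. A hand-written equivalent, under the same illustrative TeamBuffer layout as the previous sketch:

struct TeamBuffer { float foo[1024]; double bar[1024]; };  // illustrative

// Placeholder for the compiler-generated combiner; it takes two opaque
// reduce lists (LHS, RHS).
extern "C" void reduce_function(void *lhs_list, void *rhs_list);

// Rough equivalent of _omp_reduction_list_to_global_reduce_func.
extern "C" void list_to_global_reduce(void *buffer, int Idx, void *reduce_data) {
  auto *Buf = static_cast<TeamBuffer *>(buffer);
  void *GlobPtrs[2] = {&Buf->foo[Idx], &Buf->bar[Idx]};
  reduce_function(GlobPtrs, reduce_data);  // the global slot is the LHS here
}
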
2756
2757/// This function emits a helper that copies all the reduction variables from
2758/// the provided global buffer back into the team's thread-local reduce list.
2759///
2760/// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
2761/// For all data entries D in reduce_data:
2762/// Copy buffer.D[Idx] to local D;
2763static llvm::Value *emitGlobalToListCopyFunction(
2764 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2765 QualType ReductionArrayTy, SourceLocation Loc,
2766 const RecordDecl *TeamReductionRec,
2767 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2768 &VarFieldMap) {
2769 ASTContext &C = CGM.getContext();
2770
2771 // Buffer: global reduction buffer.
2772 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2773 C.VoidPtrTy, ImplicitParamDecl::Other);
2774 // Idx: index of the buffer.
2775 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2776 ImplicitParamDecl::Other);
2777 // ReduceList: thread local Reduce list.
2778 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2779 C.VoidPtrTy, ImplicitParamDecl::Other);
2780 FunctionArgList Args;
2781 Args.push_back(&BufferArg);
2782 Args.push_back(&IdxArg);
2783 Args.push_back(&ReduceListArg);
2784
2785 const CGFunctionInfo &CGFI =
2786 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2787 auto *Fn = llvm::Function::Create(
2788 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2789 "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
2790 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2791 Fn->setDoesNotRecurse();
2792 CodeGenFunction CGF(CGM);
2793 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2794
2795 CGBuilderTy &Bld = CGF.Builder;
2796
2797 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2798 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2799 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2800 Address LocalReduceList(
2801 Bld.CreatePointerBitCastOrAddrSpaceCast(
2802 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2803 C.VoidPtrTy, Loc),
2804 ElemTy->getPointerTo()),
2805 ElemTy, CGF.getPointerAlign());
2806 QualType StaticTy = C.getRecordType(TeamReductionRec);
2807 llvm::Type *LLVMReductionsBufferTy =
2808 CGM.getTypes().ConvertTypeForMem(StaticTy);
2809 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2810 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2811 LLVMReductionsBufferTy->getPointerTo());
2812
2813 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2814 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2815 /*Volatile=*/false, C.IntTy,
2816 Loc)};
2817 unsigned Idx = 0;
2818 for (const Expr *Private : Privates) {
2819 // Reduce element = LocalReduceList[i]
2820 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2821 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2822 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2823 // elemptr = ((CopyType*)(elemptrptr)) + I
2824 ElemTy = CGF.ConvertTypeForMem(Private->getType());
2825 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2826 ElemPtrPtr, ElemTy->getPointerTo());
2827 Address ElemPtr =
2828 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2829 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2830 // Global = Buffer.VD[Idx];
2831 const FieldDecl *FD = VarFieldMap.lookup(VD);
2832 LValue GlobLVal = CGF.EmitLValueForField(
2833 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2834 Address GlobAddr = GlobLVal.getAddress(CGF);
2835 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(),
2836 GlobAddr.getPointer(), Idxs);
2837 GlobLVal.setAddress(Address(BufferPtr,
2838 CGF.ConvertTypeForMem(Private->getType()),
2839 GlobAddr.getAlignment()));
2840 switch (CGF.getEvaluationKind(Private->getType())) {
2841 case TEK_Scalar: {
2842 llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
2843 CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
2844 LValueBaseInfo(AlignmentSource::Type),
2845 TBAAAccessInfo());
2846 break;
2847 }
2848 case TEK_Complex: {
2849 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
2850 CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2851 /*isInit=*/false);
2852 break;
2853 }
2854 case TEK_Aggregate:
2855 CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2856 GlobLVal, Private->getType(),
2857 AggValueSlot::DoesNotOverlap);
2858 break;
2859 }
2860 ++Idx;
2861 }
2862
2863 CGF.FinishFunction();
2864 return Fn;
2865}
2866
2867/// This function emits a helper that reduces the reduction variables held in
2868/// the provided global buffer into the team's thread-local reduce data.
2869///
2870/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
2871/// void *GlobPtrs[];
2872/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2873/// ...
2874/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2875/// reduce_function(reduce_data, GlobPtrs);
2876static llvm::Value *emitGlobalToListReduceFunction(
2877 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2878 QualType ReductionArrayTy, SourceLocation Loc,
2879 const RecordDecl *TeamReductionRec,
2880 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2881 &VarFieldMap,
2882 llvm::Function *ReduceFn) {
2883 ASTContext &C = CGM.getContext();
2884
2885 // Buffer: global reduction buffer.
2886 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2887 C.VoidPtrTy, ImplicitParamDecl::Other);
2888 // Idx: index of the buffer.
2889 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2890 ImplicitParamDecl::Other);
2891 // ReduceList: thread local Reduce list.
2892 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2893 C.VoidPtrTy, ImplicitParamDecl::Other);
2894 FunctionArgList Args;
2895 Args.push_back(&BufferArg);
2896 Args.push_back(&IdxArg);
2897 Args.push_back(&ReduceListArg);
2898
2899 const CGFunctionInfo &CGFI =
2900 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2901 auto *Fn = llvm::Function::Create(
2902 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2903 "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
2904 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2905 Fn->setDoesNotRecurse();
2906 CodeGenFunction CGF(CGM);
2907 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2908
2909 CGBuilderTy &Bld = CGF.Builder;
2910
2911 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2912 QualType StaticTy = C.getRecordType(TeamReductionRec);
2913 llvm::Type *LLVMReductionsBufferTy =
2914 CGM.getTypes().ConvertTypeForMem(StaticTy);
2915 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2916 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2917 LLVMReductionsBufferTy->getPointerTo());
2918
2919 // 1. Build a list of reduction variables.
2920 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2921 Address ReductionList =
2922 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2923 auto IPriv = Privates.begin();
2924 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2925 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2926 /*Volatile=*/false, C.IntTy,
2927 Loc)};
2928 unsigned Idx = 0;
2929 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2930 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2931 // Global = Buffer.VD[Idx];
2932 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2933 const FieldDecl *FD = VarFieldMap.lookup(VD);
2934 LValue GlobLVal = CGF.EmitLValueForField(
2935 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2936 Address GlobAddr = GlobLVal.getAddress(CGF);
2937 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2938 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2939 llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
2940 CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
2941 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2942 // Store array size.
2943 ++Idx;
2944 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2945 llvm::Value *Size = CGF.Builder.CreateIntCast(
2946 CGF.getVLASize(
2947 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2948 .NumElts,
2949 CGF.SizeTy, /*isSigned=*/false);
2950 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2951 Elem);
2952 }
2953 }
2954
2955 // Call reduce_function(ReduceList, GlobalReduceList)
2956 llvm::Value *GlobalReduceList =
2957 CGF.EmitCastToVoidPtr(ReductionList.getPointer());
2958 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2959 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2960 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2961 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2962 CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
2963 CGF.FinishFunction();
2964 return Fn;
2965}
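
Taken together with the two list_to_global helpers above, the global_to_list pair emitted here is the mirror image: data flows from the buffer slot back into the thread-local list, and the buffer pointers become the second argument of reduce_function. A hand-written equivalent under the same illustrative layout:

struct TeamBuffer { float foo[1024]; double bar[1024]; };  // illustrative
extern "C" void reduce_function(void *lhs_list, void *rhs_list);

// Rough equivalent of _omp_reduction_global_to_list_copy_func.
extern "C" void global_to_list_copy(void *buffer, int Idx, void *reduce_data) {
  auto *Buf = static_cast<TeamBuffer *>(buffer);
  void **Data = static_cast<void **>(reduce_data);
  *static_cast<float *>(Data[0]) = Buf->foo[Idx];   // buffer.D[Idx] -> local D
  *static_cast<double *>(Data[1]) = Buf->bar[Idx];
}

// Rough equivalent of _omp_reduction_global_to_list_reduce_func; note the
// reversed argument order compared to the list_to_global variant.
extern "C" void global_to_list_reduce(void *buffer, int Idx, void *reduce_data) {
  auto *Buf = static_cast<TeamBuffer *>(buffer);
  void *GlobPtrs[2] = {&Buf->foo[Idx], &Buf->bar[Idx]};
  reduce_function(reduce_data, GlobPtrs);
}
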
2966
2967///
2968/// Design of OpenMP reductions on the GPU
2969///
2970/// Consider a typical OpenMP program with one or more reduction
2971/// clauses:
2972///
2973/// float foo;
2974/// double bar;
2975/// #pragma omp target teams distribute parallel for \
2976/// reduction(+:foo) reduction(*:bar)
2977/// for (int i = 0; i < N; i++) {
2978/// foo += A[i]; bar *= B[i];
2979/// }
2980///
2981/// where 'foo' and 'bar' are reduced across all OpenMP threads in
2982/// all teams. In our OpenMP implementation on the NVPTX device an
2983/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
2984/// within a team are mapped to CUDA threads within a threadblock.
2985/// Our goal is to efficiently aggregate values across all OpenMP
2986/// threads such that:
2987///
2988/// - the compiler and runtime are logically concise, and
2989/// - the reduction is performed efficiently in a hierarchical
2990/// manner as follows: within OpenMP threads in the same warp,
2991/// across warps in a threadblock, and finally across teams on
2992/// the NVPTX device.
2993///
2994/// Introduction to Decoupling
2995///
2996/// We would like to decouple the compiler and the runtime so that the
2997/// latter is ignorant of the reduction variables (number, data types)
2998/// and the reduction operators. This allows a simpler interface
2999/// and implementation while still attaining good performance.
3000///
3001/// Pseudocode for the aforementioned OpenMP program generated by the
3002/// compiler is as follows:
3003///
3004/// 1. Create private copies of reduction variables on each OpenMP
3005/// thread: 'foo_private', 'bar_private'
3006/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
3007/// to it and writes the result in 'foo_private' and 'bar_private'
3008/// respectively.
3009/// 3. Call the OpenMP runtime on the GPU to reduce within a team
3010/// and store the result on the team master:
3011///
3012/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
3013/// reduceData, shuffleReduceFn, interWarpCpyFn)
3014///
3015/// where:
3016/// struct ReduceData {
3017/// float *foo;
3018/// double *bar;
3019/// } reduceData
3020/// reduceData.foo = &foo_private
3021/// reduceData.bar = &bar_private
3022///
3023/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
3024/// auxiliary functions generated by the compiler that operate on
3025/// variables of type 'ReduceData'. They help the runtime perform the
3026/// algorithmic steps in a data-agnostic manner.
3027///
3028/// 'shuffleReduceFn' is a pointer to a function that reduces data
3029/// of type 'ReduceData' across two OpenMP threads (lanes) in the
3030/// same warp. It takes the following arguments as input:
3031///
3032/// a. variable of type 'ReduceData' on the calling lane,
3033/// b. its lane_id,
3034/// c. an offset relative to the current lane_id to generate a
3035/// remote_lane_id. The remote lane contains the second
3036/// variable of type 'ReduceData' that is to be reduced.
3037/// d. an algorithm version parameter determining which reduction
3038/// algorithm to use.
3039///
3040/// 'shuffleReduceFn' retrieves data from the remote lane using
3041/// efficient GPU shuffle intrinsics and reduces, using the
3042/// algorithm specified by the 4th parameter, the two operands
3043/// element-wise. The result is written to the first operand.
3044///
3045/// Different reduction algorithms are implemented in different
3046/// runtime functions, all calling 'shuffleReduceFn' to perform
3047/// the essential reduction step. Therefore, based on the 4th
3048/// parameter, this function behaves slightly differently to
3049/// cooperate with the runtime to ensure correctness under
3050/// different circumstances.
3051///
3052/// 'InterWarpCpyFn' is a pointer to a function that transfers
3053/// reduced variables across warps. It tunnels, through CUDA
3054/// shared memory, the thread-private data of type 'ReduceData'
3055/// from lane 0 of each warp to a lane in the first warp.
3056/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
3057/// The last team writes the global reduced value to memory.
3058///
3059/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
3060/// reduceData, shuffleReduceFn, interWarpCpyFn,
3061/// scratchpadCopyFn, loadAndReduceFn)
3062///
3063/// 'scratchpadCopyFn' is a helper that stores reduced
3064/// data from the team master to a scratchpad array in
3065/// global memory.
3066///
3067/// 'loadAndReduceFn' is a helper that loads data from
3068/// the scratchpad array and reduces it with the input
3069/// operand.
3070///
3071/// These compiler generated functions hide address
3072/// calculation and alignment information from the runtime.
3073/// 5. if ret == 1:
3074/// The team master of the last team stores the reduced
3075/// result to the globals in memory.
3076/// foo += reduceData.foo; bar *= reduceData.bar
3077///
3078///
3079/// Warp Reduction Algorithms
3080///
3081/// On the warp level, we have three algorithms implemented in the
3082/// OpenMP runtime depending on the number of active lanes:
3083///
3084/// Full Warp Reduction
3085///
3086/// The reduce algorithm within a warp where all lanes are active
3087/// is implemented in the runtime as follows:
3088///
3089/// full_warp_reduce(void *reduce_data,
3090/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3091/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
3092/// ShuffleReduceFn(reduce_data, 0, offset, 0);
3093/// }
3094///
3095/// The algorithm completes in log(2, WARPSIZE) steps.
3096///
3097/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
3098/// not used; we therefore save instructions by not retrieving lane_id
3099/// from the corresponding special registers. The 4th parameter, which
3100/// represents the version of the algorithm being used, is set to 0 to
3101/// signify full warp reduction.
3102///
3103/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3104///
3105/// #reduce_elem refers to an element in the local lane's data structure
3106/// #remote_elem is retrieved from a remote lane
3107/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3108/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
3109///
3110/// Contiguous Partial Warp Reduction
3111///
3112/// This reduce algorithm is used within a warp where only the first
3113/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
3114/// number of OpenMP threads in a parallel region is not a multiple of
3115/// WARPSIZE. The algorithm is implemented in the runtime as follows:
3116///
3117/// void
3118/// contiguous_partial_reduce(void *reduce_data,
3119/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
3120/// int size, int lane_id) {
3121/// int curr_size;
3122/// int offset;
3123/// curr_size = size;
3124/// offset = curr_size/2;
3125/// while (offset>0) {
3126/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
3127/// curr_size = (curr_size+1)/2;
3128/// offset = curr_size/2;
3129/// }
3130/// }
3131///
3132/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3133///
3134/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3135/// if (lane_id < offset)
3136/// reduce_elem = reduce_elem REDUCE_OP remote_elem
3137/// else
3138/// reduce_elem = remote_elem
3139///
3140/// This algorithm assumes that the data to be reduced are located in a
3141/// contiguous subset of lanes starting from the first. When there is
3142/// an odd number of active lanes, the data in the last lane is not
3143/// aggregated with any other lane's data but is instead copied over.
3144///
3145/// Dispersed Partial Warp Reduction
3146///
3147/// This algorithm is used within a warp when any discontiguous subset of
3148/// lanes are active. It is used to implement the reduction operation
3149/// across lanes in an OpenMP simd region or in a nested parallel region.
3150///
3151/// void
3152/// dispersed_partial_reduce(void *reduce_data,
3153/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3154/// int size, remote_id;
3155/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
3156/// do {
3157/// remote_id = next_active_lane_id_right_after_me();
3158/// # the above function returns 0 if no active lane
3159/// # is present right after the current lane.
3160/// size = number_of_active_lanes_in_this_warp();
3161/// logical_lane_id /= 2;
3162/// ShuffleReduceFn(reduce_data, logical_lane_id,
3163/// remote_id-1-threadIdx.x, 2);
3164/// } while (logical_lane_id % 2 == 0 && size > 1);
3165/// }
3166///
3167/// There is no assumption made about the initial state of the reduction.
3168/// Any number of lanes (>=1) could be active at any position. The reduction
3169/// result is returned in the first active lane.
3170///
3171/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3172///
3173/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3174/// if (lane_id % 2 == 0 && offset > 0)
3175/// reduce_elem = reduce_elem REDUCE_OP remote_elem
3176/// else
3177/// reduce_elem = remote_elem
3178///
3179///
3180/// Intra-Team Reduction
3181///
3182/// This function, as implemented in the runtime call
3183/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
3184/// threads in a team. It first reduces within a warp using the
3185/// aforementioned algorithms. We then proceed to gather all such
3186/// reduced values at the first warp.
3187///
3188/// The runtime makes use of the function 'InterWarpCpyFn', which copies
3189/// data from each of the "warp masters" (the zeroth lane of each warp, where
3190/// warp-reduced data is held) to the zeroth warp. This step reduces (in
3191/// a mathematical sense) the problem of reduction across warp masters in
3192/// a block to the problem of warp reduction.
3193///
3194///
3195/// Inter-Team Reduction
3196///
3197/// Once a team has reduced its data to a single value, it is stored in
3198/// a global scratchpad array. Since each team has a distinct slot, this
3199/// can be done without locking.
3200///
3201/// The last team to write to the scratchpad array proceeds to reduce the
3202/// scratchpad array. One or more workers in the last team use the helper
3203/// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
3204/// the k'th worker reduces every k'th element.
3205///
3206/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
3207/// reduce across workers and compute a globally reduced value.
3208///
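
As a concrete illustration of the log2(WARPSIZE)-step structure of the full-warp algorithm described above, the following host-side simulation models shuffle_down as reading the element 'offset' lanes to the right. It is purely illustrative; the real runtime uses GPU shuffle intrinsics and runs all lanes in lock-step.

#include <algorithm>
#include <cstdio>

constexpr int WARPSIZE = 32;

// Host model of full_warp_reduce for a '+' reduction over one double per lane.
static void full_warp_reduce(double lanes[WARPSIZE]) {
  for (int offset = WARPSIZE / 2; offset > 0; offset /= 2) {
    double snapshot[WARPSIZE];
    std::copy(lanes, lanes + WARPSIZE, snapshot);  // lock-step read phase
    for (int lane = 0; lane + offset < WARPSIZE; ++lane)
      lanes[lane] += snapshot[lane + offset];      // reduce_elem REDUCE_OP remote_elem
  }
}

int main() {
  double lanes[WARPSIZE];
  std::fill(lanes, lanes + WARPSIZE, 1.0);
  full_warp_reduce(lanes);
  std::printf("lane 0 holds %g after log2(32) = 5 steps\n", lanes[0]);  // 32
  return 0;
}
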
3209void CGOpenMPRuntimeGPU::emitReduction(
3210 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
3211 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
3212 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
3213 if (!CGF.HaveInsertPoint())
1
Taking false branch
3214 return;
3215
3216 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
3217#ifndef NDEBUG
3218 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
3219#endif
3220
3221 if (Options.SimpleReduction) {
2
Assuming field 'SimpleReduction' is false
3222    assert(!TeamsReduction && !ParallelReduction &&
3223           "Invalid reduction selection in emitReduction.");
3224 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
3225 ReductionOps, Options);
3226 return;
3227 }
3228
3229  assert((TeamsReduction || ParallelReduction) &&
3
Taking false branch
4
Assuming 'TeamsReduction' is false
5
Assuming 'ParallelReduction' is true
6
'?' condition is true
3230         "Invalid reduction selection in emitReduction.");
3231
3232 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3233 // RedList, shuffle_reduce_func, interwarp_copy_func);
3234 // or
3235 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3236 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
3237 llvm::Value *ThreadId = getThreadID(CGF, Loc);
3238
3239 llvm::Value *Res;
3240 ASTContext &C = CGM.getContext();
3241 // 1. Build a list of reduction variables.
3242 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3243 auto Size = RHSExprs.size();
3244 for (const Expr *E : Privates) {
7
Assuming '__begin1' is equal to '__end1'
3245 if (E->getType()->isVariablyModifiedType())
3246 // Reserve place for array size.
3247 ++Size;
3248 }
3249 llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
3250 QualType ReductionArrayTy =
3251 C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
3252 /*IndexTypeQuals=*/0);
3253 Address ReductionList =
3254 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3255 auto IPriv = Privates.begin();
3256 unsigned Idx = 0;
3257 for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
8
Assuming 'I' is >= 'E'
9
Loop condition is false. Execution continues on line 3277
3258 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3259 CGF.Builder.CreateStore(
3260 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3261 CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
3262 Elem);
3263 if ((*IPriv)->getType()->isVariablyModifiedType()) {
3264 // Store array size.
3265 ++Idx;
3266 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3267 llvm::Value *Size = CGF.Builder.CreateIntCast(
3268 CGF.getVLASize(
3269 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3270 .NumElts,
3271 CGF.SizeTy, /*isSigned=*/false);
3272 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3273 Elem);
3274 }
3275 }
3276
3277 llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3278 ReductionList.getPointer(), CGF.VoidPtrTy);
3279 llvm::Function *ReductionFn =
3280 emitReductionFunction(Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
3281 Privates, LHSExprs, RHSExprs, ReductionOps);
3282 llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
3283 llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
3284 CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
3285 llvm::Value *InterWarpCopyFn =
3286 emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
10
Calling 'emitInterWarpCopyFunction'
3287
3288 if (ParallelReduction) {
3289 llvm::Value *Args[] = {RTLoc,
3290 ThreadId,
3291 CGF.Builder.getInt32(RHSExprs.size()),
3292 ReductionArrayTySize,
3293 RL,
3294 ShuffleAndReduceFn,
3295 InterWarpCopyFn};
3296
3297 Res = CGF.EmitRuntimeCall(
3298 OMPBuilder.getOrCreateRuntimeFunction(
3299 CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
3300 Args);
3301 } else {
3302    assert(TeamsReduction && "expected teams reduction.");
3303 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
3304 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
3305 int Cnt = 0;
3306 for (const Expr *DRE : Privates) {
3307 PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
3308 ++Cnt;
3309 }
3310 const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
3311 CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
3312 C.getLangOpts().OpenMPCUDAReductionBufNum);
3313 TeamsReductions.push_back(TeamReductionRec);
3314 if (!KernelTeamsReductionPtr) {
3315 KernelTeamsReductionPtr = new llvm::GlobalVariable(
3316 CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
3317 llvm::GlobalValue::InternalLinkage, nullptr,
3318 "_openmp_teams_reductions_buffer_$_$ptr");
3319 }
3320 llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
3321 Address(KernelTeamsReductionPtr, CGF.VoidPtrTy, CGM.getPointerAlign()),
3322 /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
3323 llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
3324 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3325 llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
3326 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3327 ReductionFn);
3328 llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
3329 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3330 llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
3331 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3332 ReductionFn);
3333
3334 llvm::Value *Args[] = {
3335 RTLoc,
3336 ThreadId,
3337 GlobalBufferPtr,
3338 CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
3339 RL,
3340 ShuffleAndReduceFn,
3341 InterWarpCopyFn,
3342 GlobalToBufferCpyFn,
3343 GlobalToBufferRedFn,
3344 BufferToGlobalCpyFn,
3345 BufferToGlobalRedFn};
3346
3347 Res = CGF.EmitRuntimeCall(
3348 OMPBuilder.getOrCreateRuntimeFunction(
3349 CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
3350 Args);
3351 }
3352
3353 // 5. Build if (res == 1)
3354 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
3355 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
3356 llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
3357 Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
3358 CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3359
3360 // 6. Build then branch: where we have reduced values in the master
3361 // thread in each team.
3362 // __kmpc_end_reduce{_nowait}(<gtid>);
3363 // break;
3364 CGF.EmitBlock(ThenBB);
3365
3366 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3367 auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
3368 this](CodeGenFunction &CGF, PrePostActionTy &Action) {
3369 auto IPriv = Privates.begin();
3370 auto ILHS = LHSExprs.begin();
3371 auto IRHS = RHSExprs.begin();
3372 for (const Expr *E : ReductionOps) {
3373 emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
3374 cast<DeclRefExpr>(*IRHS));
3375 ++IPriv;
3376 ++ILHS;
3377 ++IRHS;
3378 }
3379 };
3380 llvm::Value *EndArgs[] = {ThreadId};
3381 RegionCodeGenTy RCG(CodeGen);
3382 NVPTXActionTy Action(
3383 nullptr, llvm::None,
3384 OMPBuilder.getOrCreateRuntimeFunction(
3385 CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
3386 EndArgs);
3387 RCG.setAction(Action);
3388 RCG(CGF);
3389 // There is no need to emit line number for unconditional branch.
3390 (void)ApplyDebugLocation::CreateEmpty(CGF);
3391 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
3392}
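
To make steps 5 and 6 annotated in the code above concrete, the emitted "res == 1" branch amounts to the following for the foo/bar example from the design comment. The runtime declaration is a simplified stand-in inferred from the single ThreadId argument passed above, not the actual device-runtime prototype.

#include <cstdint>

// Simplified stand-in; only the ThreadId argument is passed above.
extern "C" void __kmpc_nvptx_end_reduce_nowait(int32_t gtid);

// Only the winning master thread folds the private copies into the original
// variables and ends the reduction; everyone else falls through to the
// ".omp.reduction.done" block.
static void finish_reduction(int32_t res, int32_t gtid, float &foo, double &bar,
                             float foo_private, double bar_private) {
  if (res == 1) {
    foo += foo_private;   // reduction(+:foo)
    bar *= bar_private;   // reduction(*:bar)
    __kmpc_nvptx_end_reduce_nowait(gtid);
  }
}
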
3393
3394const VarDecl *
3395CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
3396 const VarDecl *NativeParam) const {
3397 if (!NativeParam->getType()->isReferenceType())
3398 return NativeParam;
3399 QualType ArgType = NativeParam->getType();
3400 QualifierCollector QC;
3401 const Type *NonQualTy = QC.strip(ArgType);
3402 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3403 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
3404 if (Attr->getCaptureKind() == OMPC_map) {
3405 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3406 LangAS::opencl_global);
3407 }
3408 }
3409 ArgType = CGM.getContext().getPointerType(PointeeTy);
3410 QC.addRestrict();
3411 enum { NVPTX_local_addr = 5 };
3412 QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
3413 ArgType = QC.apply(CGM.getContext(), ArgType);
3414 if (isa<ImplicitParamDecl>(NativeParam))
3415 return ImplicitParamDecl::Create(
3416 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3417 NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
3418 return ParmVarDecl::Create(
3419 CGM.getContext(),
3420 const_cast<DeclContext *>(NativeParam->getDeclContext()),
3421 NativeParam->getBeginLoc(), NativeParam->getLocation(),
3422 NativeParam->getIdentifier(), ArgType,
3423 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3424}
3425
3426Address
3427CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
3428 const VarDecl *NativeParam,
3429 const VarDecl *TargetParam) const {
3430  assert(NativeParam != TargetParam &&
3431         NativeParam->getType()->isReferenceType() &&
3432         "Native arg must not be the same as target arg.");
3433 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
3434 QualType NativeParamType = NativeParam->getType();
3435 QualifierCollector QC;
3436 const Type *NonQualTy = QC.strip(NativeParamType);
3437 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3438 unsigned NativePointeeAddrSpace =
3439 CGF.getContext().getTargetAddressSpace(NativePointeeTy);
3440 QualType TargetTy = TargetParam->getType();
3441 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
3442 LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
3443 // First cast to generic.
3444 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3445 TargetAddr, llvm::PointerType::getWithSamePointeeType(
3446 cast<llvm::PointerType>(TargetAddr->getType()), /*AddrSpace=*/0));
3447 // Cast from generic to native address space.
3448 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3449 TargetAddr, llvm::PointerType::getWithSamePointeeType(
3450 cast<llvm::PointerType>(TargetAddr->getType()),
3451 NativePointeeAddrSpace));
3452 Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3453 CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
3454 NativeParamType);
3455 return NativeParamAddr;
3456}
3457
3458void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3459 CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3460 ArrayRef<llvm::Value *> Args) const {
3461 SmallVector<llvm::Value *, 4> TargetArgs;
3462 TargetArgs.reserve(Args.size());
3463 auto *FnType = OutlinedFn.getFunctionType();
3464 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3465 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3466 TargetArgs.append(std::next(Args.begin(), I), Args.end());
3467 break;
3468 }
3469 llvm::Type *TargetType = FnType->getParamType(I);
3470 llvm::Value *NativeArg = Args[I];
3471 if (!TargetType->isPointerTy()) {
3472 TargetArgs.emplace_back(NativeArg);
3473 continue;
3474 }
3475 llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3476 NativeArg, llvm::PointerType::getWithSamePointeeType(
3477 cast<llvm::PointerType>(NativeArg->getType()), /*AddrSpace*/ 0));
3478 TargetArgs.emplace_back(
3479 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
3480 }
3481 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
3482}
3483
3484/// Emit function which wraps the outline parallel region
3485/// and controls the arguments which are passed to this function.
3486/// The wrapper ensures that the outlined function is called
3487/// with the correct arguments when data is shared.
3488llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3489 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
3490 ASTContext &Ctx = CGM.getContext();
3491 const auto &CS = *D.getCapturedStmt(OMPD_parallel);
3492
3493 // Create a function that takes as argument the source thread.
3494 FunctionArgList WrapperArgs;
3495 QualType Int16QTy =
3496 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
3497 QualType Int32QTy =
3498 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
3499 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3500 /*Id=*/nullptr, Int16QTy,
3501 ImplicitParamDecl::Other);
3502 ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3503 /*Id=*/nullptr, Int32QTy,
3504 ImplicitParamDecl::Other);
3505 WrapperArgs.emplace_back(&ParallelLevelArg);
3506 WrapperArgs.emplace_back(&WrapperArg);
3507
3508 const CGFunctionInfo &CGFI =
3509 CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
3510
3511 auto *Fn = llvm::Function::Create(
3512 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3513 Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
3514
3515 // Ensure we do not inline the function. This is trivially true for the ones
3516  // passed to __kmpc_fork_call but the ones called in serialized regions
3517  // could be inlined. This is not perfect but it is closer to the invariant
3518 // we want, namely, every data environment starts with a new function.
3519 // TODO: We should pass the if condition to the runtime function and do the
3520 // handling there. Much cleaner code.
3521 Fn->addFnAttr(llvm::Attribute::NoInline);
3522
3523 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3524 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
3525 Fn->setDoesNotRecurse();
3526
3527 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
3528 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
3529 D.getBeginLoc(), D.getBeginLoc());
3530
3531 const auto *RD = CS.getCapturedRecordDecl();
3532 auto CurField = RD->field_begin();
3533
3534 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
3535 /*Name=*/".zero.addr");
3536 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
3537 // Get the array of arguments.
3538 SmallVector<llvm::Value *, 8> Args;
3539
3540 Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
3541 Args.emplace_back(ZeroAddr.getPointer());
3542
3543 CGBuilderTy &Bld = CGF.Builder;
3544 auto CI = CS.capture_begin();
3545
3546 // Use global memory for data sharing.
3547 // Handle passing of global args to workers.
3548 Address GlobalArgs =
3549 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
3550 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
3551 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3552 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3553 CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
3554 DataSharingArgs);
3555
3556 // Retrieve the shared variables from the list of references returned
3557 // by the runtime. Pass the variables to the outlined function.
3558 Address SharedArgListAddress = Address::invalid();
3559 if (CS.capture_size() > 0 ||
3560 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3561 SharedArgListAddress = CGF.EmitLoadOfPointer(
3562 GlobalArgs, CGF.getContext()
3563 .getPointerType(CGF.getContext().VoidPtrTy)
3564 .castAs<PointerType>());
3565 }
3566 unsigned Idx = 0;
3567 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3568 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3569 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3570 Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3571 llvm::Value *LB = CGF.EmitLoadOfScalar(
3572 TypedAddress,
3573 /*Volatile=*/false,
3574 CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
3575 cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
3576 Args.emplace_back(LB);
3577 ++Idx;
3578 Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3579 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3580 Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3581 llvm::Value *UB = CGF.EmitLoadOfScalar(
3582 TypedAddress,
3583 /*Volatile=*/false,
3584 CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
3585 cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
3586 Args.emplace_back(UB);
3587 ++Idx;
3588 }
3589 if (CS.capture_size() > 0) {
3590 ASTContext &CGFContext = CGF.getContext();
3591 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3592 QualType ElemTy = CurField->getType();
3593 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
3594 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3595 Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
3596 CGF.ConvertTypeForMem(ElemTy));
3597 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3598 /*Volatile=*/false,
3599 CGFContext.getPointerType(ElemTy),
3600 CI->getLocation());
3601 if (CI->capturesVariableByCopy() &&
3602 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
3603 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3604 CI->getLocation());
3605 }
3606 Args.emplace_back(Arg);
3607 }
3608 }
3609
3610 emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
3611 CGF.FinishFunction();
3612 return Fn;
3613}
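
A rough source-level picture of the wrapper emitted here, for a parallel region that captures a single int by reference. Both declarations are simplified stand-ins: the real shared-variable query and the outlined function's signature depend on the captured record.

#include <cstdint>

// Stand-ins: the runtime fills *shared_args with the list of shared-variable
// pointers; the outlined function's trailing parameters mirror the captures.
extern "C" void __kmpc_get_shared_variables(void ***shared_args);
extern "C" void outlined_parallel_fn(int32_t *gtid_addr, int32_t *bound_tid,
                                     int *captured_x);

// Shape of the generated "<outlined>_wrapper(ParallelLevel, tid)".
extern "C" void outlined_parallel_fn_wrapper(uint16_t /*ParallelLevel*/,
                                             int32_t gtid) {
  int32_t zero = 0;                     // the ".zero.addr" temporary
  void **shared_args = nullptr;
  __kmpc_get_shared_variables(&shared_args);
  int *captured_x = static_cast<int *>(shared_args[0]);
  outlined_parallel_fn(&gtid, &zero, captured_x);
}
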
3614
3615void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
3616 const Decl *D) {
3617 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3618 return;
3619
3620  assert(D && "Expected function or captured|block decl.");
3621  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
3622         "Function is registered already.");
3623  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
3624         "Team is set but not processed.");
3625 const Stmt *Body = nullptr;
3626 bool NeedToDelayGlobalization = false;
3627 if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
3628 Body = FD->getBody();
3629 } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
3630 Body = BD->getBody();
3631 } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
3632 Body = CD->getBody();
3633 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
3634 if (NeedToDelayGlobalization &&
3635 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
3636 return;
3637 }
3638 if (!Body)
3639 return;
3640 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
3641 VarChecker.Visit(Body);
3642 const RecordDecl *GlobalizedVarsRecord =
3643 VarChecker.getGlobalizedRecord(IsInTTDRegion);
3644 TeamAndReductions.first = nullptr;
3645 TeamAndReductions.second.clear();
3646 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
3647 VarChecker.getEscapedVariableLengthDecls();
3648 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
3649 return;
3650 auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
3651 I->getSecond().MappedParams =
3652 std::make_unique<CodeGenFunction::OMPMapVars>();
3653 I->getSecond().EscapedParameters.insert(
3654 VarChecker.getEscapedParameters().begin(),
3655 VarChecker.getEscapedParameters().end());
3656 I->getSecond().EscapedVariableLengthDecls.append(
3657 EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
3658 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3659 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3660    assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3661 Data.insert(std::make_pair(VD, MappedVarData()));
3662 }
3663 if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
3664 CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
3665 VarChecker.Visit(Body);
3666 I->getSecond().SecondaryLocalVarData.emplace();
3667 DeclToAddrMapTy &Data = *I->getSecond().SecondaryLocalVarData;
3668 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3669      assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3670 Data.insert(std::make_pair(VD, MappedVarData()));
3671 }
3672 }
3673 if (!NeedToDelayGlobalization) {
3674 emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
3675 struct GlobalizationScope final : EHScopeStack::Cleanup {
3676 GlobalizationScope() = default;
3677
3678 void Emit(CodeGenFunction &CGF, Flags flags) override {
3679 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
3680 .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
3681 }
3682 };
3683 CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
3684 }
3685}
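
For orientation, a minimal example of the situation this prolog handles in Generic data-sharing mode: a local variable that escapes into a nested parallel region is picked up by CheckVarsEscapingDeclContext and its storage is globalized for the team, with the matching epilog pushed as a cleanup. The snippet is an illustrative sketch, not a test from the tree.

// 'x' is declared in the sequential part of the target region but written
// inside the nested parallel region, so it escapes its declaring context and
// must live in globalized (team-visible) storage rather than a private slot.
void escaping_local_example(int *out) {
  #pragma omp target map(from: out[0:1])
  {
    int x = 0;
    #pragma omp parallel
    {
      #pragma omp atomic
      x += 1;
    }
    out[0] = x;
  }
}
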
3686
3687Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
3688 const VarDecl *VD) {
3689 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
3690 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3691 auto AS = LangAS::Default;
3692 switch (A->getAllocatorType()) {
3693 // Use the default allocator here as by default local vars are
3694 // threadlocal.
3695 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3696 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3697 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3698 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3699 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3700 // Follow the user decision - use default allocation.
3701 return Address::invalid();
3702 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3703      // TODO: implement support for user-defined allocators.
3704 return Address::invalid();
3705 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3706 AS = LangAS::cuda_constant;
3707 break;
3708 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3709 AS = LangAS::cuda_shared;
3710 break;
3711 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3712 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3713 break;
3714 }
3715 llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
3716 auto *GV = new llvm::GlobalVariable(
3717 CGM.getModule(), VarTy, /*isConstant=*/false,
3718 llvm::GlobalValue::InternalLinkage, llvm::Constant::getNullValue(VarTy),
3719 VD->getName(),
3720 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
3721 CGM.getContext().getTargetAddressSpace(AS));
3722 CharUnits Align = CGM.getContext().getDeclAlign(VD);
3723 GV->setAlignment(Align.getAsAlign());
3724 return Address(
3725 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3726 GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
3727 VD->getType().getAddressSpace()))),
3728 VarTy, Align);
3729 }
3730
3731 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3732 return Address::invalid();
3733
3734 VD = VD->getCanonicalDecl();
3735 auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
3736 if (I == FunctionGlobalizedDecls.end())
3737 return Address::invalid();
3738 auto VDI = I->getSecond().LocalVarData.find(VD);
3739 if (VDI != I->getSecond().LocalVarData.end())
3740 return VDI->second.PrivateAddr;
3741 if (VD->hasAttrs()) {
3742 for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
3743 E(VD->attr_end());
3744 IT != E; ++IT) {
3745 auto VDI = I->getSecond().LocalVarData.find(
3746 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3747 ->getCanonicalDecl());
3748 if (VDI != I->getSecond().LocalVarData.end())
3749 return VDI->second.PrivateAddr;
3750 }
3751 }
3752
3753 return Address::invalid();
3754}
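
A short sketch of the allocator mapping handled above, with illustrative variable names: the pteam allocator places the variable in CUDA shared memory, the const allocator in CUDA constant memory, while the thread/default allocators return Address::invalid() and fall back to ordinary private storage.

#include <omp.h>

void allocator_mapping_example() {
  #pragma omp target
  {
    int team_scratch;  // placed in CUDA shared memory (cuda_shared)
    #pragma omp allocate(team_scratch) allocator(omp_pteam_mem_alloc)
    team_scratch = 0;

    #pragma omp parallel
    {
      #pragma omp atomic
      team_scratch += 1;   // visible to the whole team
    }
  }
}
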
3755
3756void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
3757 FunctionGlobalizedDecls.erase(CGF.CurFn);
3758 CGOpenMPRuntime::functionFinished(CGF);
3759}
3760
3761void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
3762 CodeGenFunction &CGF, const OMPLoopDirective &S,
3763 OpenMPDistScheduleClauseKind &ScheduleKind,
3764 llvm::Value *&Chunk) const {
3765 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
3766 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
3767 ScheduleKind = OMPC_DIST_SCHEDULE_static;
3768 Chunk = CGF.EmitScalarConversion(
3769 RT.getGPUNumThreads(CGF),
3770 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3771 S.getIterationVariable()->getType(), S.getBeginLoc());
3772 return;
3773 }
3774 CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
3775 CGF, S, ScheduleKind, Chunk);
3776}
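// Effect sketch: under EM_SPMD the default distribute schedule behaves like
//   dist_schedule(static, <hardware threads per block>)
// with the chunk taken from getGPUNumThreads() and converted to the loop's
// iteration-variable type; in generic mode the base-class default is used.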
3777
3778void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
3779 CodeGenFunction &CGF, const OMPLoopDirective &S,
3780 OpenMPScheduleClauseKind &ScheduleKind,
3781 const Expr *&ChunkExpr) const {
3782 ScheduleKind = OMPC_SCHEDULE_static;
3783 // Chunk size is 1 in this case.
3784 llvm::APInt ChunkSize(32, 1);
3785 ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
3786 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3787 SourceLocation());
3788}
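// Effect sketch: the default worksharing-loop schedule on the device is the
// equivalent of schedule(static, 1), expressed here as a 32-bit unsigned
// integer literal chunk of 1.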
3789
3790void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
3791 CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
3792 assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
3793 " Expected target-based directive.");
3794 const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
3795 for (const CapturedStmt::Capture &C : CS->captures()) {
3796 // Capture variables captured by reference in lambdas for target-based
3797 // directives.
3798 if (!C.capturesVariable())
3799 continue;
3800 const VarDecl *VD = C.getCapturedVar();
3801 const auto *RD = VD->getType()
3802 .getCanonicalType()
3803 .getNonReferenceType()
3804 ->getAsCXXRecordDecl();
3805 if (!RD || !RD->isLambda())
3806 continue;
3807 Address VDAddr = CGF.GetAddrOfLocalVar(VD);
3808 LValue VDLVal;
3809 if (VD->getType().getCanonicalType()->isReferenceType())
3810 VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
3811 else
3812 VDLVal = CGF.MakeAddrLValue(
3813 VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
3814 llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
3815 FieldDecl *ThisCapture = nullptr;
3816 RD->getCaptureFields(Captures, ThisCapture);
3817 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
3818 LValue ThisLVal =
3819 CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
3820 llvm::Value *CXXThis = CGF.LoadCXXThis();
3821 CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
3822 }
3823 for (const LambdaCapture &LC : RD->captures()) {
3824 if (LC.getCaptureKind() != LCK_ByRef)
3825 continue;
3826 const ValueDecl *VD = LC.getCapturedVar();
3827 // FIXME: For now VD is always a VarDecl because OpenMP does not support
3828 // capturing structured bindings in lambdas yet.
3829 if (!CS->capturesVariable(cast<VarDecl>(VD)))
3830 continue;
3831 auto It = Captures.find(VD);
3832 assert(It != Captures.end() && "Found lambda capture without field.");
3833 LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
3834 Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
3835 if (VD->getType().getCanonicalType()->isReferenceType())
3836 VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
3837 VD->getType().getCanonicalType())
3838 .getAddress(CGF);
3839 CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
3840 }
3841 }
3842}
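// A minimal sketch of the handled case (hypothetical user code):
//
//   int a = 0;
//   auto l = [&a]() { a = 1; };
//   #pragma omp target
//   l();
//
// Since 'a' is captured by reference inside the lambda, the loop above stores
// the device-side address of the mapped 'a' into the lambda's reference
// field; a captured 'this' pointer is refreshed the same way.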
3843
3844bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
3845 LangAS &AS) {
3846 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
3847 return false;
3848 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3849 switch(A->getAllocatorType()) {
3850 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3851 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3852 // Not supported; fall back to the default memory space.
3853 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3854 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3855 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3856 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3857 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3858 AS = LangAS::Default;
3859 return true;
3860 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3861 AS = LangAS::cuda_constant;
3862 return true;
3863 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3864 AS = LangAS::cuda_shared;
3865 return true;
3866 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3867 llvm_unreachable("Expected predefined allocator for the variables with the "
3868 "static storage.");
3869 }
3870 return false;
3871}
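// Summary sketch of the mapping established above for static-storage
// variables:
//   omp_const_mem_alloc -> LangAS::cuda_constant
//   omp_pteam_mem_alloc -> LangAS::cuda_shared
//   remaining predefined allocators -> LangAS::Default
// A user-defined allocator is not expected to reach this point, hence the
// llvm_unreachable above.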
3872
3873 // Get the current CudaArch, ignoring any unknown values.
3874static CudaArch getCudaArch(CodeGenModule &CGM) {
3875 if (!CGM.getTarget().hasFeature("ptx"))
3876 return CudaArch::UNKNOWN;
3877 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
3878 if (Feature.getValue()) {
3879 CudaArch Arch = StringToCudaArch(Feature.getKey());
3880 if (Arch != CudaArch::UNKNOWN)
3881 return Arch;
3882 }
3883 }
3884 return CudaArch::UNKNOWN;
3885}
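// Illustrative example (assuming a typical NVPTX offload build): compiling
// the device pass for sm_70 puts an enabled "sm_70" entry in the feature map,
// StringToCudaArch("sm_70") yields CudaArch::SM_70, and that value is
// returned; targets without the "ptx" feature report CudaArch::UNKNOWN.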
3886
3887 /// Check whether the target architecture supports unified addressing, which
3888 /// is a prerequisite for the OpenMP requires clause "unified_shared_memory".
3889void CGOpenMPRuntimeGPU::processRequiresDirective(
3890 const OMPRequiresDecl *D) {
3891 for (const OMPClause *Clause : D->clauselists()) {
3892 if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
3893 CudaArch Arch = getCudaArch(CGM);
3894 switch (Arch) {
3895 case CudaArch::SM_20:
3896 case CudaArch::SM_21:
3897 case CudaArch::SM_30:
3898 case CudaArch::SM_32:
3899 case CudaArch::SM_35:
3900 case CudaArch::SM_37:
3901 case CudaArch::SM_50:
3902 case CudaArch::SM_52:
3903 case CudaArch::SM_53: {
3904 SmallString<256> Buffer;
3905 llvm::raw_svector_ostream Out(Buffer);
3906 Out << "Target architecture " << CudaArchToString(Arch)
3907 << " does not support unified addressing";
3908 CGM.Error(Clause->getBeginLoc(), Out.str());
3909 return;
3910 }
3911 case CudaArch::SM_60:
3912 case CudaArch::SM_61:
3913 case CudaArch::SM_62:
3914 case CudaArch::SM_70:
3915 case CudaArch::SM_72:
3916 case CudaArch::SM_75:
3917 case CudaArch::SM_80:
3918 case CudaArch::SM_86:
3919 case CudaArch::GFX600:
3920 case CudaArch::GFX601:
3921 case CudaArch::GFX602:
3922 case CudaArch::GFX700:
3923 case CudaArch::GFX701:
3924 case CudaArch::GFX702:
3925 case CudaArch::GFX703:
3926 case CudaArch::GFX704:
3927 case CudaArch::GFX705:
3928 case CudaArch::GFX801:
3929 case CudaArch::GFX802:
3930 case CudaArch::GFX803:
3931 case CudaArch::GFX805:
3932 case CudaArch::GFX810:
3933 case CudaArch::GFX900:
3934 case CudaArch::GFX902:
3935 case CudaArch::GFX904:
3936 case CudaArch::GFX906:
3937 case CudaArch::GFX908:
3938 case CudaArch::GFX909:
3939 case CudaArch::GFX90a:
3940 case CudaArch::GFX90c:
3941 case CudaArch::GFX940:
3942 case CudaArch::GFX1010:
3943 case CudaArch::GFX1011:
3944 case CudaArch::GFX1012:
3945 case CudaArch::GFX1013:
3946 case CudaArch::GFX1030:
3947 case CudaArch::GFX1031:
3948 case CudaArch::GFX1032:
3949 case CudaArch::GFX1033:
3950 case CudaArch::GFX1034:
3951 case CudaArch::GFX1035:
3952 case CudaArch::GFX1036:
3953 case CudaArch::GFX1100:
3954 case CudaArch::GFX1101:
3955 case CudaArch::GFX1102:
3956 case CudaArch::GFX1103:
3957 case CudaArch::Generic:
3958 case CudaArch::UNUSED:
3959 case CudaArch::UNKNOWN:
3960 break;
3961 case CudaArch::LAST:
3962 llvm_unreachable("Unexpected Cuda arch.");
3963 }
3964 }
3965 }
3966 CGOpenMPRuntime::processRequiresDirective(D);
3967}
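// Example of the diagnosed case (hypothetical user code): compiling
//   #pragma omp requires unified_shared_memory
// for an sm_35 device falls into the first group of cases and emits
// "Target architecture sm_35 does not support unified addressing"; sm_60 and
// newer, and the listed AMDGPU targets, are accepted silently.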
3968
3969void CGOpenMPRuntimeGPU::clear() {
3970
3971 if (!TeamsReductions.empty()) {
3972 ASTContext &C = CGM.getContext();
3973 RecordDecl *StaticRD = C.buildImplicitRecord(
3974 "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
3975 StaticRD->startDefinition();
3976 for (const RecordDecl *TeamReductionRec : TeamsReductions) {
3977 QualType RecTy = C.getRecordType(TeamReductionRec);
3978 auto *Field = FieldDecl::Create(
3979 C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
3980 C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
3981 /*BW=*/nullptr, /*Mutable=*/false,
3982 /*InitStyle=*/ICIS_NoInit);
3983 Field->setAccess(AS_public);
3984 StaticRD->addDecl(Field);
3985 }
3986 StaticRD->completeDefinition();
3987 QualType StaticTy = C.getRecordType(StaticRD);
3988 llvm::Type *LLVMReductionsBufferTy =
3989 CGM.getTypes().ConvertTypeForMem(StaticTy);
3990 // FIXME: nvlink does not handle weak linkage correctly (objects with
3991 // different sizes are reported as erroneous).
3992 // Restore CommonLinkage as soon as nvlink is fixed.
3993 auto *GV = new llvm::GlobalVariable(
3994 CGM.getModule(), LLVMReductionsBufferTy,
3995 /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
3996 llvm::Constant::getNullValue(LLVMReductionsBufferTy),
3997 "_openmp_teams_reductions_buffer_$_");
3998 KernelTeamsReductionPtr->setInitializer(
3999 llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
4000 CGM.VoidPtrTy));
4001 }
4002 CGOpenMPRuntime::clear();
4003}
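// Rough IR sketch of the result (the exact union layout depends on the
// collected reduction records):
//   @"_openmp_teams_reductions_buffer_$_" = internal global
//       %"union._openmp_teams_reduction_type_$_" zeroinitializer
// with KernelTeamsReductionPtr initialized to that buffer cast to
// CGM.VoidPtrTy.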
4004
4005llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
4006 CGBuilderTy &Bld = CGF.Builder;
4007 llvm::Module *M = &CGF.CGM.getModule();
4008 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
4009 llvm::Function *F = M->getFunction(LocSize);
4010 if (!F) {
4011 F = llvm::Function::Create(
4012 llvm::FunctionType::get(CGF.Int32Ty, llvm::None, false),
4013 llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
4014 }
4015 return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
4016}
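// Illustrative result (IR sketch): the call emitted here looks like
//   %nvptx_num_threads = call i32 @__kmpc_get_hardware_num_threads_in_block()
// with the runtime function declared externally if it is not already present
// in the module.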
4017
4018llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
4019 ArrayRef<llvm::Value *> Args{};
4020 return CGF.EmitRuntimeCall(
4021 OMPBuilder.getOrCreateRuntimeFunction(
4022 CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
4023 Args);
4024}
4025
4026llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
4027 ArrayRef<llvm::Value *> Args{};
4028 return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
4029 CGM.getModule(), OMPRTL___kmpc_get_warp_size),
4030 Args);
4031}
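// IR sketch for the two helpers above: they reduce to
//   call i32 @__kmpc_get_hardware_thread_id_in_block()
//   call i32 @__kmpc_get_warp_size()
// obtained through OMPBuilder's runtime-function table; the warp size is
// typically 32 on NVPTX and the wavefront size (often 64) on AMDGPU.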