Bug Summary

File: build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
Warning: line 549, column 29
The result of the right shift is undefined due to shifting by '33', which is greater or equal to the width of type 'unsigned int'
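
The '33' is consistent with the analyzer exploring a path on which the warp size is zero: llvm::Log2_32 returns (unsigned)-1 for a zero argument, so the subtraction 32u - LaneIDBits wraps around to 33, and shifting a 32-bit unsigned value by 33 is undefined behavior. The snippet below is only a minimal sketch of that arithmetic, not code from the report; the function name laneMaskFor and the zero warp size are assumptions made for illustration.

#include "llvm/Support/MathExtras.h"

// Hypothetical reduction of the mask computation flagged at line 549.
unsigned laneMaskFor(unsigned WarpSize) {
  unsigned LaneIDBits = llvm::Log2_32(WarpSize); // Log2_32(0) == (unsigned)-1
  return ~0u >> (32u - LaneIDBits);              // WarpSize == 0: 32u - LaneIDBits wraps to 33 -> undefined shift
}

// laneMaskFor(32) == 0x1f is well defined; laneMaskFor(0) reproduces the reported shift by 33.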

Annotated Source Code

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name CGOpenMPRuntimeGPU.cpp -analyzer-store=region -analyzer-opt-analyze-nested-blocks -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -relaxed-aliasing -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm -resource-dir /usr/lib/llvm-15/lib/clang/15.0.0 -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I tools/clang/lib/CodeGen -I /build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/clang/lib/CodeGen -I /build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/clang/include -I tools/clang/include -I include -I /build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-15/lib/clang/15.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm=build-llvm -fmacro-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/= -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm=build-llvm -fcoverage-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -std=c++14 -fdeprecated-macro -fdebug-compilation-dir=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/build-llvm=build-llvm -fdebug-prefix-map=/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/= -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-04-20-140412-16051-1 -x c++ 
/build/llvm-toolchain-snapshot-15~++20220420111733+e13d2efed663/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
1//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This provides a generalized class for OpenMP runtime code generation
10// specialized by GPU targets NVPTX and AMDGCN.
11//
12//===----------------------------------------------------------------------===//
13
14#include "CGOpenMPRuntimeGPU.h"
15#include "CodeGenFunction.h"
16#include "clang/AST/Attr.h"
17#include "clang/AST/DeclOpenMP.h"
18#include "clang/AST/StmtOpenMP.h"
19#include "clang/AST/StmtVisitor.h"
20#include "clang/Basic/Cuda.h"
21#include "llvm/ADT/SmallPtrSet.h"
22#include "llvm/Frontend/OpenMP/OMPGridValues.h"
23#include "llvm/Support/MathExtras.h"
24
25using namespace clang;
26using namespace CodeGen;
27using namespace llvm::omp;
28
29namespace {
30/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
31class NVPTXActionTy final : public PrePostActionTy {
32 llvm::FunctionCallee EnterCallee = nullptr;
33 ArrayRef<llvm::Value *> EnterArgs;
34 llvm::FunctionCallee ExitCallee = nullptr;
35 ArrayRef<llvm::Value *> ExitArgs;
36 bool Conditional = false;
37 llvm::BasicBlock *ContBlock = nullptr;
38
39public:
40 NVPTXActionTy(llvm::FunctionCallee EnterCallee,
41 ArrayRef<llvm::Value *> EnterArgs,
42 llvm::FunctionCallee ExitCallee,
43 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
44 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
45 ExitArgs(ExitArgs), Conditional(Conditional) {}
46 void Enter(CodeGenFunction &CGF) override {
47 llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
48 if (Conditional) {
49 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
50 auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
51 ContBlock = CGF.createBasicBlock("omp_if.end");
52 // Generate the branch (If-stmt)
53 CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
54 CGF.EmitBlock(ThenBlock);
55 }
56 }
57 void Done(CodeGenFunction &CGF) {
58 // Emit the rest of blocks/branches
59 CGF.EmitBranch(ContBlock);
60 CGF.EmitBlock(ContBlock, true);
61 }
62 void Exit(CodeGenFunction &CGF) override {
63 CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
64 }
65};
66
67/// A class to track the execution mode when codegening directives within
68/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
69/// to the target region and used by containing directives such as 'parallel'
70/// to emit optimized code.
71class ExecutionRuntimeModesRAII {
72private:
73 CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
74 CGOpenMPRuntimeGPU::EM_Unknown;
75 CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
76 bool SavedRuntimeMode = false;
77 bool *RuntimeMode = nullptr;
78
79public:
80 /// Constructor for Non-SPMD mode.
81 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode)
82 : ExecMode(ExecMode) {
83 SavedExecMode = ExecMode;
84 ExecMode = CGOpenMPRuntimeGPU::EM_NonSPMD;
85 }
86 /// Constructor for SPMD mode.
87 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
88 bool &RuntimeMode, bool FullRuntimeMode)
89 : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) {
90 SavedExecMode = ExecMode;
91 SavedRuntimeMode = RuntimeMode;
92 ExecMode = CGOpenMPRuntimeGPU::EM_SPMD;
93 RuntimeMode = FullRuntimeMode;
94 }
95 ~ExecutionRuntimeModesRAII() {
96 ExecMode = SavedExecMode;
97 if (RuntimeMode)
98 *RuntimeMode = SavedRuntimeMode;
99 }
100};
101
102/// GPU Configuration: This information can be derived from cuda registers,
103/// however, providing compile time constants helps generate more efficient
104/// code. For all practical purposes this is fine because the configuration
105/// is the same for all known NVPTX architectures.
106enum MachineConfiguration : unsigned {
107 /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
108 /// specific Grid Values like GV_Warp_Size, GV_Slot_Size
109
110 /// Global memory alignment for performance.
111 GlobalMemoryAlignment = 128,
112
113 /// Maximal size of the shared memory buffer.
114 SharedMemorySize = 128,
115};
116
117static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
118 RefExpr = RefExpr->IgnoreParens();
119 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
120 const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
121 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
122 Base = TempASE->getBase()->IgnoreParenImpCasts();
123 RefExpr = Base;
124 } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
125 const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
126 while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
127 Base = TempOASE->getBase()->IgnoreParenImpCasts();
128 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
129 Base = TempASE->getBase()->IgnoreParenImpCasts();
130 RefExpr = Base;
131 }
132 RefExpr = RefExpr->IgnoreParenImpCasts();
133 if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
134 return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
135 const auto *ME = cast<MemberExpr>(RefExpr);
136 return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
137}
138
139
140static RecordDecl *buildRecordForGlobalizedVars(
141 ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
142 ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
143 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
144 &MappedDeclsFields, int BufSize) {
145 using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
146 if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
147 return nullptr;
148 SmallVector<VarsDataTy, 4> GlobalizedVars;
149 for (const ValueDecl *D : EscapedDecls)
150 GlobalizedVars.emplace_back(
151 CharUnits::fromQuantity(std::max(
152 C.getDeclAlign(D).getQuantity(),
153 static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
154 D);
155 for (const ValueDecl *D : EscapedDeclsForTeams)
156 GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
157 llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
158 return L.first > R.first;
159 });
160
161 // Build struct _globalized_locals_ty {
 162 // /* globalized vars */[WarpSize] align (max(decl_align,
163 // GlobalMemoryAlignment))
164 // /* globalized vars */ for EscapedDeclsForTeams
165 // };
166 RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
167 GlobalizedRD->startDefinition();
168 llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
169 EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
170 for (const auto &Pair : GlobalizedVars) {
171 const ValueDecl *VD = Pair.second;
172 QualType Type = VD->getType();
173 if (Type->isLValueReferenceType())
174 Type = C.getPointerType(Type.getNonReferenceType());
175 else
176 Type = Type.getNonReferenceType();
177 SourceLocation Loc = VD->getLocation();
178 FieldDecl *Field;
179 if (SingleEscaped.count(VD)) {
180 Field = FieldDecl::Create(
181 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
182 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
183 /*BW=*/nullptr, /*Mutable=*/false,
184 /*InitStyle=*/ICIS_NoInit);
185 Field->setAccess(AS_public);
186 if (VD->hasAttrs()) {
187 for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
188 E(VD->getAttrs().end());
189 I != E; ++I)
190 Field->addAttr(*I);
191 }
192 } else {
193 llvm::APInt ArraySize(32, BufSize);
194 Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
195 0);
196 Field = FieldDecl::Create(
197 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
198 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
199 /*BW=*/nullptr, /*Mutable=*/false,
200 /*InitStyle=*/ICIS_NoInit);
201 Field->setAccess(AS_public);
202 llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
203 static_cast<CharUnits::QuantityType>(
204 GlobalMemoryAlignment)));
205 Field->addAttr(AlignedAttr::CreateImplicit(
206 C, /*IsAlignmentExpr=*/true,
207 IntegerLiteral::Create(C, Align,
208 C.getIntTypeForBitwidth(32, /*Signed=*/0),
209 SourceLocation()),
210 {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
211 }
212 GlobalizedRD->addDecl(Field);
213 MappedDeclsFields.try_emplace(VD, Field);
214 }
215 GlobalizedRD->completeDefinition();
216 return GlobalizedRD;
217}
218
219/// Get the list of variables that can escape their declaration context.
220class CheckVarsEscapingDeclContext final
221 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
222 CodeGenFunction &CGF;
223 llvm::SetVector<const ValueDecl *> EscapedDecls;
224 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
225 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
226 RecordDecl *GlobalizedRD = nullptr;
227 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
228 bool AllEscaped = false;
229 bool IsForCombinedParallelRegion = false;
230
231 void markAsEscaped(const ValueDecl *VD) {
232 // Do not globalize declare target variables.
233 if (!isa<VarDecl>(VD) ||
234 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
235 return;
236 VD = cast<ValueDecl>(VD->getCanonicalDecl());
237 // Use user-specified allocation.
238 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
239 return;
240 // Variables captured by value must be globalized.
241 if (auto *CSI = CGF.CapturedStmtInfo) {
242 if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
243 // Check if need to capture the variable that was already captured by
244 // value in the outer region.
245 if (!IsForCombinedParallelRegion) {
246 if (!FD->hasAttrs())
247 return;
248 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
249 if (!Attr)
250 return;
251 if (((Attr->getCaptureKind() != OMPC_map) &&
252 !isOpenMPPrivate(Attr->getCaptureKind())) ||
253 ((Attr->getCaptureKind() == OMPC_map) &&
254 !FD->getType()->isAnyPointerType()))
255 return;
256 }
257 if (!FD->getType()->isReferenceType()) {
 258 assert(!VD->getType()->isVariablyModifiedType() &&
 259        "Parameter captured by value with variably modified type");
260 EscapedParameters.insert(VD);
261 } else if (!IsForCombinedParallelRegion) {
262 return;
263 }
264 }
265 }
266 if ((!CGF.CapturedStmtInfo ||
267 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
268 VD->getType()->isReferenceType())
269 // Do not globalize variables with reference type.
270 return;
271 if (VD->getType()->isVariablyModifiedType())
272 EscapedVariableLengthDecls.insert(VD);
273 else
274 EscapedDecls.insert(VD);
275 }
276
277 void VisitValueDecl(const ValueDecl *VD) {
278 if (VD->getType()->isLValueReferenceType())
279 markAsEscaped(VD);
280 if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
281 if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
282 const bool SavedAllEscaped = AllEscaped;
283 AllEscaped = VD->getType()->isLValueReferenceType();
284 Visit(VarD->getInit());
285 AllEscaped = SavedAllEscaped;
286 }
287 }
288 }
289 void VisitOpenMPCapturedStmt(const CapturedStmt *S,
290 ArrayRef<OMPClause *> Clauses,
291 bool IsCombinedParallelRegion) {
292 if (!S)
293 return;
294 for (const CapturedStmt::Capture &C : S->captures()) {
295 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
296 const ValueDecl *VD = C.getCapturedVar();
297 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
298 if (IsCombinedParallelRegion) {
299 // Check if the variable is privatized in the combined construct and
300 // those private copies must be shared in the inner parallel
301 // directive.
302 IsForCombinedParallelRegion = false;
303 for (const OMPClause *C : Clauses) {
304 if (!isOpenMPPrivate(C->getClauseKind()) ||
305 C->getClauseKind() == OMPC_reduction ||
306 C->getClauseKind() == OMPC_linear ||
307 C->getClauseKind() == OMPC_private)
308 continue;
309 ArrayRef<const Expr *> Vars;
310 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
311 Vars = PC->getVarRefs();
312 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
313 Vars = PC->getVarRefs();
314 else
 315 llvm_unreachable("Unexpected clause.");
316 for (const auto *E : Vars) {
317 const Decl *D =
318 cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
319 if (D == VD->getCanonicalDecl()) {
320 IsForCombinedParallelRegion = true;
321 break;
322 }
323 }
324 if (IsForCombinedParallelRegion)
325 break;
326 }
327 }
328 markAsEscaped(VD);
329 if (isa<OMPCapturedExprDecl>(VD))
330 VisitValueDecl(VD);
331 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
332 }
333 }
334 }
335
336 void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
 337 assert(!GlobalizedRD &&
 338        "Record for globalized variables is built already.");
339 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
340 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
341 if (IsInTTDRegion)
342 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
343 else
344 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
345 GlobalizedRD = ::buildRecordForGlobalizedVars(
346 CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
347 MappedDeclsFields, WarpSize);
348 }
349
350public:
351 CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
352 ArrayRef<const ValueDecl *> TeamsReductions)
353 : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
354 }
355 virtual ~CheckVarsEscapingDeclContext() = default;
356 void VisitDeclStmt(const DeclStmt *S) {
357 if (!S)
358 return;
359 for (const Decl *D : S->decls())
360 if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
361 VisitValueDecl(VD);
362 }
363 void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
364 if (!D)
365 return;
366 if (!D->hasAssociatedStmt())
367 return;
368 if (const auto *S =
369 dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
370 // Do not analyze directives that do not actually require capturing,
371 // like `omp for` or `omp simd` directives.
372 llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
373 getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
374 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
375 VisitStmt(S->getCapturedStmt());
376 return;
377 }
378 VisitOpenMPCapturedStmt(
379 S, D->clauses(),
380 CaptureRegions.back() == OMPD_parallel &&
381 isOpenMPDistributeDirective(D->getDirectiveKind()));
382 }
383 }
384 void VisitCapturedStmt(const CapturedStmt *S) {
385 if (!S)
386 return;
387 for (const CapturedStmt::Capture &C : S->captures()) {
388 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
389 const ValueDecl *VD = C.getCapturedVar();
390 markAsEscaped(VD);
391 if (isa<OMPCapturedExprDecl>(VD))
392 VisitValueDecl(VD);
393 }
394 }
395 }
396 void VisitLambdaExpr(const LambdaExpr *E) {
397 if (!E)
398 return;
399 for (const LambdaCapture &C : E->captures()) {
400 if (C.capturesVariable()) {
401 if (C.getCaptureKind() == LCK_ByRef) {
402 const ValueDecl *VD = C.getCapturedVar();
403 markAsEscaped(VD);
404 if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
405 VisitValueDecl(VD);
406 }
407 }
408 }
409 }
410 void VisitBlockExpr(const BlockExpr *E) {
411 if (!E)
412 return;
413 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
414 if (C.isByRef()) {
415 const VarDecl *VD = C.getVariable();
416 markAsEscaped(VD);
417 if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
418 VisitValueDecl(VD);
419 }
420 }
421 }
422 void VisitCallExpr(const CallExpr *E) {
423 if (!E)
424 return;
425 for (const Expr *Arg : E->arguments()) {
426 if (!Arg)
427 continue;
428 if (Arg->isLValue()) {
429 const bool SavedAllEscaped = AllEscaped;
430 AllEscaped = true;
431 Visit(Arg);
432 AllEscaped = SavedAllEscaped;
433 } else {
434 Visit(Arg);
435 }
436 }
437 Visit(E->getCallee());
438 }
439 void VisitDeclRefExpr(const DeclRefExpr *E) {
440 if (!E)
441 return;
442 const ValueDecl *VD = E->getDecl();
443 if (AllEscaped)
444 markAsEscaped(VD);
445 if (isa<OMPCapturedExprDecl>(VD))
446 VisitValueDecl(VD);
447 else if (const auto *VarD = dyn_cast<VarDecl>(VD))
448 if (VarD->isInitCapture())
449 VisitValueDecl(VD);
450 }
451 void VisitUnaryOperator(const UnaryOperator *E) {
452 if (!E)
453 return;
454 if (E->getOpcode() == UO_AddrOf) {
455 const bool SavedAllEscaped = AllEscaped;
456 AllEscaped = true;
457 Visit(E->getSubExpr());
458 AllEscaped = SavedAllEscaped;
459 } else {
460 Visit(E->getSubExpr());
461 }
462 }
463 void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
464 if (!E)
465 return;
466 if (E->getCastKind() == CK_ArrayToPointerDecay) {
467 const bool SavedAllEscaped = AllEscaped;
468 AllEscaped = true;
469 Visit(E->getSubExpr());
470 AllEscaped = SavedAllEscaped;
471 } else {
472 Visit(E->getSubExpr());
473 }
474 }
475 void VisitExpr(const Expr *E) {
476 if (!E)
477 return;
478 bool SavedAllEscaped = AllEscaped;
479 if (!E->isLValue())
480 AllEscaped = false;
481 for (const Stmt *Child : E->children())
482 if (Child)
483 Visit(Child);
484 AllEscaped = SavedAllEscaped;
485 }
486 void VisitStmt(const Stmt *S) {
487 if (!S)
488 return;
489 for (const Stmt *Child : S->children())
490 if (Child)
491 Visit(Child);
492 }
493
494 /// Returns the record that handles all the escaped local variables and used
495 /// instead of their original storage.
496 const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
497 if (!GlobalizedRD)
498 buildRecordForGlobalizedVars(IsInTTDRegion);
499 return GlobalizedRD;
500 }
501
502 /// Returns the field in the globalized record for the escaped variable.
503 const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
 504 assert(GlobalizedRD &&
 505        "Record for globalized variables must be generated already.");
506 auto I = MappedDeclsFields.find(VD);
507 if (I == MappedDeclsFields.end())
508 return nullptr;
509 return I->getSecond();
510 }
511
512 /// Returns the list of the escaped local variables/parameters.
513 ArrayRef<const ValueDecl *> getEscapedDecls() const {
514 return EscapedDecls.getArrayRef();
515 }
516
517 /// Checks if the escaped local variable is actually a parameter passed by
518 /// value.
519 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
520 return EscapedParameters;
521 }
522
523 /// Returns the list of the escaped variables with the variably modified
524 /// types.
525 ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
526 return EscapedVariableLengthDecls.getArrayRef();
527 }
528};
529} // anonymous namespace
530
531/// Get the id of the warp in the block.
532/// We assume that the warp size is 32, which is always the case
533/// on the NVPTX device, to generate more efficient code.
534static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
535 CGBuilderTy &Bld = CGF.Builder;
536 unsigned LaneIDBits =
537 llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
538 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
539 return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
540}
541
542/// Get the id of the current lane in the Warp.
543/// We assume that the warp size is 32, which is always the case
544/// on the NVPTX device, to generate more efficient code.
545static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
546 CGBuilderTy &Bld = CGF.Builder;
547 unsigned LaneIDBits =
548 llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
549 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
Step 14: The result of the right shift is undefined due to shifting by '33', which is greater or equal to the width of type 'unsigned int'. (A sketch of a well-defined alternative follows this function.)
550 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
551 return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
552 "nvptx_lane_id");
553}
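One way to keep the computation well defined, shown here only as a hedged sketch and not as the upstream fix, is to build the mask from LaneIDBits directly and avoid the 32u - LaneIDBits subtraction; it assumes GV_Warp_Size is a nonzero power of two (for example 32 or 64):

// Sketch only: same context as getNVPTXLaneID above.
unsigned LaneIDBits =
    llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
assert(LaneIDBits > 0 && LaneIDBits < 32 && "unexpected warp size");
unsigned LaneIDMask = (1u << LaneIDBits) - 1u; // 0x1f for a 32-lane warp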
554
555CGOpenMPRuntimeGPU::ExecutionMode
556CGOpenMPRuntimeGPU::getExecutionMode() const {
557 return CurrentExecutionMode;
558}
559
560static CGOpenMPRuntimeGPU::DataSharingMode
561getDataSharingMode(CodeGenModule &CGM) {
562 return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
563 : CGOpenMPRuntimeGPU::Generic;
564}
565
566/// Check for inner (nested) SPMD construct, if any
567static bool hasNestedSPMDDirective(ASTContext &Ctx,
568 const OMPExecutableDirective &D) {
569 const auto *CS = D.getInnermostCapturedStmt();
570 const auto *Body =
571 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
572 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
573
574 if (const auto *NestedDir =
575 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
576 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
577 switch (D.getDirectiveKind()) {
578 case OMPD_target:
579 if (isOpenMPParallelDirective(DKind))
580 return true;
581 if (DKind == OMPD_teams) {
582 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
583 /*IgnoreCaptured=*/true);
584 if (!Body)
585 return false;
586 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
587 if (const auto *NND =
588 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
589 DKind = NND->getDirectiveKind();
590 if (isOpenMPParallelDirective(DKind))
591 return true;
592 }
593 }
594 return false;
595 case OMPD_target_teams:
596 return isOpenMPParallelDirective(DKind);
597 case OMPD_target_simd:
598 case OMPD_target_parallel:
599 case OMPD_target_parallel_for:
600 case OMPD_target_parallel_for_simd:
601 case OMPD_target_teams_distribute:
602 case OMPD_target_teams_distribute_simd:
603 case OMPD_target_teams_distribute_parallel_for:
604 case OMPD_target_teams_distribute_parallel_for_simd:
605 case OMPD_parallel:
606 case OMPD_for:
607 case OMPD_parallel_for:
608 case OMPD_parallel_master:
609 case OMPD_parallel_sections:
610 case OMPD_for_simd:
611 case OMPD_parallel_for_simd:
612 case OMPD_cancel:
613 case OMPD_cancellation_point:
614 case OMPD_ordered:
615 case OMPD_threadprivate:
616 case OMPD_allocate:
617 case OMPD_task:
618 case OMPD_simd:
619 case OMPD_sections:
620 case OMPD_section:
621 case OMPD_single:
622 case OMPD_master:
623 case OMPD_critical:
624 case OMPD_taskyield:
625 case OMPD_barrier:
626 case OMPD_taskwait:
627 case OMPD_taskgroup:
628 case OMPD_atomic:
629 case OMPD_flush:
630 case OMPD_depobj:
631 case OMPD_scan:
632 case OMPD_teams:
633 case OMPD_target_data:
634 case OMPD_target_exit_data:
635 case OMPD_target_enter_data:
636 case OMPD_distribute:
637 case OMPD_distribute_simd:
638 case OMPD_distribute_parallel_for:
639 case OMPD_distribute_parallel_for_simd:
640 case OMPD_teams_distribute:
641 case OMPD_teams_distribute_simd:
642 case OMPD_teams_distribute_parallel_for:
643 case OMPD_teams_distribute_parallel_for_simd:
644 case OMPD_target_update:
645 case OMPD_declare_simd:
646 case OMPD_declare_variant:
647 case OMPD_begin_declare_variant:
648 case OMPD_end_declare_variant:
649 case OMPD_declare_target:
650 case OMPD_end_declare_target:
651 case OMPD_declare_reduction:
652 case OMPD_declare_mapper:
653 case OMPD_taskloop:
654 case OMPD_taskloop_simd:
655 case OMPD_master_taskloop:
656 case OMPD_master_taskloop_simd:
657 case OMPD_parallel_master_taskloop:
658 case OMPD_parallel_master_taskloop_simd:
659 case OMPD_requires:
660 case OMPD_unknown:
661 default:
 662 llvm_unreachable("Unexpected directive.");
663 }
664 }
665
666 return false;
667}
668
669static bool supportsSPMDExecutionMode(ASTContext &Ctx,
670 const OMPExecutableDirective &D) {
671 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
672 switch (DirectiveKind) {
673 case OMPD_target:
674 case OMPD_target_teams:
675 return hasNestedSPMDDirective(Ctx, D);
676 case OMPD_target_parallel:
677 case OMPD_target_parallel_for:
678 case OMPD_target_parallel_for_simd:
679 case OMPD_target_teams_distribute_parallel_for:
680 case OMPD_target_teams_distribute_parallel_for_simd:
681 case OMPD_target_simd:
682 case OMPD_target_teams_distribute_simd:
683 return true;
684 case OMPD_target_teams_distribute:
685 return false;
686 case OMPD_parallel:
687 case OMPD_for:
688 case OMPD_parallel_for:
689 case OMPD_parallel_master:
690 case OMPD_parallel_sections:
691 case OMPD_for_simd:
692 case OMPD_parallel_for_simd:
693 case OMPD_cancel:
694 case OMPD_cancellation_point:
695 case OMPD_ordered:
696 case OMPD_threadprivate:
697 case OMPD_allocate:
698 case OMPD_task:
699 case OMPD_simd:
700 case OMPD_sections:
701 case OMPD_section:
702 case OMPD_single:
703 case OMPD_master:
704 case OMPD_critical:
705 case OMPD_taskyield:
706 case OMPD_barrier:
707 case OMPD_taskwait:
708 case OMPD_taskgroup:
709 case OMPD_atomic:
710 case OMPD_flush:
711 case OMPD_depobj:
712 case OMPD_scan:
713 case OMPD_teams:
714 case OMPD_target_data:
715 case OMPD_target_exit_data:
716 case OMPD_target_enter_data:
717 case OMPD_distribute:
718 case OMPD_distribute_simd:
719 case OMPD_distribute_parallel_for:
720 case OMPD_distribute_parallel_for_simd:
721 case OMPD_teams_distribute:
722 case OMPD_teams_distribute_simd:
723 case OMPD_teams_distribute_parallel_for:
724 case OMPD_teams_distribute_parallel_for_simd:
725 case OMPD_target_update:
726 case OMPD_declare_simd:
727 case OMPD_declare_variant:
728 case OMPD_begin_declare_variant:
729 case OMPD_end_declare_variant:
730 case OMPD_declare_target:
731 case OMPD_end_declare_target:
732 case OMPD_declare_reduction:
733 case OMPD_declare_mapper:
734 case OMPD_taskloop:
735 case OMPD_taskloop_simd:
736 case OMPD_master_taskloop:
737 case OMPD_master_taskloop_simd:
738 case OMPD_parallel_master_taskloop:
739 case OMPD_parallel_master_taskloop_simd:
740 case OMPD_requires:
741 case OMPD_unknown:
742 default:
743 break;
744 }
 745 llvm_unreachable(
 746     "Unknown programming model for OpenMP directive on NVPTX target.");
747}
748
749/// Check if the directive is loops based and has schedule clause at all or has
750/// static scheduling.
751static bool hasStaticScheduling(const OMPExecutableDirective &D) {
 752 assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
 753        isOpenMPLoopDirective(D.getDirectiveKind()) &&
 754        "Expected loop-based directive.");
755 return !D.hasClausesOfKind<OMPOrderedClause>() &&
756 (!D.hasClausesOfKind<OMPScheduleClause>() ||
757 llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
758 [](const OMPScheduleClause *C) {
759 return C->getScheduleKind() == OMPC_SCHEDULE_static;
760 }));
761}
762
763/// Check for inner (nested) lightweight runtime construct, if any
764static bool hasNestedLightweightDirective(ASTContext &Ctx,
765 const OMPExecutableDirective &D) {
 766 assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
767 const auto *CS = D.getInnermostCapturedStmt();
768 const auto *Body =
769 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
770 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
771
772 if (const auto *NestedDir =
773 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
774 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
775 switch (D.getDirectiveKind()) {
776 case OMPD_target:
777 if (isOpenMPParallelDirective(DKind) &&
778 isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
779 hasStaticScheduling(*NestedDir))
780 return true;
781 if (DKind == OMPD_teams_distribute_simd || DKind == OMPD_simd)
782 return true;
783 if (DKind == OMPD_parallel) {
784 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
785 /*IgnoreCaptured=*/true);
786 if (!Body)
787 return false;
788 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
789 if (const auto *NND =
790 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
791 DKind = NND->getDirectiveKind();
792 if (isOpenMPWorksharingDirective(DKind) &&
793 isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
794 return true;
795 }
796 } else if (DKind == OMPD_teams) {
797 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
798 /*IgnoreCaptured=*/true);
799 if (!Body)
800 return false;
801 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
802 if (const auto *NND =
803 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
804 DKind = NND->getDirectiveKind();
805 if (isOpenMPParallelDirective(DKind) &&
806 isOpenMPWorksharingDirective(DKind) &&
807 isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
808 return true;
809 if (DKind == OMPD_parallel) {
810 Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
811 /*IgnoreCaptured=*/true);
812 if (!Body)
813 return false;
814 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
815 if (const auto *NND =
816 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
817 DKind = NND->getDirectiveKind();
818 if (isOpenMPWorksharingDirective(DKind) &&
819 isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
820 return true;
821 }
822 }
823 }
824 }
825 return false;
826 case OMPD_target_teams:
827 if (isOpenMPParallelDirective(DKind) &&
828 isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
829 hasStaticScheduling(*NestedDir))
830 return true;
831 if (DKind == OMPD_distribute_simd || DKind == OMPD_simd)
832 return true;
833 if (DKind == OMPD_parallel) {
834 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
835 /*IgnoreCaptured=*/true);
836 if (!Body)
837 return false;
838 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
839 if (const auto *NND =
840 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
841 DKind = NND->getDirectiveKind();
842 if (isOpenMPWorksharingDirective(DKind) &&
843 isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
844 return true;
845 }
846 }
847 return false;
848 case OMPD_target_parallel:
849 if (DKind == OMPD_simd)
850 return true;
851 return isOpenMPWorksharingDirective(DKind) &&
852 isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
853 case OMPD_target_teams_distribute:
854 case OMPD_target_simd:
855 case OMPD_target_parallel_for:
856 case OMPD_target_parallel_for_simd:
857 case OMPD_target_teams_distribute_simd:
858 case OMPD_target_teams_distribute_parallel_for:
859 case OMPD_target_teams_distribute_parallel_for_simd:
860 case OMPD_parallel:
861 case OMPD_for:
862 case OMPD_parallel_for:
863 case OMPD_parallel_master:
864 case OMPD_parallel_sections:
865 case OMPD_for_simd:
866 case OMPD_parallel_for_simd:
867 case OMPD_cancel:
868 case OMPD_cancellation_point:
869 case OMPD_ordered:
870 case OMPD_threadprivate:
871 case OMPD_allocate:
872 case OMPD_task:
873 case OMPD_simd:
874 case OMPD_sections:
875 case OMPD_section:
876 case OMPD_single:
877 case OMPD_master:
878 case OMPD_critical:
879 case OMPD_taskyield:
880 case OMPD_barrier:
881 case OMPD_taskwait:
882 case OMPD_taskgroup:
883 case OMPD_atomic:
884 case OMPD_flush:
885 case OMPD_depobj:
886 case OMPD_scan:
887 case OMPD_teams:
888 case OMPD_target_data:
889 case OMPD_target_exit_data:
890 case OMPD_target_enter_data:
891 case OMPD_distribute:
892 case OMPD_distribute_simd:
893 case OMPD_distribute_parallel_for:
894 case OMPD_distribute_parallel_for_simd:
895 case OMPD_teams_distribute:
896 case OMPD_teams_distribute_simd:
897 case OMPD_teams_distribute_parallel_for:
898 case OMPD_teams_distribute_parallel_for_simd:
899 case OMPD_target_update:
900 case OMPD_declare_simd:
901 case OMPD_declare_variant:
902 case OMPD_begin_declare_variant:
903 case OMPD_end_declare_variant:
904 case OMPD_declare_target:
905 case OMPD_end_declare_target:
906 case OMPD_declare_reduction:
907 case OMPD_declare_mapper:
908 case OMPD_taskloop:
909 case OMPD_taskloop_simd:
910 case OMPD_master_taskloop:
911 case OMPD_master_taskloop_simd:
912 case OMPD_parallel_master_taskloop:
913 case OMPD_parallel_master_taskloop_simd:
914 case OMPD_requires:
915 case OMPD_unknown:
916 default:
 917 llvm_unreachable("Unexpected directive.");
918 }
919 }
920
921 return false;
922}
923
924/// Checks if the construct supports lightweight runtime. It must be SPMD
925/// construct + inner loop-based construct with static scheduling.
926static bool supportsLightweightRuntime(ASTContext &Ctx,
927 const OMPExecutableDirective &D) {
928 if (!supportsSPMDExecutionMode(Ctx, D))
929 return false;
930 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
931 switch (DirectiveKind) {
932 case OMPD_target:
933 case OMPD_target_teams:
934 case OMPD_target_parallel:
935 return hasNestedLightweightDirective(Ctx, D);
936 case OMPD_target_parallel_for:
937 case OMPD_target_parallel_for_simd:
938 case OMPD_target_teams_distribute_parallel_for:
939 case OMPD_target_teams_distribute_parallel_for_simd:
940 // (Last|First)-privates must be shared in parallel region.
941 return hasStaticScheduling(D);
942 case OMPD_target_simd:
943 case OMPD_target_teams_distribute_simd:
944 return true;
945 case OMPD_target_teams_distribute:
946 return false;
947 case OMPD_parallel:
948 case OMPD_for:
949 case OMPD_parallel_for:
950 case OMPD_parallel_master:
951 case OMPD_parallel_sections:
952 case OMPD_for_simd:
953 case OMPD_parallel_for_simd:
954 case OMPD_cancel:
955 case OMPD_cancellation_point:
956 case OMPD_ordered:
957 case OMPD_threadprivate:
958 case OMPD_allocate:
959 case OMPD_task:
960 case OMPD_simd:
961 case OMPD_sections:
962 case OMPD_section:
963 case OMPD_single:
964 case OMPD_master:
965 case OMPD_critical:
966 case OMPD_taskyield:
967 case OMPD_barrier:
968 case OMPD_taskwait:
969 case OMPD_taskgroup:
970 case OMPD_atomic:
971 case OMPD_flush:
972 case OMPD_depobj:
973 case OMPD_scan:
974 case OMPD_teams:
975 case OMPD_target_data:
976 case OMPD_target_exit_data:
977 case OMPD_target_enter_data:
978 case OMPD_distribute:
979 case OMPD_distribute_simd:
980 case OMPD_distribute_parallel_for:
981 case OMPD_distribute_parallel_for_simd:
982 case OMPD_teams_distribute:
983 case OMPD_teams_distribute_simd:
984 case OMPD_teams_distribute_parallel_for:
985 case OMPD_teams_distribute_parallel_for_simd:
986 case OMPD_target_update:
987 case OMPD_declare_simd:
988 case OMPD_declare_variant:
989 case OMPD_begin_declare_variant:
990 case OMPD_end_declare_variant:
991 case OMPD_declare_target:
992 case OMPD_end_declare_target:
993 case OMPD_declare_reduction:
994 case OMPD_declare_mapper:
995 case OMPD_taskloop:
996 case OMPD_taskloop_simd:
997 case OMPD_master_taskloop:
998 case OMPD_master_taskloop_simd:
999 case OMPD_parallel_master_taskloop:
1000 case OMPD_parallel_master_taskloop_simd:
1001 case OMPD_requires:
1002 case OMPD_unknown:
1003 default:
1004 break;
1005 }
 1006 llvm_unreachable(
 1007     "Unknown programming model for OpenMP directive on NVPTX target.");
1008}
1009
1010void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
1011 StringRef ParentName,
1012 llvm::Function *&OutlinedFn,
1013 llvm::Constant *&OutlinedFnID,
1014 bool IsOffloadEntry,
1015 const RegionCodeGenTy &CodeGen) {
1016 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode);
1017 EntryFunctionState EST;
1018 WrapperFunctionsMap.clear();
1019
1020 // Emit target region as a standalone region.
1021 class NVPTXPrePostActionTy : public PrePostActionTy {
1022 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
1023
1024 public:
1025 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
1026 : EST(EST) {}
1027 void Enter(CodeGenFunction &CGF) override {
1028 auto &RT =
1029 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1030 RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
1031 // Skip target region initialization.
1032 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1033 }
1034 void Exit(CodeGenFunction &CGF) override {
1035 auto &RT =
1036 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1037 RT.clearLocThreadIdInsertPt(CGF);
1038 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
1039 }
1040 } Action(EST);
1041 CodeGen.setAction(Action);
1042 IsInTTDRegion = true;
1043 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1044 IsOffloadEntry, CodeGen);
1045 IsInTTDRegion = false;
1046}
1047
1048void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF,
1049 EntryFunctionState &EST, bool IsSPMD) {
1050 CGBuilderTy &Bld = CGF.Builder;
1051 Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD, requiresFullRuntime()));
1052 IsInTargetMasterThreadRegion = IsSPMD;
1053 if (!IsSPMD)
1054 emitGenericVarsProlog(CGF, EST.Loc);
1055}
1056
1057void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
1058 EntryFunctionState &EST,
1059 bool IsSPMD) {
1060 if (!IsSPMD)
1061 emitGenericVarsEpilog(CGF);
1062
1063 CGBuilderTy &Bld = CGF.Builder;
1064 OMPBuilder.createTargetDeinit(Bld, IsSPMD, requiresFullRuntime());
1065}
1066
1067void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
1068 StringRef ParentName,
1069 llvm::Function *&OutlinedFn,
1070 llvm::Constant *&OutlinedFnID,
1071 bool IsOffloadEntry,
1072 const RegionCodeGenTy &CodeGen) {
1073 ExecutionRuntimeModesRAII ModeRAII(
1074 CurrentExecutionMode, RequiresFullRuntime,
1075 CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
1076 !supportsLightweightRuntime(CGM.getContext(), D));
1077 EntryFunctionState EST;
1078
1079 // Emit target region as a standalone region.
1080 class NVPTXPrePostActionTy : public PrePostActionTy {
1081 CGOpenMPRuntimeGPU &RT;
1082 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
1083
1084 public:
1085 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
1086 CGOpenMPRuntimeGPU::EntryFunctionState &EST)
1087 : RT(RT), EST(EST) {}
1088 void Enter(CodeGenFunction &CGF) override {
1089 RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
1090 // Skip target region initialization.
1091 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
1092 }
1093 void Exit(CodeGenFunction &CGF) override {
1094 RT.clearLocThreadIdInsertPt(CGF);
1095 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
1096 }
1097 } Action(*this, EST);
1098 CodeGen.setAction(Action);
1099 IsInTTDRegion = true;
1100 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1101 IsOffloadEntry, CodeGen);
1102 IsInTTDRegion = false;
1103}
1104
1105// Create a unique global variable to indicate the execution mode of this target
1106// region. The execution mode is either 'generic', or 'spmd' depending on the
1107// target directive. This variable is picked up by the offload library to setup
1108// the device appropriately before kernel launch. If the execution mode is
1109// 'generic', the runtime reserves one warp for the master, otherwise, all
1110// warps participate in parallel work.
1111static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
1112 bool Mode) {
1113 auto *GVMode = new llvm::GlobalVariable(
1114 CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1115 llvm::GlobalValue::WeakAnyLinkage,
1116 llvm::ConstantInt::get(CGM.Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD
1117 : OMP_TGT_EXEC_MODE_GENERIC),
1118 Twine(Name, "_exec_mode"));
1119 CGM.addCompilerUsedGlobal(GVMode);
1120}
1121
1122void CGOpenMPRuntimeGPU::createOffloadEntry(llvm::Constant *ID,
1123 llvm::Constant *Addr,
1124 uint64_t Size, int32_t,
1125 llvm::GlobalValue::LinkageTypes) {
1126 // TODO: Add support for global variables on the device after declare target
1127 // support.
1128 llvm::Function *Fn = dyn_cast<llvm::Function>(Addr);
1129 if (!Fn)
1130 return;
1131
1132 llvm::Module &M = CGM.getModule();
1133 llvm::LLVMContext &Ctx = CGM.getLLVMContext();
1134
1135 // Get "nvvm.annotations" metadata node.
1136 llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
1137
1138 llvm::Metadata *MDVals[] = {
1139 llvm::ConstantAsMetadata::get(Fn), llvm::MDString::get(Ctx, "kernel"),
1140 llvm::ConstantAsMetadata::get(
1141 llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1142 // Append metadata to nvvm.annotations.
1143 MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1144
1145 // Add a function attribute for the kernel.
1146 Fn->addFnAttr(llvm::Attribute::get(Ctx, "kernel"));
1147}
1148
1149void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
1150 const OMPExecutableDirective &D, StringRef ParentName,
1151 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
1152 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
1153 if (!IsOffloadEntry) // Nothing to do.
1154 return;
1155
 1156 assert(!ParentName.empty() && "Invalid target region parent name!");
1157
1158 bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
1159 if (Mode)
1160 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1161 CodeGen);
1162 else
1163 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1164 CodeGen);
1165
1166 setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
1167}
1168
1169namespace {
 1170LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
 1171/// Enum for accessing the reserved_2 field of the ident_t struct.
1172enum ModeFlagsTy : unsigned {
1173 /// Bit set to 1 when in SPMD mode.
1174 KMP_IDENT_SPMD_MODE = 0x01,
1175 /// Bit set to 1 when a simplified runtime is used.
1176 KMP_IDENT_SIMPLE_RT_MODE = 0x02,
 1177 LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/KMP_IDENT_SIMPLE_RT_MODE)
1178};
1179
1180/// Special mode Undefined. Is the combination of Non-SPMD mode + SimpleRuntime.
1181static const ModeFlagsTy UndefinedMode =
1182 (~KMP_IDENT_SPMD_MODE) & KMP_IDENT_SIMPLE_RT_MODE;
1183} // anonymous namespace
1184
1185unsigned CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags() const {
1186 switch (getExecutionMode()) {
1187 case EM_SPMD:
1188 if (requiresFullRuntime())
1189 return KMP_IDENT_SPMD_MODE & (~KMP_IDENT_SIMPLE_RT_MODE);
1190 return KMP_IDENT_SPMD_MODE | KMP_IDENT_SIMPLE_RT_MODE;
1191 case EM_NonSPMD:
 1192 assert(requiresFullRuntime() && "Expected full runtime.");
1193 return (~KMP_IDENT_SPMD_MODE) & (~KMP_IDENT_SIMPLE_RT_MODE);
1194 case EM_Unknown:
1195 return UndefinedMode;
1196 }
 1197 llvm_unreachable("Unknown flags are requested.");
1198}
1199
1200CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
1201 : CGOpenMPRuntime(CGM, "_", "$") {
1202 if (!CGM.getLangOpts().OpenMPIsDevice)
 1203 llvm_unreachable("OpenMP can only handle device code.");
1204
1205 llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
1206 if (!CGM.getLangOpts().OMPHostIRFile.empty()) {
1207 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
1208 "__omp_rtl_debug_kind");
1209 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
1210 "__omp_rtl_assume_teams_oversubscription");
1211 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
1212 "__omp_rtl_assume_threads_oversubscription");
1213 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
1214 "__omp_rtl_assume_no_thread_state");
1215 }
1216}
1217
1218void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
1219 ProcBindKind ProcBind,
1220 SourceLocation Loc) {
1221 // Do nothing in case of SPMD mode and L0 parallel.
1222 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
1223 return;
1224
1225 CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1226}
1227
1228void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
1229 llvm::Value *NumThreads,
1230 SourceLocation Loc) {
1231 // Nothing to do.
1232}
1233
1234void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
1235 const Expr *NumTeams,
1236 const Expr *ThreadLimit,
1237 SourceLocation Loc) {}
1238
1239llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
1240 const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1241 OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1242 // Emit target region as a standalone region.
1243 class NVPTXPrePostActionTy : public PrePostActionTy {
1244 bool &IsInParallelRegion;
1245 bool PrevIsInParallelRegion;
1246
1247 public:
1248 NVPTXPrePostActionTy(bool &IsInParallelRegion)
1249 : IsInParallelRegion(IsInParallelRegion) {}
1250 void Enter(CodeGenFunction &CGF) override {
1251 PrevIsInParallelRegion = IsInParallelRegion;
1252 IsInParallelRegion = true;
1253 }
1254 void Exit(CodeGenFunction &CGF) override {
1255 IsInParallelRegion = PrevIsInParallelRegion;
1256 }
1257 } Action(IsInParallelRegion);
1258 CodeGen.setAction(Action);
1259 bool PrevIsInTTDRegion = IsInTTDRegion;
1260 IsInTTDRegion = false;
1261 bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1262 IsInTargetMasterThreadRegion = false;
1263 auto *OutlinedFun =
1264 cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1265 D, ThreadIDVar, InnermostKind, CodeGen));
1266 IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
1267 IsInTTDRegion = PrevIsInTTDRegion;
1268 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD &&
1269 !IsInParallelRegion) {
1270 llvm::Function *WrapperFun =
1271 createParallelDataSharingWrapper(OutlinedFun, D);
1272 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
1273 }
1274
1275 return OutlinedFun;
1276}
1277
1278/// Get list of lastprivate variables from the teams distribute ... or
1279/// teams {distribute ...} directives.
1280static void
1281getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1282 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
 1283 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
 1284        "expected teams directive.");
1285 const OMPExecutableDirective *Dir = &D;
1286 if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
1287 if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
1288 Ctx,
1289 D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
1290 /*IgnoreCaptured=*/true))) {
1291 Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
1292 if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
1293 Dir = nullptr;
1294 }
1295 }
1296 if (!Dir)
1297 return;
1298 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
1299 for (const Expr *E : C->getVarRefs())
1300 Vars.push_back(getPrivateItem(E));
1301 }
1302}
1303
1304/// Get list of reduction variables from the teams ... directives.
1305static void
1306getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
1307 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
 1308 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
 1309        "expected teams directive.");
1310 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
1311 for (const Expr *E : C->privates())
1312 Vars.push_back(getPrivateItem(E));
1313 }
1314}
1315
1316llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
1317 const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1318 OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
1319 SourceLocation Loc = D.getBeginLoc();
1320
1321 const RecordDecl *GlobalizedRD = nullptr;
1322 llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
1323 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
1324 unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
1325 // Globalize team reductions variable unconditionally in all modes.
1326 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1327 getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
1328 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
1329 getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
1330 if (!LastPrivatesReductions.empty()) {
1331 GlobalizedRD = ::buildRecordForGlobalizedVars(
1332 CGM.getContext(), llvm::None, LastPrivatesReductions,
1333 MappedDeclsFields, WarpSize);
1334 }
1335 } else if (!LastPrivatesReductions.empty()) {
1336 assert(!TeamAndReductions.first &&
1337 "Previous team declaration is not expected.");
1338 TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
1339 std::swap(TeamAndReductions.second, LastPrivatesReductions);
1340 }
1341
1342 // Emit target region as a standalone region.
1343 class NVPTXPrePostActionTy : public PrePostActionTy {
1344 SourceLocation &Loc;
1345 const RecordDecl *GlobalizedRD;
1346 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1347 &MappedDeclsFields;
1348
1349 public:
1350 NVPTXPrePostActionTy(
1351 SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1352 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1353 &MappedDeclsFields)
1354 : Loc(Loc), GlobalizedRD(GlobalizedRD),
1355 MappedDeclsFields(MappedDeclsFields) {}
1356 void Enter(CodeGenFunction &CGF) override {
1357 auto &Rt =
1358 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1359 if (GlobalizedRD) {
1360 auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1361 I->getSecond().MappedParams =
1362 std::make_unique<CodeGenFunction::OMPMapVars>();
1363 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1364 for (const auto &Pair : MappedDeclsFields) {
1365 assert(Pair.getFirst()->isCanonicalDecl() &&
1366 "Expected canonical declaration");
1367 Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
1368 }
1369 }
1370 Rt.emitGenericVarsProlog(CGF, Loc);
1371 }
1372 void Exit(CodeGenFunction &CGF) override {
1373 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1374 .emitGenericVarsEpilog(CGF);
1375 }
1376 } Action(Loc, GlobalizedRD, MappedDeclsFields);
1377 CodeGen.setAction(Action);
1378 llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1379 D, ThreadIDVar, InnermostKind, CodeGen);
1380
1381 return OutlinedFun;
1382}
1383
1384void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1385 SourceLocation Loc,
1386 bool WithSPMDCheck) {
1387 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
1388 getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1389 return;
1390
1391 CGBuilderTy &Bld = CGF.Builder;
1392
1393 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1394 if (I == FunctionGlobalizedDecls.end())
1395 return;
1396
1397 for (auto &Rec : I->getSecond().LocalVarData) {
1398 const auto *VD = cast<VarDecl>(Rec.first);
1399 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1400 QualType VarTy = VD->getType();
1401
1402 // Get the local allocation of a firstprivate variable before sharing
1403 llvm::Value *ParValue;
1404 if (EscapedParam) {
1405 LValue ParLVal =
1406 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1407 ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1408 }
1409
1410 // Allocate space for the variable to be globalized
1411 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1412 llvm::CallBase *VoidPtr =
1413 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1414 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1415 AllocArgs, VD->getName());
1416 // FIXME: We should use the variables actual alignment as an argument.
1417 VoidPtr->addRetAttr(llvm::Attribute::get(
1418 CGM.getLLVMContext(), llvm::Attribute::Alignment,
1419 CGM.getContext().getTargetInfo().getNewAlign() / 8));
1420
1421 // Cast the void pointer and get the address of the globalized variable.
1422 llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
1423 llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1424 VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
1425 LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
1426 Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1427 Rec.second.GlobalizedVal = VoidPtr;
1428
1429 // Assign the local allocation to the newly globalized location.
1430 if (EscapedParam) {
1431 CGF.EmitStoreOfScalar(ParValue, VarAddr);
1432 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
1433 }
1434 if (auto *DI = CGF.getDebugInfo())
1435 VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
1436 }
1437 for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
1438 // Use actual memory size of the VLA object including the padding
1439 // for alignment purposes.
1440 llvm::Value *Size = CGF.getTypeSize(VD->getType());
1441 CharUnits Align = CGM.getContext().getDeclAlign(VD);
1442 Size = Bld.CreateNUWAdd(
1443 Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1444 llvm::Value *AlignVal =
1445 llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1446
1447 Size = Bld.CreateUDiv(Size, AlignVal);
1448 Size = Bld.CreateNUWMul(Size, AlignVal);
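// Illustrative arithmetic for the rounding above: with a 10-byte VLA and an
// 8-byte declared alignment, Size becomes (10 + 7) / 8 * 8 = 16 bytes.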
1449
1450 // Allocate space for this VLA object to be globalized.
1451 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1452 llvm::CallBase *VoidPtr =
1453 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1454 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1455 AllocArgs, VD->getName());
1456 VoidPtr->addRetAttr(
1457 llvm::Attribute::get(CGM.getLLVMContext(), llvm::Attribute::Alignment,
1458 CGM.getContext().getTargetInfo().getNewAlign()));
1459
1460 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(
1461 std::pair<llvm::Value *, llvm::Value *>(
1462 {VoidPtr, CGF.getTypeSize(VD->getType())}));
1463 LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(),
1464 CGM.getContext().getDeclAlign(VD),
1465 AlignmentSource::Decl);
1466 I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
1467 Base.getAddress(CGF));
1468 }
1469 I->getSecond().MappedParams->apply(CGF);
1470}
1471
1472void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
1473 bool WithSPMDCheck) {
1474 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
1475 getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1476 return;
1477
1478 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1479 if (I != FunctionGlobalizedDecls.end()) {
1480 // Deallocate the memory for each globalized VLA object
1481 for (auto AddrSizePair :
1482 llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1483 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1484 CGM.getModule(), OMPRTL___kmpc_free_shared),
1485 {AddrSizePair.first, AddrSizePair.second});
1486 }
1487 // Deallocate the memory for each globalized value
1488 for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
1489 const auto *VD = cast<VarDecl>(Rec.first);
1490 I->getSecond().MappedParams->restore(CGF);
1491
1492 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1493 CGF.getTypeSize(VD->getType())};
1494 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1495 CGM.getModule(), OMPRTL___kmpc_free_shared),
1496 FreeArgs);
1497 }
1498 }
1499}
1500
1501void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1502 const OMPExecutableDirective &D,
1503 SourceLocation Loc,
1504 llvm::Function *OutlinedFn,
1505 ArrayRef<llvm::Value *> CapturedVars) {
1506 if (!CGF.HaveInsertPoint())
1507 return;
1508
1509 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1510 /*Name=*/".zero.addr");
1511 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1512 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1513 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
1514 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1515 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1516 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1517}
1518
1519void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1520 SourceLocation Loc,
1521 llvm::Function *OutlinedFn,
1522 ArrayRef<llvm::Value *> CapturedVars,
1523 const Expr *IfCond,
1524 llvm::Value *NumThreads) {
1525 if (!CGF.HaveInsertPoint())
1526 return;
1527
1528 auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1529 NumThreads](CodeGenFunction &CGF,
1530 PrePostActionTy &Action) {
1531 CGBuilderTy &Bld = CGF.Builder;
1532 llvm::Value *NumThreadsVal = NumThreads;
1533 llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1534 llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
1535 if (WFn)
1536 ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1537 llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
1538
1539 // Create a private scope that will globalize the arguments
1540 // passed from the outside of the target region.
1541 // TODO: Is that needed?
1542 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1543
1544 Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1545 llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
1546 "captured_vars_addrs");
1547 // There's something to share.
1548 if (!CapturedVars.empty()) {
1549 // Prepare for parallel region. Indicate the outlined function.
1550 ASTContext &Ctx = CGF.getContext();
1551 unsigned Idx = 0;
1552 for (llvm::Value *V : CapturedVars) {
1553 Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
1554 llvm::Value *PtrV;
1555 if (V->getType()->isIntegerTy())
1556 PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1557 else
1558 PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
1559 CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1560 Ctx.getPointerType(Ctx.VoidPtrTy));
1561 ++Idx;
1562 }
1563 }
1564
1565 llvm::Value *IfCondVal = nullptr;
1566 if (IfCond)
1567 IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
1568 /* isSigned */ false);
1569 else
1570 IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1571
1572 if (!NumThreadsVal)
1573 NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
1574 else
1575 NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty);
1576
1577 assert(IfCondVal && "Expected a value");
1578 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
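// Note (assumption based on the device runtime's __kmpc_parallel_51 entry
// point): the arguments below are, in order, the source location, the global
// thread id, the if-clause condition, the num_threads value (-1 when no
// clause was given), the proc_bind value (-1 when unspecified), the outlined
// function, its data-sharing wrapper (or null), the captured-variable array,
// and the number of captured variables.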
1579 llvm::Value *Args[] = {
1580 RTLoc,
1581 getThreadID(CGF, Loc),
1582 IfCondVal,
1583 NumThreadsVal,
1584 llvm::ConstantInt::get(CGF.Int32Ty, -1),
1585 FnPtr,
1586 ID,
1587 Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
1588 CGF.VoidPtrPtrTy),
1589 llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1590 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1591 CGM.getModule(), OMPRTL___kmpc_parallel_51),
1592 Args);
1593 };
1594
1595 RegionCodeGenTy RCG(ParallelGen);
1596 RCG(CGF);
1597}
1598
1599void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1600 // Always emit simple barriers!
1601 if (!CGF.HaveInsertPoint())
1602 return;
1603 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1604 // This function does not use parameters, so we can emit just default values.
1605 llvm::Value *Args[] = {
1606 llvm::ConstantPointerNull::get(
1607 cast<llvm::PointerType>(getIdentTyPointerTy())),
1608 llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
1609 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1610 CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
1611 Args);
1612}
1613
1614void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1615 SourceLocation Loc,
1616 OpenMPDirectiveKind Kind, bool,
1617 bool) {
1618 // Always emit simple barriers!
1619 if (!CGF.HaveInsertPoint())
1620 return;
1621 // Build call __kmpc_cancel_barrier(loc, thread_id);
1622 unsigned Flags = getDefaultFlagsForBarriers(Kind);
1623 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1624 getThreadID(CGF, Loc)};
1625
1626 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1627 CGM.getModule(), OMPRTL___kmpc_barrier),
1628 Args);
1629}
1630
1631void CGOpenMPRuntimeGPU::emitCriticalRegion(
1632 CodeGenFunction &CGF, StringRef CriticalName,
1633 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1634 const Expr *Hint) {
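// The code below serializes the critical region across the team: a local
// counter runs from 0 to the team width, and on each iteration only the
// thread whose id equals the counter executes the body while every other
// thread skips ahead to the synchronisation point.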
1635 llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
1636 llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
1637 llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
1638 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
1639 llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
1640
1641 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1642
1643 // Get the mask of active threads in the warp.
1644 llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1645 CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
1646 // Fetch team-local id of the thread.
1647 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1648
1649 // Get the width of the team.
1650 llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1651
1652 // Initialize the counter variable for the loop.
1653 QualType Int32Ty =
1654 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1655 Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
1656 LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
1657 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
1658 /*isInit=*/true);
1659
1660 // Block checks if loop counter exceeds upper bound.
1661 CGF.EmitBlock(LoopBB);
1662 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1663 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
1664 CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
1665
1666 // Block tests which single thread should execute the region, and which
1667 // threads should go straight to the synchronisation point.
1668 CGF.EmitBlock(TestBB);
1669 CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1670 llvm::Value *CmpThreadToCounter =
1671 CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
1672 CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
1673
1674 // Block emits the body of the critical region.
1675 CGF.EmitBlock(BodyBB);
1676
1677 // Output the critical statement.
1678 CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1679 Hint);
1680
1681 // After the body surrounded by the critical region, the single executing
1682 // thread will jump to the synchronisation point.
1683 // Block waits for all threads in the current team to finish, then increments
1684 // the counter variable and returns to the loop.
1685 CGF.EmitBlock(SyncBB);
1686 // Reconverge active threads in the warp.
1687 (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1688 CGM.getModule(), OMPRTL___kmpc_syncwarp),
1689 Mask);
1690
1691 llvm::Value *IncCounterVal =
1692 CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
1693 CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
1694 CGF.EmitBranch(LoopBB);
1695
1696 // Block that is reached when all threads in the team complete the region.
1697 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
1698}
1699
1700/// Cast value to the specified type.
1701static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1702 QualType ValTy, QualType CastTy,
1703 SourceLocation Loc) {
1704 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1705 "Cast type must sized.");
1706 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1707 "Val type must sized.");
1708 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
1709 if (ValTy == CastTy)
1710 return Val;
1711 if (CGF.getContext().getTypeSizeInChars(ValTy) ==
1712 CGF.getContext().getTypeSizeInChars(CastTy))
1713 return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
1714 if (CastTy->isIntegerType() && ValTy->isIntegerType())
1715 return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
1716 CastTy->hasSignedIntegerRepresentation());
1717 Address CastItem = CGF.CreateMemTemp(CastTy);
1718 Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1719 CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()),
1720 Val->getType());
1721 CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
1722 LValueBaseInfo(AlignmentSource::Type),
1723 TBAAAccessInfo());
1724 return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
1725 LValueBaseInfo(AlignmentSource::Type),
1726 TBAAAccessInfo());
1727}
1728
1729/// This function creates calls to one of two shuffle functions to copy
1730/// variables between lanes in a warp.
1731static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
1732 llvm::Value *Elem,
1733 QualType ElemType,
1734 llvm::Value *Offset,
1735 SourceLocation Loc) {
1736 CodeGenModule &CGM = CGF.CGM;
1737 CGBuilderTy &Bld = CGF.Builder;
1738 CGOpenMPRuntimeGPU &RT =
1739 *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
1740 llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
1741
1742 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1743 assert(Size.getQuantity() <= 8 &&
1744 "Unsupported bitwidth in shuffle instruction.");
1745
1746 RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
1747 ? OMPRTL___kmpc_shuffle_int32
1748 : OMPRTL___kmpc_shuffle_int64;
1749
1750 // Cast all types to 32- or 64-bit values before calling shuffle routines.
1751 QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
1752 Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
1753 llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
1754 llvm::Value *WarpSize =
1755 Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
1756
1757 llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
1758 OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
1759 {ElemCast, Offset, WarpSize});
1760
1761 return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
1762}
1763
1764static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
1765 Address DestAddr, QualType ElemType,
1766 llvm::Value *Offset, SourceLocation Loc) {
1767 CGBuilderTy &Bld = CGF.Builder;
1768
1769 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1770 // Create the loop over the big sized data.
1771 // ptr = (void*)Elem;
1772 // ptrEnd = (void*) Elem + 1;
1773 // Step = 8;
1774 // while (ptr + Step < ptrEnd)
1775 // shuffle((int64_t)*ptr);
1776 // Step = 4;
1777 // while (ptr + Step < ptrEnd)
1778 // shuffle((int32_t)*ptr);
1779 // ...
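// For example, a 12-byte element is moved as one 64-bit shuffle (bytes 0-7)
// followed by one 32-bit shuffle (bytes 8-11); the loop below never emits a
// shuffle wider than the remaining size.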
1780 Address ElemPtr = DestAddr;
1781 Address Ptr = SrcAddr;
1782 Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
1783 Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty);
1784 for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
1785 if (Size < CharUnits::fromQuantity(IntSize))
1786 continue;
1787 QualType IntType = CGF.getContext().getIntTypeForBitwidth(
1788 CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
1789 /*Signed=*/1);
1790 llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
1791 Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(),
1792 IntTy);
1793 ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1794 ElemPtr, IntTy->getPointerTo(), IntTy);
1795 if (Size.getQuantity() / IntSize > 1) {
1796 llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
1797 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
1798 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
1799 llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
1800 CGF.EmitBlock(PreCondBB);
1801 llvm::PHINode *PhiSrc =
1802 Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
1803 PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
1804 llvm::PHINode *PhiDest =
1805 Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
1806 PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
1807 Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
1808 ElemPtr =
1809 Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
1810 llvm::Value *PtrDiff = Bld.CreatePtrDiff(
1811 CGF.Int8Ty, PtrEnd.getPointer(),
1812 Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr.getPointer(),
1813 CGF.VoidPtrTy));
1814 Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
1815 ThenBB, ExitBB);
1816 CGF.EmitBlock(ThenBB);
1817 llvm::Value *Res = createRuntimeShuffleFunction(
1818 CGF,
1819 CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1820 LValueBaseInfo(AlignmentSource::Type),
1821 TBAAAccessInfo()),
1822 IntType, Offset, Loc);
1823 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1824 LValueBaseInfo(AlignmentSource::Type),
1825 TBAAAccessInfo());
1826 Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
1827 Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1828 PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
1829 PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
1830 CGF.EmitBranch(PreCondBB);
1831 CGF.EmitBlock(ExitBB);
1832 } else {
1833 llvm::Value *Res = createRuntimeShuffleFunction(
1834 CGF,
1835 CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1836 LValueBaseInfo(AlignmentSource::Type),
1837 TBAAAccessInfo()),
1838 IntType, Offset, Loc);
1839 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1840 LValueBaseInfo(AlignmentSource::Type),
1841 TBAAAccessInfo());
1842 Ptr = Bld.CreateConstGEP(Ptr, 1);
1843 ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1844 }
1845 Size = Size % IntSize;
1846 }
1847}
1848
1849namespace {
1850enum CopyAction : unsigned {
1851 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1852 // the warp using shuffle instructions.
1853 RemoteLaneToThread,
1854 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1855 ThreadCopy,
1856 // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
1857 ThreadToScratchpad,
1858 // ScratchpadToThread: Copy from a scratchpad array in global memory
1859 // containing team-reduced data to a thread's stack.
1860 ScratchpadToThread,
1861};
1862} // namespace
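// Of these actions, RemoteLaneToThread and ThreadCopy are the ones exercised
// by the shuffle-and-reduce helper emitted below; the scratchpad variants
// move data through a team-level buffer in global memory.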
1863
1864struct CopyOptionsTy {
1865 llvm::Value *RemoteLaneOffset;
1866 llvm::Value *ScratchpadIndex;
1867 llvm::Value *ScratchpadWidth;
1868};
1869
1870/// Emit instructions to copy a Reduce list, which contains partially
1871/// aggregated values, in the specified direction.
1872static void emitReductionListCopy(
1873 CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
1874 ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
1875 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
1876
1877 CodeGenModule &CGM = CGF.CGM;
1878 ASTContext &C = CGM.getContext();
1879 CGBuilderTy &Bld = CGF.Builder;
1880
1881 llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1882 llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
1883 llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
1884
1885 // Iterates, element-by-element, through the source Reduce list and
1886 // makes a copy.
1887 unsigned Idx = 0;
1888 unsigned Size = Privates.size();
1889 for (const Expr *Private : Privates) {
1890 Address SrcElementAddr = Address::invalid();
1891 Address DestElementAddr = Address::invalid();
1892 Address DestElementPtrAddr = Address::invalid();
1893 // Should we shuffle in an element from a remote lane?
1894 bool ShuffleInElement = false;
1895 // Set to true to update the pointer in the dest Reduce list to a
1896 // newly created element.
1897 bool UpdateDestListPtr = false;
1898 // Increment the src or dest pointer to the scratchpad, for each
1899 // new element.
1900 bool IncrScratchpadSrc = false;
1901 bool IncrScratchpadDest = false;
1902 QualType PrivatePtrType = C.getPointerType(Private->getType());
1903 llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType);
1904
1905 switch (Action) {
1906 case RemoteLaneToThread: {
1907 // Step 1.1: Get the address for the src element in the Reduce list.
1908 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1909 SrcElementAddr =
1910 CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1911 SrcElementPtrAddr, PrivateLlvmPtrType),
1912 PrivatePtrType->castAs<PointerType>());
1913
1914 // Step 1.2: Create a temporary to store the element in the destination
1915 // Reduce list.
1916 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1917 DestElementAddr =
1918 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1919 ShuffleInElement = true;
1920 UpdateDestListPtr = true;
1921 break;
1922 }
1923 case ThreadCopy: {
1924 // Step 1.1: Get the address for the src element in the Reduce list.
1925 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1926 SrcElementAddr =
1927 CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1928 SrcElementPtrAddr, PrivateLlvmPtrType),
1929 PrivatePtrType->castAs<PointerType>());
1930
1931 // Step 1.2: Get the address for dest element. The destination
1932 // element has already been created on the thread's stack.
1933 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1934 DestElementAddr =
1935 CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1936 DestElementPtrAddr, PrivateLlvmPtrType),
1937 PrivatePtrType->castAs<PointerType>());
1938 break;
1939 }
1940 case ThreadToScratchpad: {
1941 // Step 1.1: Get the address for the src element in the Reduce list.
1942 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1943 SrcElementAddr =
1944 CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1945 SrcElementPtrAddr, PrivateLlvmPtrType),
1946 PrivatePtrType->castAs<PointerType>());
1947
1948 // Step 1.2: Get the address for dest element:
1949 // address = base + index * ElementSizeInChars.
1950 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1951 llvm::Value *CurrentOffset =
1952 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1953 llvm::Value *ScratchPadElemAbsolutePtrVal =
1954 Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
1955 ScratchPadElemAbsolutePtrVal =
1956 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1957 DestElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty,
1958 C.getTypeAlignInChars(Private->getType()));
1959 IncrScratchpadDest = true;
1960 break;
1961 }
1962 case ScratchpadToThread: {
1963 // Step 1.1: Get the address for the src element in the scratchpad.
1964 // address = base + index * ElementSizeInChars.
1965 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1966 llvm::Value *CurrentOffset =
1967 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1968 llvm::Value *ScratchPadElemAbsolutePtrVal =
1969 Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
1970 ScratchPadElemAbsolutePtrVal =
1971 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1972 SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty,
1973 C.getTypeAlignInChars(Private->getType()));
1974 IncrScratchpadSrc = true;
1975
1976 // Step 1.2: Create a temporary to store the element in the destination
1977 // Reduce list.
1978 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1979 DestElementAddr =
1980 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1981 UpdateDestListPtr = true;
1982 break;
1983 }
1984 }
1985
1986 // Regardless of the src and dest of the copy, we emit the load of the src
1987 // element, as this is required in all directions.
1988 SrcElementAddr = Bld.CreateElementBitCast(
1989 SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
1990 DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
1991 SrcElementAddr.getElementType());
1992
1993 // Now that all active lanes have read the element in the
1994 // Reduce list, shuffle over the value from the remote lane.
1995 if (ShuffleInElement) {
1996 shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
1997 RemoteLaneOffset, Private->getExprLoc());
1998 } else {
1999 switch (CGF.getEvaluationKind(Private->getType())) {
2000 case TEK_Scalar: {
2001 llvm::Value *Elem = CGF.EmitLoadOfScalar(
2002 SrcElementAddr, /*Volatile=*/false, Private->getType(),
2003 Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
2004 TBAAAccessInfo());
2005 // Store the source element value to the dest element address.
2006 CGF.EmitStoreOfScalar(
2007 Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
2008 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2009 break;
2010 }
2011 case TEK_Complex: {
2012 CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
2013 CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
2014 Private->getExprLoc());
2015 CGF.EmitStoreOfComplex(
2016 Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
2017 /*isInit=*/false);
2018 break;
2019 }
2020 case TEK_Aggregate:
2021 CGF.EmitAggregateCopy(
2022 CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
2023 CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
2024 Private->getType(), AggValueSlot::DoesNotOverlap);
2025 break;
2026 }
2027 }
2028
2029 // Step 3.1: Modify reference in dest Reduce list as needed.
2030 // Modifying the reference in Reduce list to point to the newly
2031 // created element. The element is live in the current function
2032 // scope and that of functions it invokes (i.e., reduce_function).
2033 // RemoteReduceData[i] = (void*)&RemoteElem
2034 if (UpdateDestListPtr) {
2035 CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
2036 DestElementAddr.getPointer(), CGF.VoidPtrTy),
2037 DestElementPtrAddr, /*Volatile=*/false,
2038 C.VoidPtrTy);
2039 }
2040
2041 // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
2042 // address of the next element in scratchpad memory, unless we're currently
2043 // processing the last one. Memory alignment is also taken care of here.
2044 if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
2045 // FIXME: This code doesn't make any sense, it's trying to perform
2046 // integer arithmetic on pointers.
2047 llvm::Value *ScratchpadBasePtr =
2048 IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
2049 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2050 ScratchpadBasePtr = Bld.CreateNUWAdd(
2051 ScratchpadBasePtr,
2052 Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
2053
2054 // Take care of global memory alignment for performance
2055 ScratchpadBasePtr = Bld.CreateNUWSub(
2056 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2057 ScratchpadBasePtr = Bld.CreateUDiv(
2058 ScratchpadBasePtr,
2059 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
2060 ScratchpadBasePtr = Bld.CreateNUWAdd(
2061 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2062 ScratchpadBasePtr = Bld.CreateNUWMul(
2063 ScratchpadBasePtr,
2064 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
2065
2066 if (IncrScratchpadDest)
2067 DestBase =
2068 Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign());
2069 else /* IncrScratchpadSrc = true */
2070 SrcBase =
2071 Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign());
2072 }
2073
2074 ++Idx;
2075 }
2076}
2077
2078/// This function emits a helper that gathers Reduce lists from the first
2079/// lane of every active warp to lanes in the first warp.
2080///
2081/// void inter_warp_copy_func(void* reduce_data, num_warps)
2082/// shared smem[warp_size];
2083/// For all data entries D in reduce_data:
2084/// sync
2085/// If (I am the first lane in each warp)
2086/// Copy my local D to smem[warp_id]
2087/// sync
2088/// if (I am the first warp)
2089/// Copy smem[thread_id] to my local D
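/// As an illustration with two active warps reducing a single 32-bit value:
/// lane 0 of each warp stores its partial result into smem[0] and smem[1];
/// after the barrier, threads 0 and 1 of warp 0 (those with thread id smaller
/// than num_warps) load smem[0] and smem[1] back into their local Reduce
/// lists, leaving the final combine of the gathered values to the rest of the
/// reduction sequence.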
2090static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
2091 ArrayRef<const Expr *> Privates,
2092 QualType ReductionArrayTy,
2093 SourceLocation Loc) {
2094 ASTContext &C = CGM.getContext();
2095 llvm::Module &M = CGM.getModule();
2096
2097 // ReduceList: thread local Reduce list.
2098 // At the stage of the computation when this function is called, partially
2099 // aggregated values reside in the first lane of every active warp.
2100 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2101 C.VoidPtrTy, ImplicitParamDecl::Other);
2102 // NumWarps: number of warps active in the parallel region. This could
2103 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2104 ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2105 C.getIntTypeForBitwidth(32, /* Signed */ true),
2106 ImplicitParamDecl::Other);
2107 FunctionArgList Args;
2108 Args.push_back(&ReduceListArg);
2109 Args.push_back(&NumWarpsArg);
2110
2111 const CGFunctionInfo &CGFI =
2112 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2113 auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
2114 llvm::GlobalValue::InternalLinkage,
2115 "_omp_reduction_inter_warp_copy_func", &M);
2116 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2117 Fn->setDoesNotRecurse();
2118 CodeGenFunction CGF(CGM);
2119 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2120
2121 CGBuilderTy &Bld = CGF.Builder;
2122
2123 // This array is used as a medium to transfer, one reduce element at a time,
2124 // the data from the first lane of every warp to lanes in the first warp
2125 // in order to perform the final step of a reduction in a parallel region
2126 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2127 // for reduced latency, as well as to have a distinct copy for concurrently
2128 // executing target regions. The array is declared with common linkage so
2129 // as to be shared across compilation units.
2130 StringRef TransferMediumName =
2131 "__openmp_nvptx_data_transfer_temporary_storage";
2132 llvm::GlobalVariable *TransferMedium =
2133 M.getGlobalVariable(TransferMediumName);
2134 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
2135 if (!TransferMedium) {
11. Assuming 'TransferMedium' is non-null
12. Taking false branch
2136 auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
2137 unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
2138 TransferMedium = new llvm::GlobalVariable(
2139 M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
2140 llvm::UndefValue::get(Ty), TransferMediumName,
2141 /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
2142 SharedAddressSpace);
2143 CGM.addCompilerUsedGlobal(TransferMedium);
2144 }
2145
2146 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
2147 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2148 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
2149 // nvptx_lane_id = nvptx_id % warpsize
2150 llvm::Value *LaneID = getNVPTXLaneID(CGF);
13. Calling 'getNVPTXLaneID'
2151 // nvptx_warp_id = nvptx_id / warpsize
2152 llvm::Value *WarpID = getNVPTXWarpID(CGF);
2153
2154 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2155 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2156 Address LocalReduceList(
2157 Bld.CreatePointerBitCastOrAddrSpaceCast(
2158 CGF.EmitLoadOfScalar(
2159 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
2160 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
2161 ElemTy->getPointerTo()),
2162 ElemTy, CGF.getPointerAlign());
2163
2164 unsigned Idx = 0;
2165 for (const Expr *Private : Privates) {
2166 //
2167 // Warp master copies reduce element to transfer medium in __shared__
2168 // memory.
2169 //
2170 unsigned RealTySize =
2171 C.getTypeSizeInChars(Private->getType())
2172 .alignTo(C.getTypeAlignInChars(Private->getType()))
2173 .getQuantity();
2174 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
2175 unsigned NumIters = RealTySize / TySize;
2176 if (NumIters == 0)
2177 continue;
2178 QualType CType = C.getIntTypeForBitwidth(
2179 C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
2180 llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
2181 CharUnits Align = CharUnits::fromQuantity(TySize);
2182 llvm::Value *Cnt = nullptr;
2183 Address CntAddr = Address::invalid();
2184 llvm::BasicBlock *PrecondBB = nullptr;
2185 llvm::BasicBlock *ExitBB = nullptr;
2186 if (NumIters > 1) {
2187 CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
2188 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
2189 /*Volatile=*/false, C.IntTy);
2190 PrecondBB = CGF.createBasicBlock("precond");
2191 ExitBB = CGF.createBasicBlock("exit");
2192 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
2193 // There is no need to emit line number for unconditional branch.
2194 (void)ApplyDebugLocation::CreateEmpty(CGF);
2195 CGF.EmitBlock(PrecondBB);
2196 Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
2197 llvm::Value *Cmp =
2198 Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
2199 Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
2200 CGF.EmitBlock(BodyBB);
2201 }
2202 // kmpc_barrier.
2203 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
2204 /*EmitChecks=*/false,
2205 /*ForceSimpleCall=*/true);
2206 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2207 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2208 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2209
2210 // if (lane_id == 0)
2211 llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
2212 Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2213 CGF.EmitBlock(ThenBB);
2214
2215 // Reduce element = LocalReduceList[i]
2216 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2217 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2218 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2219 // elemptr = ((CopyType*)(elemptrptr)) + I
2220 Address ElemPtr(ElemPtrPtr, CGF.Int8Ty, Align);
2221 ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
2222 if (NumIters > 1)
2223 ElemPtr = Bld.CreateGEP(ElemPtr, Cnt);
2224
2225 // Get pointer to location in transfer medium.
2226 // MediumPtr = &medium[warp_id]
2227 llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
2228 TransferMedium->getValueType(), TransferMedium,
2229 {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
2230 // Casting to actual data type.
2231 // MediumPtr = (CopyType*)MediumPtrAddr;
2232 Address MediumPtr(
2233 Bld.CreateBitCast(
2234 MediumPtrVal,
2235 CopyType->getPointerTo(
2236 MediumPtrVal->getType()->getPointerAddressSpace())),
2237 CopyType, Align);
2238
2239 // elem = *elemptr
2240 //*MediumPtr = elem
2241 llvm::Value *Elem = CGF.EmitLoadOfScalar(
2242 ElemPtr, /*Volatile=*/false, CType, Loc,
2243 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2244 // Store the source element value to the dest element address.
2245 CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
2246 LValueBaseInfo(AlignmentSource::Type),
2247 TBAAAccessInfo());
2248
2249 Bld.CreateBr(MergeBB);
2250
2251 CGF.EmitBlock(ElseBB);
2252 Bld.CreateBr(MergeBB);
2253
2254 CGF.EmitBlock(MergeBB);
2255
2256 // kmpc_barrier.
2257 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
2258 /*EmitChecks=*/false,
2259 /*ForceSimpleCall=*/true);
2260
2261 //
2262 // Warp 0 copies reduce element from transfer medium.
2263 //
2264 llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
2265 llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
2266 llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
2267
2268 Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
2269 llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
2270 AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
2271
2272 // Up to 32 threads in warp 0 are active.
2273 llvm::Value *IsActiveThread =
2274 Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
2275 Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2276
2277 CGF.EmitBlock(W0ThenBB);
2278
2279 // SrcMediumPtr = &medium[tid]
2280 llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
2281 TransferMedium->getValueType(), TransferMedium,
2282 {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
2283 // SrcMediumVal = *SrcMediumPtr;
2284 Address SrcMediumPtr(
2285 Bld.CreateBitCast(
2286 SrcMediumPtrVal,
2287 CopyType->getPointerTo(
2288 SrcMediumPtrVal->getType()->getPointerAddressSpace())),
2289 CopyType, Align);
2290
2291 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2292 Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2293 llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
2294 TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
2295 Address TargetElemPtr(TargetElemPtrVal, CGF.Int8Ty, Align);
2296 TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
2297 if (NumIters > 1)
2298 TargetElemPtr = Bld.CreateGEP(TargetElemPtr, Cnt);
2299
2300 // *TargetElemPtr = SrcMediumVal;
2301 llvm::Value *SrcMediumValue =
2302 CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
2303 CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
2304 CType);
2305 Bld.CreateBr(W0MergeBB);
2306
2307 CGF.EmitBlock(W0ElseBB);
2308 Bld.CreateBr(W0MergeBB);
2309
2310 CGF.EmitBlock(W0MergeBB);
2311
2312 if (NumIters > 1) {
2313 Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
2314 CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
2315 CGF.EmitBranch(PrecondBB);
2316 (void)ApplyDebugLocation::CreateEmpty(CGF);
2317 CGF.EmitBlock(ExitBB);
2318 }
2319 RealTySize %= TySize;
2320 }
2321 ++Idx;
2322 }
2323
2324 CGF.FinishFunction();
2325 return Fn;
2326}
2327
2328/// Emit a helper that reduces data across two OpenMP threads (lanes)
2329/// in the same warp. It uses shuffle instructions to copy over data from
2330/// a remote lane's stack. The reduction algorithm performed is specified
2331/// by the fourth parameter.
2332///
2333/// Algorithm Versions.
2334/// Full Warp Reduce (argument value 0):
2335/// This algorithm assumes that all 32 lanes are active and gathers
2336/// data from these 32 lanes, producing a single resultant value.
2337/// Contiguous Partial Warp Reduce (argument value 1):
2338/// This algorithm assumes that only a *contiguous* subset of lanes
2339/// are active. This happens for the last warp in a parallel region
2340/// when the user specified num_threads is not an integer multiple of
2341/// 32. This contiguous subset always starts with the zeroth lane.
2342/// Partial Warp Reduce (argument value 2):
2343/// This algorithm gathers data from any number of lanes at any position.
2344/// All reduced values are stored in the lowest possible lane. The set
2345/// of problems every algorithm addresses is a super set of those
2346/// addressable by algorithms with a lower version number. Overhead
2347/// increases as algorithm version increases.
2348///
2349/// Terminology
2350/// Reduce element:
2351/// Reduce element refers to the individual data field with primitive
2352/// data types to be combined and reduced across threads.
2353/// Reduce list:
2354/// Reduce list refers to a collection of local, thread-private
2355/// reduce elements.
2356/// Remote Reduce list:
2357/// Remote Reduce list refers to a collection of remote (relative to
2358/// the current thread) reduce elements.
2359///
2360/// We distinguish between three states of threads that are important to
2361/// the implementation of this function.
2362/// Alive threads:
2363/// Threads in a warp executing the SIMT instruction, as distinguished from
2364/// threads that are inactive due to divergent control flow.
2365/// Active threads:
2366/// The minimal set of threads that has to be alive upon entry to this
2367/// function. The computation is correct iff active threads are alive.
2368/// Some threads are alive but they are not active because they do not
2369/// contribute to the computation in any useful manner. Turning them off
2370/// may introduce control flow overheads without any tangible benefits.
2371/// Effective threads:
2372/// In order to comply with the argument requirements of the shuffle
2373/// function, we must keep all lanes holding data alive. But at most
2374/// half of them perform value aggregation; we refer to this half of
2375/// threads as effective. The other half is simply handing off their
2376/// data.
2377///
2378/// Procedure
2379/// Value shuffle:
2380/// In this step active threads transfer data from higher lane positions
2381/// in the warp to lower lane positions, creating Remote Reduce list.
2382/// Value aggregation:
2383/// In this step, effective threads combine their thread local Reduce list
2384/// with Remote Reduce list and store the result in the thread local
2385/// Reduce list.
2386/// Value copy:
2387/// In this step, we deal with the assumption made by algorithm 2
2388/// (i.e. contiguity assumption). When we have an odd number of lanes
2389/// active, say 2k+1, only k threads will be effective and therefore k
2390/// new values will be produced. However, the Reduce list owned by the
2391/// (2k+1)th thread is ignored in the value aggregation. Therefore
2392/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
2393/// that the contiguity assumption still holds.
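/// Concretely, for algorithm version 1 the predicates emitted below mean:
/// lanes with LaneId < Offset combine their local Reduce list with the list
/// shuffled in from a higher lane, while lanes with LaneId >= Offset simply
/// overwrite their local list with the shuffled-in copy (the "value copy"
/// step described above).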
2394static llvm::Function *emitShuffleAndReduceFunction(
2395 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2396 QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
2397 ASTContext &C = CGM.getContext();
2398
2399 // Thread local Reduce list used to host the values of data to be reduced.
2400 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2401 C.VoidPtrTy, ImplicitParamDecl::Other);
2402 // Current lane id; could be logical.
2403 ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2404 ImplicitParamDecl::Other);
2405 // Offset of the remote source lane relative to the current lane.
2406 ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2407 C.ShortTy, ImplicitParamDecl::Other);
2408 // Algorithm version. This is expected to be known at compile time.
2409 ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2410 C.ShortTy, ImplicitParamDecl::Other);
2411 FunctionArgList Args;
2412 Args.push_back(&ReduceListArg);
2413 Args.push_back(&LaneIDArg);
2414 Args.push_back(&RemoteLaneOffsetArg);
2415 Args.push_back(&AlgoVerArg);
2416
2417 const CGFunctionInfo &CGFI =
2418 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2419 auto *Fn = llvm::Function::Create(
2420 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2421 "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2422 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2423 Fn->setDoesNotRecurse();
2424
2425 CodeGenFunction CGF(CGM);
2426 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2427
2428 CGBuilderTy &Bld = CGF.Builder;
2429
2430 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2431 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2432 Address LocalReduceList(
2433 Bld.CreatePointerBitCastOrAddrSpaceCast(
2434 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2435 C.VoidPtrTy, SourceLocation()),
2436 ElemTy->getPointerTo()),
2437 ElemTy, CGF.getPointerAlign());
2438
2439 Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2440 llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2441 AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2442
2443 Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2444 llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2445 AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2446
2447 Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2448 llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2449 AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2450
2451 // Create a local thread-private variable to host the Reduce list
2452 // from a remote lane.
2453 Address RemoteReduceList =
2454 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2455
2456 // This loop iterates through the list of reduce elements and copies,
2457 // element by element, from a remote lane in the warp to RemoteReduceList,
2458 // hosted on the thread's stack.
2459 emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
2460 LocalReduceList, RemoteReduceList,
2461 {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
2462 /*ScratchpadIndex=*/nullptr,
2463 /*ScratchpadWidth=*/nullptr});
2464
2465 // The actions to be performed on the Remote Reduce list are dependent
2466 // on the algorithm version.
2467 //
2468 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2469 // LaneId % 2 == 0 && Offset > 0):
2470 // do the reduction value aggregation
2471 //
2472 // The thread local variable Reduce list is mutated in place to host the
2473 // reduced data, which is the aggregated value produced from local and
2474 // remote lanes.
2475 //
2476 // Note that AlgoVer is expected to be a constant integer known at compile
2477 // time.
2478 // When AlgoVer==0, the first conjunction evaluates to true, making
2479 // the entire predicate true during compile time.
2480 // When AlgoVer==1, the second conjunction has only the second part to be
2481 // evaluated during runtime. Other conjunctions evaluate to false
2482 // during compile time.
2483 // When AlgoVer==2, the third conjunction has only the second part to be
2484 // evaluated during runtime. Other conjunctions evaluate to false
2485 // during compile time.
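// In other words, once the caller's constant AlgoVer is propagated, the
// reduction predicate simplifies to: always reduce (version 0), reduce iff
// LaneId < Offset (version 1), or reduce iff LaneId is even and Offset > 0
// (version 2).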
2486 llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
2487
2488 llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2489 llvm::Value *CondAlgo1 = Bld.CreateAnd(
2490 Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
2491
2492 llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
2493 llvm::Value *CondAlgo2 = Bld.CreateAnd(
2494 Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
2495 CondAlgo2 = Bld.CreateAnd(
2496 CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
2497
2498 llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
2499 CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
2500
2501 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2502 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2503 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2504 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
2505
2506 CGF.EmitBlock(ThenBB);
2507 // reduce_function(LocalReduceList, RemoteReduceList)
2508 llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2509 LocalReduceList.getPointer(), CGF.VoidPtrTy);
2510 llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2511 RemoteReduceList.getPointer(), CGF.VoidPtrTy);
2512 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2513 CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
2514 Bld.CreateBr(MergeBB);
2515
2516 CGF.EmitBlock(ElseBB);
2517 Bld.CreateBr(MergeBB);
2518
2519 CGF.EmitBlock(MergeBB);
2520
2521 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2522 // Reduce list.
2523 Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2524 llvm::Value *CondCopy = Bld.CreateAnd(
2525 Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
2526
2527 llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
2528 llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
2529 llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
2530 Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2531
2532 CGF.EmitBlock(CpyThenBB);
2533 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
2534 RemoteReduceList, LocalReduceList);
2535 Bld.CreateBr(CpyMergeBB);
2536
2537 CGF.EmitBlock(CpyElseBB);
2538 Bld.CreateBr(CpyMergeBB);
2539
2540 CGF.EmitBlock(CpyMergeBB);
2541
2542 CGF.FinishFunction();
2543 return Fn;
2544}
2545
2546/// This function emits a helper that copies all the reduction variables from
2547/// the team into the provided global buffer for the reduction variables.
2548///
2549/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
2550/// For all data entries D in reduce_data:
2551/// Copy local D to buffer.D[Idx]
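/// The {0, Idx} GEP used below implies that each reduction variable D is laid
/// out as an array-typed field of the buffer record, so the copy performed
/// here is effectively "buffer.D[Idx] = local D", as in the pseudocode above.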
2552static llvm::Value *emitListToGlobalCopyFunction(
2553 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2554 QualType ReductionArrayTy, SourceLocation Loc,
2555 const RecordDecl *TeamReductionRec,
2556 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2557 &VarFieldMap) {
2558 ASTContext &C = CGM.getContext();
2559
2560 // Buffer: global reduction buffer.
2561 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2562 C.VoidPtrTy, ImplicitParamDecl::Other);
2563 // Idx: index of the buffer.
2564 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2565 ImplicitParamDecl::Other);
2566 // ReduceList: thread local Reduce list.
2567 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2568 C.VoidPtrTy, ImplicitParamDecl::Other);
2569 FunctionArgList Args;
2570 Args.push_back(&BufferArg);
2571 Args.push_back(&IdxArg);
2572 Args.push_back(&ReduceListArg);
2573
2574 const CGFunctionInfo &CGFI =
2575 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2576 auto *Fn = llvm::Function::Create(
2577 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2578 "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
2579 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2580 Fn->setDoesNotRecurse();
2581 CodeGenFunction CGF(CGM);
2582 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2583
2584 CGBuilderTy &Bld = CGF.Builder;
2585
2586 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2587 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2588 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2589 Address LocalReduceList(
2590 Bld.CreatePointerBitCastOrAddrSpaceCast(
2591 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2592 C.VoidPtrTy, Loc),
2593 ElemTy->getPointerTo()),
2594 ElemTy, CGF.getPointerAlign());
2595 QualType StaticTy = C.getRecordType(TeamReductionRec);
2596 llvm::Type *LLVMReductionsBufferTy =
2597 CGM.getTypes().ConvertTypeForMem(StaticTy);
2598 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2599 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2600 LLVMReductionsBufferTy->getPointerTo());
2601 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2602 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2603 /*Volatile=*/false, C.IntTy,
2604 Loc)};
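With standard GEP semantics, the index pair built above yields &buffer.D[Idx] for each field D handled below: the leading zero steps through the pointer to the record field (assumed to be an array with one slot per reduction buffer) and the loaded Idx selects this team's slot.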
2605 unsigned Idx = 0;
2606 for (const Expr *Private : Privates) {
2607 // Reduce element = LocalReduceList[i]
2608 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2609 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2610 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2611 // elemptr = ((CopyType*)(elemptrptr)) + I
2612 ElemTy = CGF.ConvertTypeForMem(Private->getType());
2613 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2614 ElemPtrPtr, ElemTy->getPointerTo());
2615 Address ElemPtr =
2616 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2617 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2618 // Global = Buffer.VD[Idx];
2619 const FieldDecl *FD = VarFieldMap.lookup(VD);
2620 LValue GlobLVal = CGF.EmitLValueForField(
2621 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2622 Address GlobAddr = GlobLVal.getAddress(CGF);
2623 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(),
2624 GlobAddr.getPointer(), Idxs);
2625 GlobLVal.setAddress(Address(BufferPtr,
2626 CGF.ConvertTypeForMem(Private->getType()),
2627 GlobAddr.getAlignment()));
2628 switch (CGF.getEvaluationKind(Private->getType())) {
2629 case TEK_Scalar: {
2630 llvm::Value *V = CGF.EmitLoadOfScalar(
2631 ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
2632 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2633 CGF.EmitStoreOfScalar(V, GlobLVal);
2634 break;
2635 }
2636 case TEK_Complex: {
2637 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
2638 CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
2639 CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
2640 break;
2641 }
2642 case TEK_Aggregate:
2643 CGF.EmitAggregateCopy(GlobLVal,
2644 CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2645 Private->getType(), AggValueSlot::DoesNotOverlap);
2646 break;
2647 }
2648 ++Idx;
2649 }
2650
2651 CGF.FinishFunction();
2652 return Fn;
2653}
2654
2655/// This function emits a helper that reduces all the reduction variables from
2656/// the team into the provided global buffer for the reduction variables.
2657///
2658/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
2659/// void *GlobPtrs[];
2660/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2661/// ...
2662/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2663/// reduce_function(GlobPtrs, reduce_data);
2664static llvm::Value *emitListToGlobalReduceFunction(
2665 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2666 QualType ReductionArrayTy, SourceLocation Loc,
2667 const RecordDecl *TeamReductionRec,
2668 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2669 &VarFieldMap,
2670 llvm::Function *ReduceFn) {
2671 ASTContext &C = CGM.getContext();
2672
2673 // Buffer: global reduction buffer.
2674 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2675 C.VoidPtrTy, ImplicitParamDecl::Other);
2676 // Idx: index of the buffer.
2677 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2678 ImplicitParamDecl::Other);
2679 // ReduceList: thread local Reduce list.
2680 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2681 C.VoidPtrTy, ImplicitParamDecl::Other);
2682 FunctionArgList Args;
2683 Args.push_back(&BufferArg);
2684 Args.push_back(&IdxArg);
2685 Args.push_back(&ReduceListArg);
2686
2687 const CGFunctionInfo &CGFI =
2688 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2689 auto *Fn = llvm::Function::Create(
2690 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2691 "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
2692 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2693 Fn->setDoesNotRecurse();
2694 CodeGenFunction CGF(CGM);
2695 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2696
2697 CGBuilderTy &Bld = CGF.Builder;
2698
2699 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2700 QualType StaticTy = C.getRecordType(TeamReductionRec);
2701 llvm::Type *LLVMReductionsBufferTy =
2702 CGM.getTypes().ConvertTypeForMem(StaticTy);
2703 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2704 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2705 LLVMReductionsBufferTy->getPointerTo());
2706
2707 // 1. Build a list of reduction variables.
2708 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2709 Address ReductionList =
2710 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2711 auto IPriv = Privates.begin();
2712 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2713 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2714 /*Volatile=*/false, C.IntTy,
2715 Loc)};
2716 unsigned Idx = 0;
2717 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2718 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2719 // Global = Buffer.VD[Idx];
2720 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2721 const FieldDecl *FD = VarFieldMap.lookup(VD);
2722 LValue GlobLVal = CGF.EmitLValueForField(
2723 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2724 Address GlobAddr = GlobLVal.getAddress(CGF);
2725 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2726 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2727 llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
2728 CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
2729 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2730 // Store array size.
2731 ++Idx;
2732 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2733 llvm::Value *Size = CGF.Builder.CreateIntCast(
2734 CGF.getVLASize(
2735 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2736 .NumElts,
2737 CGF.SizeTy, /*isSigned=*/false);
2738 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2739 Elem);
2740 }
2741 }
2742
2743 // Call reduce_function(GlobalReduceList, ReduceList)
2744 llvm::Value *GlobalReduceList =
2745 CGF.EmitCastToVoidPtr(ReductionList.getPointer());
2746 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2747 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2748 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2749 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2750 CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
2751 CGF.FinishFunction();
2752 return Fn;
2753}
2754
2755/// This function emits a helper that copies all the reduction variables from
2756/// the provided global buffer into the thread-local reduction variables.
2757///
2758/// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
2759/// For all data entries D in reduce_data:
2760/// Copy buffer.D[Idx] to local D;
2761static llvm::Value *emitGlobalToListCopyFunction(
2762 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2763 QualType ReductionArrayTy, SourceLocation Loc,
2764 const RecordDecl *TeamReductionRec,
2765 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2766 &VarFieldMap) {
2767 ASTContext &C = CGM.getContext();
2768
2769 // Buffer: global reduction buffer.
2770 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2771 C.VoidPtrTy, ImplicitParamDecl::Other);
2772 // Idx: index of the buffer.
2773 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2774 ImplicitParamDecl::Other);
2775 // ReduceList: thread local Reduce list.
2776 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2777 C.VoidPtrTy, ImplicitParamDecl::Other);
2778 FunctionArgList Args;
2779 Args.push_back(&BufferArg);
2780 Args.push_back(&IdxArg);
2781 Args.push_back(&ReduceListArg);
2782
2783 const CGFunctionInfo &CGFI =
2784 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2785 auto *Fn = llvm::Function::Create(
2786 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2787 "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
2788 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2789 Fn->setDoesNotRecurse();
2790 CodeGenFunction CGF(CGM);
2791 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2792
2793 CGBuilderTy &Bld = CGF.Builder;
2794
2795 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2796 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2797 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2798 Address LocalReduceList(
2799 Bld.CreatePointerBitCastOrAddrSpaceCast(
2800 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2801 C.VoidPtrTy, Loc),
2802 ElemTy->getPointerTo()),
2803 ElemTy, CGF.getPointerAlign());
2804 QualType StaticTy = C.getRecordType(TeamReductionRec);
2805 llvm::Type *LLVMReductionsBufferTy =
2806 CGM.getTypes().ConvertTypeForMem(StaticTy);
2807 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2808 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2809 LLVMReductionsBufferTy->getPointerTo());
2810
2811 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2812 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2813 /*Volatile=*/false, C.IntTy,
2814 Loc)};
2815 unsigned Idx = 0;
2816 for (const Expr *Private : Privates) {
2817 // Reduce element = LocalReduceList[i]
2818 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2819 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2820 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2821 // elemptr = ((CopyType*)(elemptrptr)) + I
2822 ElemTy = CGF.ConvertTypeForMem(Private->getType());
2823 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2824 ElemPtrPtr, ElemTy->getPointerTo());
2825 Address ElemPtr =
2826 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2827 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2828 // Global = Buffer.VD[Idx];
2829 const FieldDecl *FD = VarFieldMap.lookup(VD);
2830 LValue GlobLVal = CGF.EmitLValueForField(
2831 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2832 Address GlobAddr = GlobLVal.getAddress(CGF);
2833 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(),
2834 GlobAddr.getPointer(), Idxs);
2835 GlobLVal.setAddress(Address(BufferPtr,
2836 CGF.ConvertTypeForMem(Private->getType()),
2837 GlobAddr.getAlignment()));
2838 switch (CGF.getEvaluationKind(Private->getType())) {
2839 case TEK_Scalar: {
2840 llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
2841 CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
2842 LValueBaseInfo(AlignmentSource::Type),
2843 TBAAAccessInfo());
2844 break;
2845 }
2846 case TEK_Complex: {
2847 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
2848 CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2849 /*isInit=*/false);
2850 break;
2851 }
2852 case TEK_Aggregate:
2853 CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2854 GlobLVal, Private->getType(),
2855 AggValueSlot::DoesNotOverlap);
2856 break;
2857 }
2858 ++Idx;
2859 }
2860
2861 CGF.FinishFunction();
2862 return Fn;
2863}
2864
2865/// This function emits a helper that reduces all the reduction variables in
2866/// the provided global buffer into the thread-local reduction variables.
2867///
2868/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
2869/// void *GlobPtrs[];
2870/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2871/// ...
2872/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2873/// reduce_function(reduce_data, GlobPtrs);
2874static llvm::Value *emitGlobalToListReduceFunction(
2875 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2876 QualType ReductionArrayTy, SourceLocation Loc,
2877 const RecordDecl *TeamReductionRec,
2878 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2879 &VarFieldMap,
2880 llvm::Function *ReduceFn) {
2881 ASTContext &C = CGM.getContext();
2882
2883 // Buffer: global reduction buffer.
2884 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2885 C.VoidPtrTy, ImplicitParamDecl::Other);
2886 // Idx: index of the buffer.
2887 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2888 ImplicitParamDecl::Other);
2889 // ReduceList: thread local Reduce list.
2890 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2891 C.VoidPtrTy, ImplicitParamDecl::Other);
2892 FunctionArgList Args;
2893 Args.push_back(&BufferArg);
2894 Args.push_back(&IdxArg);
2895 Args.push_back(&ReduceListArg);
2896
2897 const CGFunctionInfo &CGFI =
2898 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2899 auto *Fn = llvm::Function::Create(
2900 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2901 "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
2902 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2903 Fn->setDoesNotRecurse();
2904 CodeGenFunction CGF(CGM);
2905 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2906
2907 CGBuilderTy &Bld = CGF.Builder;
2908
2909 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2910 QualType StaticTy = C.getRecordType(TeamReductionRec);
2911 llvm::Type *LLVMReductionsBufferTy =
2912 CGM.getTypes().ConvertTypeForMem(StaticTy);
2913 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2914 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2915 LLVMReductionsBufferTy->getPointerTo());
2916
2917 // 1. Build a list of reduction variables.
2918 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2919 Address ReductionList =
2920 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2921 auto IPriv = Privates.begin();
2922 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2923 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2924 /*Volatile=*/false, C.IntTy,
2925 Loc)};
2926 unsigned Idx = 0;
2927 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2928 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2929 // Global = Buffer.VD[Idx];
2930 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2931 const FieldDecl *FD = VarFieldMap.lookup(VD);
2932 LValue GlobLVal = CGF.EmitLValueForField(
2933 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2934 Address GlobAddr = GlobLVal.getAddress(CGF);
2935 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2936 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2937 llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
2938 CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
2939 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2940 // Store array size.
2941 ++Idx;
2942 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2943 llvm::Value *Size = CGF.Builder.CreateIntCast(
2944 CGF.getVLASize(
2945 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2946 .NumElts,
2947 CGF.SizeTy, /*isSigned=*/false);
2948 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2949 Elem);
2950 }
2951 }
2952
2953 // Call reduce_function(ReduceList, GlobalReduceList)
2954 llvm::Value *GlobalReduceList =
2955 CGF.EmitCastToVoidPtr(ReductionList.getPointer());
2956 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2957 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2958 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2959 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2960 CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
2961 CGF.FinishFunction();
2962 return Fn;
2963}
2964
2965///
2966/// Design of OpenMP reductions on the GPU
2967///
2968/// Consider a typical OpenMP program with one or more reduction
2969/// clauses:
2970///
2971/// float foo;
2972/// double bar;
2973/// #pragma omp target teams distribute parallel for \
2974/// reduction(+:foo) reduction(*:bar)
2975/// for (int i = 0; i < N; i++) {
2976/// foo += A[i]; bar *= B[i];
2977/// }
2978///
2979/// where 'foo' and 'bar' are reduced across all OpenMP threads in
2980/// all teams. In our OpenMP implementation on the NVPTX device an
2981/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
2982/// within a team are mapped to CUDA threads within a threadblock.
2983/// Our goal is to efficiently aggregate values across all OpenMP
2984/// threads such that:
2985///
2986/// - the compiler and runtime are logically concise, and
2987/// - the reduction is performed efficiently in a hierarchical
2988/// manner as follows: within OpenMP threads in the same warp,
2989/// across warps in a threadblock, and finally across teams on
2990/// the NVPTX device.
2991///
2992/// Introduction to Decoupling
2993///
2994/// We would like to decouple the compiler and the runtime so that the
2995/// latter is ignorant of the reduction variables (number, data types)
2996/// and the reduction operators. This allows a simpler interface
2997/// and implementation while still attaining good performance.
2998///
2999/// Pseudocode for the aforementioned OpenMP program generated by the
3000/// compiler is as follows:
3001///
3002/// 1. Create private copies of reduction variables on each OpenMP
3003/// thread: 'foo_private', 'bar_private'
3004/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
3005/// to it and writes the result in 'foo_private' and 'bar_private'
3006/// respectively.
3007/// 3. Call the OpenMP runtime on the GPU to reduce within a team
3008/// and store the result on the team master:
3009///
3010/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
3011/// reduceData, shuffleReduceFn, interWarpCpyFn)
3012///
3013/// where:
3014/// struct ReduceData {
3015/// float *foo;
3016/// double *bar;
3017/// } reduceData
3018/// reduceData.foo = &foo_private
3019/// reduceData.bar = &bar_private
3020///
3021/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
3022/// auxiliary functions generated by the compiler that operate on
3023/// variables of type 'ReduceData'. They help the runtime perform
3024/// algorithmic steps in a data-agnostic manner.
3025///
3026/// 'shuffleReduceFn' is a pointer to a function that reduces data
3027/// of type 'ReduceData' across two OpenMP threads (lanes) in the
3028/// same warp. It takes the following arguments as input:
3029///
3030/// a. variable of type 'ReduceData' on the calling lane,
3031/// b. its lane_id,
3032/// c. an offset relative to the current lane_id to generate a
3033/// remote_lane_id. The remote lane contains the second
3034/// variable of type 'ReduceData' that is to be reduced.
3035/// d. an algorithm version parameter determining which reduction
3036/// algorithm to use.
3037///
3038/// 'shuffleReduceFn' retrieves data from the remote lane using
3039/// efficient GPU shuffle intrinsics and reduces, using the
3040/// algorithm specified by the 4th parameter, the two operands
3041/// element-wise. The result is written to the first operand.
3042///
3043/// Different reduction algorithms are implemented in different
3044/// runtime functions, all calling 'shuffleReduceFn' to perform
3045/// the essential reduction step. Therefore, based on the 4th
3046/// parameter, this function behaves slightly differently to
3047/// cooperate with the runtime to ensure correctness under
3048/// different circumstances.
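As a rough sketch (argument names here are illustrative), the generated helper has the shape:

  void shuffleReduceFn(void *reduce_data, int16_t lane_id,
                       int16_t remote_lane_offset, int16_t algo_version);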
3049///
3050/// 'InterWarpCpyFn' is a pointer to a function that transfers
3051/// reduced variables across warps. It tunnels, through CUDA
3052/// shared memory, the thread-private data of type 'ReduceData'
3053/// from lane 0 of each warp to a lane in the first warp.
3054/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
3055/// The last team writes the global reduced value to memory.
3056///
3057/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
3058/// reduceData, shuffleReduceFn, interWarpCpyFn,
3059/// scratchpadCopyFn, loadAndReduceFn)
3060///
3061/// 'scratchpadCopyFn' is a helper that stores reduced
3062/// data from the team master to a scratchpad array in
3063/// global memory.
3064///
3065/// 'loadAndReduceFn' is a helper that loads data from
3066/// the scratchpad array and reduces it with the input
3067/// operand.
3068///
3069/// These compiler generated functions hide address
3070/// calculation and alignment information from the runtime.
3071/// 5. if ret == 1:
3072/// The team master of the last team stores the reduced
3073/// result to the globals in memory.
3074/// foo += reduceData.foo; bar *= reduceData.bar
3075///
3076///
3077/// Warp Reduction Algorithms
3078///
3079/// On the warp level, we have three algorithms implemented in the
3080/// OpenMP runtime depending on the number of active lanes:
3081///
3082/// Full Warp Reduction
3083///
3084/// The reduce algorithm within a warp where all lanes are active
3085/// is implemented in the runtime as follows:
3086///
3087/// full_warp_reduce(void *reduce_data,
3088/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3089/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
3090/// ShuffleReduceFn(reduce_data, 0, offset, 0);
3091/// }
3092///
3093/// The algorithm completes in log(2, WARPSIZE) steps.
3094///
3095/// 'ShuffleReduceFn' is invoked here with lane_id set to 0 because lane_id
3096/// is not used by this algorithm; we therefore save instructions by not
3097/// retrieving it from the corresponding special registers. The 4th parameter,
3098/// which represents the version of the algorithm being used, is set to 0 to
3099/// signify full warp reduction.
3100///
3101/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3102///
3103/// #reduce_elem refers to an element in the local lane's data structure
3104/// #remote_elem is retrieved from a remote lane
3105/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3106/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
3107///
3108/// Contiguous Partial Warp Reduction
3109///
3110/// This reduce algorithm is used within a warp where only the first
3111/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
3112/// number of OpenMP threads in a parallel region is not a multiple of
3113/// WARPSIZE. The algorithm is implemented in the runtime as follows:
3114///
3115/// void
3116/// contiguous_partial_reduce(void *reduce_data,
3117/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
3118/// int size, int lane_id) {
3119/// int curr_size;
3120/// int offset;
3121/// curr_size = size;
3122/// offset = curr_size/2;
3123/// while (offset>0) {
3124/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
3125/// curr_size = (curr_size+1)/2;
3126/// offset = curr_size/2;
3127/// }
3128/// }
3129///
3130/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3131///
3132/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3133/// if (lane_id < offset)
3134/// reduce_elem = reduce_elem REDUCE_OP remote_elem
3135/// else
3136/// reduce_elem = remote_elem
3137///
3138/// This algorithm assumes that the data to be reduced are located in a
3139/// contiguous subset of lanes starting from the first. When there is
3140/// an odd number of active lanes, the data in the last lane is not
3141/// aggregated with any other lane's data but is instead copied over.
3142///
3143/// Dispersed Partial Warp Reduction
3144///
3145/// This algorithm is used within a warp when any discontiguous subset of
3146/// lanes is active. It is used to implement the reduction operation
3147/// across lanes in an OpenMP simd region or in a nested parallel region.
3148///
3149/// void
3150/// dispersed_partial_reduce(void *reduce_data,
3151/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3152/// int size, remote_id;
3153/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
3154/// do {
3155/// remote_id = next_active_lane_id_right_after_me();
3156/// # the above function returns 0 if no active lane
3157/// # is present right after the current lane.
3158/// size = number_of_active_lanes_in_this_warp();
3159/// logical_lane_id /= 2;
3160/// ShuffleReduceFn(reduce_data, logical_lane_id,
3161/// remote_id-1-threadIdx.x, 2);
3162/// } while (logical_lane_id % 2 == 0 && size > 1);
3163/// }
3164///
3165/// There is no assumption made about the initial state of the reduction.
3166/// Any number of lanes (>=1) could be active at any position. The reduction
3167/// result is returned in the first active lane.
3168///
3169/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3170///
3171/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3172/// if (lane_id % 2 == 0 && offset > 0)
3173/// reduce_elem = reduce_elem REDUCE_OP remote_elem
3174/// else
3175/// reduce_elem = remote_elem
3176///
3177///
3178/// Intra-Team Reduction
3179///
3180/// This function, as implemented in the runtime call
3181/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
3182/// threads in a team. It first reduces within a warp using the
3183/// aforementioned algorithms. We then proceed to gather all such
3184/// reduced values at the first warp.
3185///
3186/// The runtime makes use of the function 'InterWarpCpyFn', which copies
3187/// data from each "warp master" (the zeroth lane of each warp, where
3188/// warp-reduced data is held) to the zeroth warp. This step reduces (in
3189/// a mathematical sense) the problem of reduction across warp masters in
3190/// a block to the problem of warp reduction.
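Schematically, assuming an illustrative warp_reduce helper for the per-warp step described above:

  warp_reduce(reduce_data, shuffleReduceFn);    // reduce within each warp
  InterWarpCpyFn(reduce_data, num_warps);       // warp masters -> warp 0
  if (warp_id == 0)
    warp_reduce(reduce_data, shuffleReduceFn);  // reduce the warp masters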
3191///
3192///
3193/// Inter-Team Reduction
3194///
3195/// Once a team has reduced its data to a single value, it is stored in
3196/// a global scratchpad array. Since each team has a distinct slot, this
3197/// can be done without locking.
3198///
3199/// The last team to write to the scratchpad array proceeds to reduce the
3200/// scratchpad array. One or more workers in the last team use the helper
3201/// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
3202/// the k'th worker reduces every k'th element.
3203///
3204/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
3205/// reduce across workers and compute a globally reduced value.
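Schematically, with illustrative names for the bookkeeping the runtime performs:

  scratchpad[team_id] = reduce_data;                 // one slot per team
  if (last_team_to_arrive()) {
    for (i = worker_id; i < num_teams; i += num_workers)
      loadAndReduceFn(reduce_data, scratchpad, i);   // k'th worker, every k'th slot
    __kmpc_nvptx_parallel_reduce_nowait_v2(...);     // reduce across workers
  }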
3206///
3207void CGOpenMPRuntimeGPU::emitReduction(
3208 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
3209 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
3210 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
3211 if (!CGF.HaveInsertPoint())
1
Taking false branch
3212 return;
3213
3214 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
3215#ifndef NDEBUG
3216 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
3217#endif
3218
3219 if (Options.SimpleReduction) {
2
Assuming field 'SimpleReduction' is false
3220 assert(!TeamsReduction && !ParallelReduction &&
3221        "Invalid reduction selection in emitReduction.");
3222 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
3223 ReductionOps, Options);
3224 return;
3225 }
3226
3227 assert((TeamsReduction || ParallelReduction) &&
3
Taking false branch
4
Assuming 'TeamsReduction' is false
5
Assuming 'ParallelReduction' is true
6
'?' condition is true
3228        "Invalid reduction selection in emitReduction.");
3229
3230 // Build res = __kmpc_nvptx_parallel_reduce_nowait_v2(<loc>, <gtid>, <n>,
3231 // sizeof(RedList), RedList, shuffle_reduce_func, interwarp_copy_func);
3232 // or, for teams reductions,
3233 // res = __kmpc_nvptx_teams_reduce_nowait_v2(<loc>, <gtid>, <buffer>, ...);
3234 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
3235 llvm::Value *ThreadId = getThreadID(CGF, Loc);
3236
3237 llvm::Value *Res;
3238 ASTContext &C = CGM.getContext();
3239 // 1. Build a list of reduction variables.
3240 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3241 auto Size = RHSExprs.size();
3242 for (const Expr *E : Privates) {
7
Assuming '__begin1' is equal to '__end1'
3243 if (E->getType()->isVariablyModifiedType())
3244 // Reserve place for array size.
3245 ++Size;
3246 }
3247 llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
3248 QualType ReductionArrayTy =
3249 C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
3250 /*IndexTypeQuals=*/0);
3251 Address ReductionList =
3252 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3253 auto IPriv = Privates.begin();
3254 unsigned Idx = 0;
3255 for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
8
Assuming 'I' is >= 'E'
9
Loop condition is false. Execution continues on line 3275
3256 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3257 CGF.Builder.CreateStore(
3258 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3259 CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
3260 Elem);
3261 if ((*IPriv)->getType()->isVariablyModifiedType()) {
3262 // Store array size.
3263 ++Idx;
3264 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
3265 llvm::Value *Size = CGF.Builder.CreateIntCast(
3266 CGF.getVLASize(
3267 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
3268 .NumElts,
3269 CGF.SizeTy, /*isSigned=*/false);
3270 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3271 Elem);
3272 }
3273 }
3274
3275 llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3276 ReductionList.getPointer(), CGF.VoidPtrTy);
3277 llvm::Function *ReductionFn =
3278 emitReductionFunction(Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
3279 Privates, LHSExprs, RHSExprs, ReductionOps);
3280 llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
3281 llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
3282 CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
3283 llvm::Value *InterWarpCopyFn =
3284 emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
10
Calling 'emitInterWarpCopyFunction'
3285
3286 if (ParallelReduction) {
3287 llvm::Value *Args[] = {RTLoc,
3288 ThreadId,
3289 CGF.Builder.getInt32(RHSExprs.size()),
3290 ReductionArrayTySize,
3291 RL,
3292 ShuffleAndReduceFn,
3293 InterWarpCopyFn};
3294
3295 Res = CGF.EmitRuntimeCall(
3296 OMPBuilder.getOrCreateRuntimeFunction(
3297 CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
3298 Args);
3299 } else {
3300 assert(TeamsReduction && "expected teams reduction.");
3301 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
3302 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
3303 int Cnt = 0;
3304 for (const Expr *DRE : Privates) {
3305 PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
3306 ++Cnt;
3307 }
3308 const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
3309 CGM.getContext(), PrivatesReductions, llvm::None, VarFieldMap,
3310 C.getLangOpts().OpenMPCUDAReductionBufNum);
3311 TeamsReductions.push_back(TeamReductionRec);
3312 if (!KernelTeamsReductionPtr) {
3313 KernelTeamsReductionPtr = new llvm::GlobalVariable(
3314 CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
3315 llvm::GlobalValue::InternalLinkage, nullptr,
3316 "_openmp_teams_reductions_buffer_$_$ptr");
3317 }
3318 llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
3319 Address(KernelTeamsReductionPtr, CGF.VoidPtrTy, CGM.getPointerAlign()),
3320 /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
3321 llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
3322 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3323 llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
3324 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3325 ReductionFn);
3326 llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
3327 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
3328 llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
3329 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
3330 ReductionFn);
3331
3332 llvm::Value *Args[] = {
3333 RTLoc,
3334 ThreadId,
3335 GlobalBufferPtr,
3336 CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
3337 RL,
3338 ShuffleAndReduceFn,
3339 InterWarpCopyFn,
3340 GlobalToBufferCpyFn,
3341 GlobalToBufferRedFn,
3342 BufferToGlobalCpyFn,
3343 BufferToGlobalRedFn};
3344
3345 Res = CGF.EmitRuntimeCall(
3346 OMPBuilder.getOrCreateRuntimeFunction(
3347 CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
3348 Args);
3349 }
3350
3351 // 5. Build if (res == 1)
3352 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
3353 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
3354 llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
3355 Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
3356 CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3357
3358 // 6. Build then branch: where we have reduced values in the master
3359 // thread in each team.
3360 // __kmpc_end_reduce{_nowait}(<gtid>);
3361 // break;
3362 CGF.EmitBlock(ThenBB);
3363
3364 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3365 auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
3366 this](CodeGenFunction &CGF, PrePostActionTy &Action) {
3367 auto IPriv = Privates.begin();
3368 auto ILHS = LHSExprs.begin();
3369 auto IRHS = RHSExprs.begin();
3370 for (const Expr *E : ReductionOps) {
3371 emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
3372 cast<DeclRefExpr>(*IRHS));
3373 ++IPriv;
3374 ++ILHS;
3375 ++IRHS;
3376 }
3377 };
3378 llvm::Value *EndArgs[] = {ThreadId};
3379 RegionCodeGenTy RCG(CodeGen);
3380 NVPTXActionTy Action(
3381 nullptr, llvm::None,
3382 OMPBuilder.getOrCreateRuntimeFunction(
3383 CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
3384 EndArgs);
3385 RCG.setAction(Action);
3386 RCG(CGF);
3387 // There is no need to emit line number for unconditional branch.
3388 (void)ApplyDebugLocation::CreateEmpty(CGF);
3389 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
3390}
3391
3392const VarDecl *
3393CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
3394 const VarDecl *NativeParam) const {
3395 if (!NativeParam->getType()->isReferenceType())
3396 return NativeParam;
3397 QualType ArgType = NativeParam->getType();
3398 QualifierCollector QC;
3399 const Type *NonQualTy = QC.strip(ArgType);
3400 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3401 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
3402 if (Attr->getCaptureKind() == OMPC_map) {
3403 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3404 LangAS::opencl_global);
3405 }
3406 }
3407 ArgType = CGM.getContext().getPointerType(PointeeTy);
3408 QC.addRestrict();
3409 enum { NVPTX_local_addr = 5 };
3410 QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
3411 ArgType = QC.apply(CGM.getContext(), ArgType);
3412 if (isa<ImplicitParamDecl>(NativeParam))
3413 return ImplicitParamDecl::Create(
3414 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3415 NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
3416 return ParmVarDecl::Create(
3417 CGM.getContext(),
3418 const_cast<DeclContext *>(NativeParam->getDeclContext()),
3419 NativeParam->getBeginLoc(), NativeParam->getLocation(),
3420 NativeParam->getIdentifier(), ArgType,
3421 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3422}
3423
3424Address
3425CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
3426 const VarDecl *NativeParam,
3427 const VarDecl *TargetParam) const {
3428 assert(NativeParam != TargetParam &&
3429        NativeParam->getType()->isReferenceType() &&
3430        "Native arg must not be the same as target arg.");
3431 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
3432 QualType NativeParamType = NativeParam->getType();
3433 QualifierCollector QC;
3434 const Type *NonQualTy = QC.strip(NativeParamType);
3435 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3436 unsigned NativePointeeAddrSpace =
3437 CGF.getContext().getTargetAddressSpace(NativePointeeTy);
3438 QualType TargetTy = TargetParam->getType();
3439 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
3440 LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
3441 // First cast to generic.
3442 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3443 TargetAddr, llvm::PointerType::getWithSamePointeeType(
3444 cast<llvm::PointerType>(TargetAddr->getType()), /*AddrSpace=*/0));
3445 // Cast from generic to native address space.
3446 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3447 TargetAddr, llvm::PointerType::getWithSamePointeeType(
3448 cast<llvm::PointerType>(TargetAddr->getType()),
3449 NativePointeeAddrSpace));
3450 Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3451 CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
3452 NativeParamType);
3453 return NativeParamAddr;
3454}
3455
3456void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3457 CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3458 ArrayRef<llvm::Value *> Args) const {
3459 SmallVector<llvm::Value *, 4> TargetArgs;
3460 TargetArgs.reserve(Args.size());
3461 auto *FnType = OutlinedFn.getFunctionType();
3462 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3463 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3464 TargetArgs.append(std::next(Args.begin(), I), Args.end());
3465 break;
3466 }
3467 llvm::Type *TargetType = FnType->getParamType(I);
3468 llvm::Value *NativeArg = Args[I];
3469 if (!TargetType->isPointerTy()) {
3470 TargetArgs.emplace_back(NativeArg);
3471 continue;
3472 }
3473 llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3474 NativeArg, llvm::PointerType::getWithSamePointeeType(
3475 cast<llvm::PointerType>(NativeArg->getType()), /*AddrSpace*/ 0));
3476 TargetArgs.emplace_back(
3477 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
3478 }
3479 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
3480}
3481
3482/// Emit function which wraps the outline parallel region
3483/// and controls the arguments which are passed to this function.
3484/// The wrapper ensures that the outlined function is called
3485/// with the correct arguments when data is shared.
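Roughly, the generated wrapper has the following shape (a sketch; names are illustrative):

  void <outlined>_wrapper(uint16_t parallel_level, uint32_t tid) {
    int32_t zero = 0;
    void **shared_args;
    __kmpc_get_shared_variables(&shared_args);
    <outlined>(&tid, &zero, shared_args[0], ..., shared_args[N-1]);
  }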
3486llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3487 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
3488 ASTContext &Ctx = CGM.getContext();
3489 const auto &CS = *D.getCapturedStmt(OMPD_parallel);
3490
3491 // Create a function that takes as argument the source thread.
3492 FunctionArgList WrapperArgs;
3493 QualType Int16QTy =
3494 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
3495 QualType Int32QTy =
3496 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
3497 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3498 /*Id=*/nullptr, Int16QTy,
3499 ImplicitParamDecl::Other);
3500 ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3501 /*Id=*/nullptr, Int32QTy,
3502 ImplicitParamDecl::Other);
3503 WrapperArgs.emplace_back(&ParallelLevelArg);
3504 WrapperArgs.emplace_back(&WrapperArg);
3505
3506 const CGFunctionInfo &CGFI =
3507 CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
3508
3509 auto *Fn = llvm::Function::Create(
3510 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3511 Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
3512
3513 // Ensure we do not inline the function. This is trivially true for the ones
3514 // passed to __kmpc_fork_call, but the ones called in serialized regions
3515 // could be inlined. This is not perfect, but it is closer to the invariant
3516 // we want, namely, every data environment starts with a new function.
3517 // TODO: We should pass the if condition to the runtime function and do the
3518 // handling there. Much cleaner code.
3519 Fn->addFnAttr(llvm::Attribute::NoInline);
3520
3521 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3522 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
3523 Fn->setDoesNotRecurse();
3524
3525 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
3526 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
3527 D.getBeginLoc(), D.getBeginLoc());
3528
3529 const auto *RD = CS.getCapturedRecordDecl();
3530 auto CurField = RD->field_begin();
3531
3532 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
3533 /*Name=*/".zero.addr");
3534 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
3535 // Get the array of arguments.
3536 SmallVector<llvm::Value *, 8> Args;
3537
3538 Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
3539 Args.emplace_back(ZeroAddr.getPointer());
3540
3541 CGBuilderTy &Bld = CGF.Builder;
3542 auto CI = CS.capture_begin();
3543
3544 // Use global memory for data sharing.
3545 // Handle passing of global args to workers.
3546 Address GlobalArgs =
3547 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
3548 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
3549 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3550 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3551 CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
3552 DataSharingArgs);
3553
3554 // Retrieve the shared variables from the list of references returned
3555 // by the runtime. Pass the variables to the outlined function.
3556 Address SharedArgListAddress = Address::invalid();
3557 if (CS.capture_size() > 0 ||
3558 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3559 SharedArgListAddress = CGF.EmitLoadOfPointer(
3560 GlobalArgs, CGF.getContext()
3561 .getPointerType(CGF.getContext().VoidPtrTy)
3562 .castAs<PointerType>());
3563 }
3564 unsigned Idx = 0;
3565 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3566 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3567 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3568 Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3569 llvm::Value *LB = CGF.EmitLoadOfScalar(
3570 TypedAddress,
3571 /*Volatile=*/false,
3572 CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
3573 cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
3574 Args.emplace_back(LB);
3575 ++Idx;
3576 Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3577 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3578 Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3579 llvm::Value *UB = CGF.EmitLoadOfScalar(
3580 TypedAddress,
3581 /*Volatile=*/false,
3582 CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
3583 cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
3584 Args.emplace_back(UB);
3585 ++Idx;
3586 }
3587 if (CS.capture_size() > 0) {
3588 ASTContext &CGFContext = CGF.getContext();
3589 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3590 QualType ElemTy = CurField->getType();
3591 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
3592 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3593 Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
3594 CGF.ConvertTypeForMem(ElemTy));
3595 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3596 /*Volatile=*/false,
3597 CGFContext.getPointerType(ElemTy),
3598 CI->getLocation());
3599 if (CI->capturesVariableByCopy() &&
3600 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
3601 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3602 CI->getLocation());
3603 }
3604 Args.emplace_back(Arg);
3605 }
3606 }
3607
3608 emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
3609 CGF.FinishFunction();
3610 return Fn;
3611}
3612
3613void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
3614 const Decl *D) {
3615 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3616 return;
3617
3618 assert(D && "Expected function or captured|block decl.");
3619 assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
3620        "Function is registered already.");
3621 assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
3622        "Team is set but not processed.");
3623 const Stmt *Body = nullptr;
3624 bool NeedToDelayGlobalization = false;
3625 if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
3626 Body = FD->getBody();
3627 } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
3628 Body = BD->getBody();
3629 } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
3630 Body = CD->getBody();
3631 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
3632 if (NeedToDelayGlobalization &&
3633 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
3634 return;
3635 }
3636 if (!Body)
3637 return;
3638 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
3639 VarChecker.Visit(Body);
3640 const RecordDecl *GlobalizedVarsRecord =
3641 VarChecker.getGlobalizedRecord(IsInTTDRegion);
3642 TeamAndReductions.first = nullptr;
3643 TeamAndReductions.second.clear();
3644 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
3645 VarChecker.getEscapedVariableLengthDecls();
3646 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
3647 return;
3648 auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
3649 I->getSecond().MappedParams =
3650 std::make_unique<CodeGenFunction::OMPMapVars>();
3651 I->getSecond().EscapedParameters.insert(
3652 VarChecker.getEscapedParameters().begin(),
3653 VarChecker.getEscapedParameters().end());
3654 I->getSecond().EscapedVariableLengthDecls.append(
3655 EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
3656 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3657 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3658 assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3659 Data.insert(std::make_pair(VD, MappedVarData()));
3660 }
3661 if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
3662 CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
3663 VarChecker.Visit(Body);
3664 I->getSecond().SecondaryLocalVarData.emplace();
3665 DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
3666 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3667 assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3668 Data.insert(std::make_pair(VD, MappedVarData()));
3669 }
3670 }
3671 if (!NeedToDelayGlobalization) {
3672 emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
3673 struct GlobalizationScope final : EHScopeStack::Cleanup {
3674 GlobalizationScope() = default;
3675
3676 void Emit(CodeGenFunction &CGF, Flags flags) override {
3677 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
3678 .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
3679 }
3680 };
3681 CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
3682 }
3683}
3684
3685Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
3686 const VarDecl *VD) {
3687 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
3688 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3689 auto AS = LangAS::Default;
3690 switch (A->getAllocatorType()) {
3691 // Use the default allocator here as by default local vars are
3692 // threadlocal.
3693 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3694 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3695 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3696 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3697 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3698 // Follow the user decision - use default allocation.
3699 return Address::invalid();
3700 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3701 // TODO: implement support for user-defined allocators.
3702 return Address::invalid();
3703 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3704 AS = LangAS::cuda_constant;
3705 break;
3706 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3707 AS = LangAS::cuda_shared;
3708 break;
3709 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3710 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3711 break;
3712 }
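For example, a declaration such as

  int x;
  #pragma omp allocate(x) allocator(omp_pteam_mem_alloc)

hits the OMPPTeamMemAlloc case above and is emitted as an internal global in CUDA shared memory; omp_const_mem_alloc likewise selects CUDA constant memory, while omp_cgroup_mem_alloc and omp_large_cap_mem_alloc keep the default address space.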
3713 llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
3714 auto *GV = new llvm::GlobalVariable(
3715 CGM.getModule(), VarTy, /*isConstant=*/false,
3716 llvm::GlobalValue::InternalLinkage, llvm::Constant::getNullValue(VarTy),
3717 VD->getName(),
3718 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
3719 CGM.getContext().getTargetAddressSpace(AS));
3720 CharUnits Align = CGM.getContext().getDeclAlign(VD);
3721 GV->setAlignment(Align.getAsAlign());
3722 return Address(
3723 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3724 GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
3725 VD->getType().getAddressSpace()))),
3726 VarTy, Align);
3727 }
3728
3729 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3730 return Address::invalid();
3731
3732 VD = VD->getCanonicalDecl();
3733 auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
3734 if (I == FunctionGlobalizedDecls.end())
3735 return Address::invalid();
3736 auto VDI = I->getSecond().LocalVarData.find(VD);
3737 if (VDI != I->getSecond().LocalVarData.end())
3738 return VDI->second.PrivateAddr;
3739 if (VD->hasAttrs()) {
3740 for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
3741 E(VD->attr_end());
3742 IT != E; ++IT) {
3743 auto VDI = I->getSecond().LocalVarData.find(
3744 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3745 ->getCanonicalDecl());
3746 if (VDI != I->getSecond().LocalVarData.end())
3747 return VDI->second.PrivateAddr;
3748 }
3749 }
3750
3751 return Address::invalid();
3752}
3753
3754void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
3755 FunctionGlobalizedDecls.erase(CGF.CurFn);
3756 CGOpenMPRuntime::functionFinished(CGF);
3757}
3758
3759void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
3760 CodeGenFunction &CGF, const OMPLoopDirective &S,
3761 OpenMPDistScheduleClauseKind &ScheduleKind,
3762 llvm::Value *&Chunk) const {
3763 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
3764 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
3765 ScheduleKind = OMPC_DIST_SCHEDULE_static;
3766 Chunk = CGF.EmitScalarConversion(
3767 RT.getGPUNumThreads(CGF),
3768 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3769 S.getIterationVariable()->getType(), S.getBeginLoc());
3770 return;
3771 }
3772 CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
3773 CGF, S, ScheduleKind, Chunk);
3774}
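// [Editor's note] Illustrative sketch, not part of the analyzed file: in SPMD
// mode the code above defaults dist_schedule to static with a chunk equal to
// the number of hardware threads in the block. A directive of this shape is
// typically lowered in SPMD mode and would take that path (loop body is
// hypothetical):
//
//   #pragma omp target teams distribute parallel for
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + c[i];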
3775
3776void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
3777 CodeGenFunction &CGF, const OMPLoopDirective &S,
3778 OpenMPScheduleClauseKind &ScheduleKind,
3779 const Expr *&ChunkExpr) const {
3780 ScheduleKind = OMPC_SCHEDULE_static;
3781 // Chunk size is 1 in this case.
3782 llvm::APInt ChunkSize(32, 1);
3783 ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
3784 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3785 SourceLocation());
3786}
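// [Editor's note] The device-side default worksharing schedule chosen above is
// static with a literal chunk of 1, i.e. a cyclic distribution of iterations
// across threads, which generally favors coalesced memory accesses on GPUs.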
3787
3788void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
3789 CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
3790  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
3791         " Expected target-based directive.");
3792 const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
3793 for (const CapturedStmt::Capture &C : CS->captures()) {
3794 // Capture variables captured by reference in lambdas for target-based
3795 // directives.
3796 if (!C.capturesVariable())
3797 continue;
3798 const VarDecl *VD = C.getCapturedVar();
3799 const auto *RD = VD->getType()
3800 .getCanonicalType()
3801 .getNonReferenceType()
3802 ->getAsCXXRecordDecl();
3803 if (!RD || !RD->isLambda())
3804 continue;
3805 Address VDAddr = CGF.GetAddrOfLocalVar(VD);
3806 LValue VDLVal;
3807 if (VD->getType().getCanonicalType()->isReferenceType())
3808 VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
3809 else
3810 VDLVal = CGF.MakeAddrLValue(
3811 VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
3812 llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
3813 FieldDecl *ThisCapture = nullptr;
3814 RD->getCaptureFields(Captures, ThisCapture);
3815 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
3816 LValue ThisLVal =
3817 CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
3818 llvm::Value *CXXThis = CGF.LoadCXXThis();
3819 CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
3820 }
3821 for (const LambdaCapture &LC : RD->captures()) {
3822 if (LC.getCaptureKind() != LCK_ByRef)
3823 continue;
3824 const VarDecl *VD = LC.getCapturedVar();
3825 if (!CS->capturesVariable(VD))
3826 continue;
3827 auto It = Captures.find(VD);
3828      assert(It != Captures.end() && "Found lambda capture without field.");
3829 LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
3830 Address VDAddr = CGF.GetAddrOfLocalVar(VD);
3831 if (VD->getType().getCanonicalType()->isReferenceType())
3832 VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
3833 VD->getType().getCanonicalType())
3834 .getAddress(CGF);
3835 CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
3836 }
3837 }
3838}
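// [Editor's note] Illustrative sketch, not part of the analyzed file: the loop
// above rewrites by-reference lambda captures (and a captured `this`) so they
// point at device-side storage inside the target region. A hypothetical source
// pattern it is meant to handle:
//
//   void run(int *v, int n) {
//     auto body = [&v, n](int i) { v[i] = i; };
//   #pragma omp target parallel for
//     for (int i = 0; i < n; ++i)
//       body(i);
//   }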
3839
3840bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
3841 LangAS &AS) {
3842 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
3843 return false;
3844 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3845 switch(A->getAllocatorType()) {
3846 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3847 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3848    // Not supported, fall back to the default mem space.
3849 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3850 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3851 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3852 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3853 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3854 AS = LangAS::Default;
3855 return true;
3856 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3857 AS = LangAS::cuda_constant;
3858 return true;
3859 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3860 AS = LangAS::cuda_shared;
3861 return true;
3862 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3863    llvm_unreachable("Expected predefined allocator for the variables with the "
3864                     "static storage.");
3865 }
3866 return false;
3867}
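// [Editor's note] Illustrative sketch, not part of the analyzed file: this
// helper performs the same allocator-to-address-space mapping as
// getAddressOfLocalVariable, but for variables with static storage, e.g.
// (names hypothetical):
//
//   int lookup_table[256];
//   #pragma omp allocate(lookup_table) allocator(omp_const_mem_alloc)
//   // omp_const_mem_alloc is expected to yield LangAS::cuda_constant here.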
3868
3869// Get the current CudaArch, ignoring any unknown values.
3870static CudaArch getCudaArch(CodeGenModule &CGM) {
3871 if (!CGM.getTarget().hasFeature("ptx"))
3872 return CudaArch::UNKNOWN;
3873 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
3874 if (Feature.getValue()) {
3875 CudaArch Arch = StringToCudaArch(Feature.getKey());
3876 if (Arch != CudaArch::UNKNOWN)
3877 return Arch;
3878 }
3879 }
3880 return CudaArch::UNKNOWN;
3881}
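// [Editor's note] The loop above scans the enabled target features for a name
// that StringToCudaArch recognizes; for an NVPTX offload build targeting, say,
// sm_70 (architecture chosen purely for illustration), the feature map is
// expected to contain "sm_70" and the helper returns CudaArch::SM_70.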
3882
3883/// Check whether the target architecture supports the unified addressing
3884/// required by the OpenMP "requires unified_shared_memory" clause.
3885void CGOpenMPRuntimeGPU::processRequiresDirective(
3886 const OMPRequiresDecl *D) {
3887 for (const OMPClause *Clause : D->clauselists()) {
3888 if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
3889 CudaArch Arch = getCudaArch(CGM);
3890 switch (Arch) {
3891 case CudaArch::SM_20:
3892 case CudaArch::SM_21:
3893 case CudaArch::SM_30:
3894 case CudaArch::SM_32:
3895 case CudaArch::SM_35:
3896 case CudaArch::SM_37:
3897 case CudaArch::SM_50:
3898 case CudaArch::SM_52:
3899 case CudaArch::SM_53: {
3900 SmallString<256> Buffer;
3901 llvm::raw_svector_ostream Out(Buffer);
3902 Out << "Target architecture " << CudaArchToString(Arch)
3903 << " does not support unified addressing";
3904 CGM.Error(Clause->getBeginLoc(), Out.str());
3905 return;
3906 }
3907 case CudaArch::SM_60:
3908 case CudaArch::SM_61:
3909 case CudaArch::SM_62:
3910 case CudaArch::SM_70:
3911 case CudaArch::SM_72:
3912 case CudaArch::SM_75:
3913 case CudaArch::SM_80:
3914 case CudaArch::SM_86:
3915 case CudaArch::GFX600:
3916 case CudaArch::GFX601:
3917 case CudaArch::GFX602:
3918 case CudaArch::GFX700:
3919 case CudaArch::GFX701:
3920 case CudaArch::GFX702:
3921 case CudaArch::GFX703:
3922 case CudaArch::GFX704:
3923 case CudaArch::GFX705:
3924 case CudaArch::GFX801:
3925 case CudaArch::GFX802:
3926 case CudaArch::GFX803:
3927 case CudaArch::GFX805:
3928 case CudaArch::GFX810:
3929 case CudaArch::GFX900:
3930 case CudaArch::GFX902:
3931 case CudaArch::GFX904:
3932 case CudaArch::GFX906:
3933 case CudaArch::GFX908:
3934 case CudaArch::GFX909:
3935 case CudaArch::GFX90a:
3936 case CudaArch::GFX90c:
3937 case CudaArch::GFX940:
3938 case CudaArch::GFX1010:
3939 case CudaArch::GFX1011:
3940 case CudaArch::GFX1012:
3941 case CudaArch::GFX1013:
3942 case CudaArch::GFX1030:
3943 case CudaArch::GFX1031:
3944 case CudaArch::GFX1032:
3945 case CudaArch::GFX1033:
3946 case CudaArch::GFX1034:
3947 case CudaArch::GFX1035:
3948 case CudaArch::GFX1036:
3949 case CudaArch::Generic:
3950 case CudaArch::UNUSED:
3951 case CudaArch::UNKNOWN:
3952 break;
3953 case CudaArch::LAST:
3954        llvm_unreachable("Unexpected Cuda arch.");
3955 }
3956 }
3957 }
3958 CGOpenMPRuntime::processRequiresDirective(D);
3959}
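// [Editor's note] Illustrative sketch, not part of the analyzed file: the
// diagnostic above is emitted for a translation unit containing
//
//   #pragma omp requires unified_shared_memory
//
// when the offload target is one of the pre-sm_60 architectures listed in the
// first group of cases.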
3960
3961void CGOpenMPRuntimeGPU::clear() {
3962
3963 if (!TeamsReductions.empty()) {
3964 ASTContext &C = CGM.getContext();
3965 RecordDecl *StaticRD = C.buildImplicitRecord(
3966 "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
3967 StaticRD->startDefinition();
3968 for (const RecordDecl *TeamReductionRec : TeamsReductions) {
3969 QualType RecTy = C.getRecordType(TeamReductionRec);
3970 auto *Field = FieldDecl::Create(
3971 C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
3972 C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
3973 /*BW=*/nullptr, /*Mutable=*/false,
3974 /*InitStyle=*/ICIS_NoInit);
3975 Field->setAccess(AS_public);
3976 StaticRD->addDecl(Field);
3977 }
3978 StaticRD->completeDefinition();
3979 QualType StaticTy = C.getRecordType(StaticRD);
3980 llvm::Type *LLVMReductionsBufferTy =
3981 CGM.getTypes().ConvertTypeForMem(StaticTy);
3982    // FIXME: nvlink does not handle weak linkage correctly (objects with
3983    // different sizes are reported as erroneous).
3984 // Restore CommonLinkage as soon as nvlink is fixed.
3985 auto *GV = new llvm::GlobalVariable(
3986 CGM.getModule(), LLVMReductionsBufferTy,
3987 /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
3988 llvm::Constant::getNullValue(LLVMReductionsBufferTy),
3989 "_openmp_teams_reductions_buffer_$_");
3990 KernelTeamsReductionPtr->setInitializer(
3991 llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
3992 CGM.VoidPtrTy));
3993 }
3994 CGOpenMPRuntime::clear();
3995}
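// [Editor's note] Conceptually, the globals emitted above amount to a union of
// all recorded team-reduction record types plus a generic pointer to it; a
// rough C-level sketch (member names and types hypothetical, global names
// taken from the strings above):
//
//   union _openmp_teams_reduction_type_$_ {
//     struct reduction_rec_0 r0;  // one member per entry in TeamsReductions
//     struct reduction_rec_1 r1;
//   } _openmp_teams_reductions_buffer_$_;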
3996
3997llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
3998 CGBuilderTy &Bld = CGF.Builder;
3999 llvm::Module *M = &CGF.CGM.getModule();
4000 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
4001 llvm::Function *F = M->getFunction(LocSize);
4002 if (!F) {
4003 F = llvm::Function::Create(
4004 llvm::FunctionType::get(CGF.Int32Ty, llvm::None, false),
4005 llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
4006 }
4007 return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
4008}
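// [Editor's note] getGPUNumThreads above creates the declaration of
// __kmpc_get_hardware_num_threads_in_block by hand when the module does not
// already have it, whereas the thread-id and warp-size helpers below obtain
// their runtime functions through OMPBuilder.getOrCreateRuntimeFunction.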
4009
4010llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
4011 ArrayRef<llvm::Value *> Args{};
4012 return CGF.EmitRuntimeCall(
4013 OMPBuilder.getOrCreateRuntimeFunction(
4014 CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
4015 Args);
4016}
4017
4018llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
4019 ArrayRef<llvm::Value *> Args{};
4020 return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
4021 CGM.getModule(), OMPRTL___kmpc_get_warp_size),
4022 Args);
4023}