Bug Summary

File: build/source/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
Warning: line 531, column 29
The result of the right shift is undefined due to shifting by '33', which is greater or equal to the width of type 'unsigned int'

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name CGOpenMPRuntimeGPU.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -relaxed-aliasing -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm -resource-dir /usr/lib/llvm-17/lib/clang/17 -I tools/clang/lib/CodeGen -I /build/source/clang/lib/CodeGen -I /build/source/clang/include -I tools/clang/include -I include -I /build/source/llvm/include -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm=build-llvm -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm=build-llvm -fcoverage-prefix-map=/build/source/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-05-10-133810-16478-1 -x c++ /build/source/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
1//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This provides a generalized class for OpenMP runtime code generation
10// specialized by GPU targets NVPTX and AMDGCN.
11//
12//===----------------------------------------------------------------------===//
13
14#include "CGOpenMPRuntimeGPU.h"
15#include "CodeGenFunction.h"
16#include "clang/AST/Attr.h"
17#include "clang/AST/DeclOpenMP.h"
18#include "clang/AST/OpenMPClause.h"
19#include "clang/AST/StmtOpenMP.h"
20#include "clang/AST/StmtVisitor.h"
21#include "clang/Basic/Cuda.h"
22#include "llvm/ADT/SmallPtrSet.h"
23#include "llvm/Frontend/OpenMP/OMPGridValues.h"
24#include "llvm/Support/MathExtras.h"
25
26using namespace clang;
27using namespace CodeGen;
28using namespace llvm::omp;
29
30namespace {
31/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
32class NVPTXActionTy final : public PrePostActionTy {
33 llvm::FunctionCallee EnterCallee = nullptr;
34 ArrayRef<llvm::Value *> EnterArgs;
35 llvm::FunctionCallee ExitCallee = nullptr;
36 ArrayRef<llvm::Value *> ExitArgs;
37 bool Conditional = false;
38 llvm::BasicBlock *ContBlock = nullptr;
39
40public:
41 NVPTXActionTy(llvm::FunctionCallee EnterCallee,
42 ArrayRef<llvm::Value *> EnterArgs,
43 llvm::FunctionCallee ExitCallee,
44 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
45 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
46 ExitArgs(ExitArgs), Conditional(Conditional) {}
47 void Enter(CodeGenFunction &CGF) override {
48 llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
49 if (Conditional) {
50 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
51 auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
52 ContBlock = CGF.createBasicBlock("omp_if.end");
53 // Generate the branch (If-stmt)
54 CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
55 CGF.EmitBlock(ThenBlock);
56 }
57 }
58 void Done(CodeGenFunction &CGF) {
59 // Emit the rest of blocks/branches
60 CGF.EmitBranch(ContBlock);
61 CGF.EmitBlock(ContBlock, true);
62 }
63 void Exit(CodeGenFunction &CGF) override {
64 CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
65 }
66};
67
68/// A class to track the execution mode when codegening directives within
69/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
70/// to the target region and used by containing directives such as 'parallel'
71/// to emit optimized code.
72class ExecutionRuntimeModesRAII {
73private:
74 CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
75 CGOpenMPRuntimeGPU::EM_Unknown;
76 CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
77
78public:
79 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
80 CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
81 : ExecMode(ExecMode) {
82 SavedExecMode = ExecMode;
83 ExecMode = EntryMode;
84 }
85 ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
86};
87
88/// GPU Configuration: This information can be derived from cuda registers,
89/// however, providing compile time constants helps generate more efficient
90/// code. For all practical purposes this is fine because the configuration
91/// is the same for all known NVPTX architectures.
92enum MachineConfiguration : unsigned {
93 /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
94 /// specific Grid Values like GV_Warp_Size, GV_Slot_Size
95
96 /// Global memory alignment for performance.
97 GlobalMemoryAlignment = 128,
98};
99
100static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
101 RefExpr = RefExpr->IgnoreParens();
102 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
103 const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
104 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
105 Base = TempASE->getBase()->IgnoreParenImpCasts();
106 RefExpr = Base;
107 } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
108 const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
109 while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
110 Base = TempOASE->getBase()->IgnoreParenImpCasts();
111 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
112 Base = TempASE->getBase()->IgnoreParenImpCasts();
113 RefExpr = Base;
114 }
115 RefExpr = RefExpr->IgnoreParenImpCasts();
116 if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
117 return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
118 const auto *ME = cast<MemberExpr>(RefExpr);
119 return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
120}
121
122
123static RecordDecl *buildRecordForGlobalizedVars(
124 ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
125 ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
126 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
127 &MappedDeclsFields, int BufSize) {
128 using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
129 if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
130 return nullptr;
131 SmallVector<VarsDataTy, 4> GlobalizedVars;
132 for (const ValueDecl *D : EscapedDecls)
133 GlobalizedVars.emplace_back(
134 CharUnits::fromQuantity(std::max(
135 C.getDeclAlign(D).getQuantity(),
136 static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
137 D);
138 for (const ValueDecl *D : EscapedDeclsForTeams)
139 GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
140 llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
141 return L.first > R.first;
142 });
143
144 // Build struct _globalized_locals_ty {
145 // /* globalized vars */[WarSize] align (max(decl_align,
146 // GlobalMemoryAlignment))
147 // /* globalized vars */ for EscapedDeclsForTeams
148 // };
149 RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
150 GlobalizedRD->startDefinition();
151 llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
152 EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
153 for (const auto &Pair : GlobalizedVars) {
154 const ValueDecl *VD = Pair.second;
155 QualType Type = VD->getType();
156 if (Type->isLValueReferenceType())
157 Type = C.getPointerType(Type.getNonReferenceType());
158 else
159 Type = Type.getNonReferenceType();
160 SourceLocation Loc = VD->getLocation();
161 FieldDecl *Field;
162 if (SingleEscaped.count(VD)) {
163 Field = FieldDecl::Create(
164 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
165 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
166 /*BW=*/nullptr, /*Mutable=*/false,
167 /*InitStyle=*/ICIS_NoInit);
168 Field->setAccess(AS_public);
169 if (VD->hasAttrs()) {
170 for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
171 E(VD->getAttrs().end());
172 I != E; ++I)
173 Field->addAttr(*I);
174 }
175 } else {
176 llvm::APInt ArraySize(32, BufSize);
177 Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
178 0);
179 Field = FieldDecl::Create(
180 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
181 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
182 /*BW=*/nullptr, /*Mutable=*/false,
183 /*InitStyle=*/ICIS_NoInit);
184 Field->setAccess(AS_public);
185 llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
186 static_cast<CharUnits::QuantityType>(
187 GlobalMemoryAlignment)));
188 Field->addAttr(AlignedAttr::CreateImplicit(
189 C, /*IsAlignmentExpr=*/true,
190 IntegerLiteral::Create(C, Align,
191 C.getIntTypeForBitwidth(32, /*Signed=*/0),
192 SourceLocation()),
193 {}, AlignedAttr::GNU_aligned));
194 }
195 GlobalizedRD->addDecl(Field);
196 MappedDeclsFields.try_emplace(VD, Field);
197 }
198 GlobalizedRD->completeDefinition();
199 return GlobalizedRD;
200}
201
202/// Get the list of variables that can escape their declaration context.
203class CheckVarsEscapingDeclContext final
204 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
205 CodeGenFunction &CGF;
206 llvm::SetVector<const ValueDecl *> EscapedDecls;
207 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
208 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
209 RecordDecl *GlobalizedRD = nullptr;
210 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
211 bool AllEscaped = false;
212 bool IsForCombinedParallelRegion = false;
213
214 void markAsEscaped(const ValueDecl *VD) {
215 // Do not globalize declare target variables.
216 if (!isa<VarDecl>(VD) ||
217 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
218 return;
219 VD = cast<ValueDecl>(VD->getCanonicalDecl());
220 // Use user-specified allocation.
221 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
222 return;
223 // Variables captured by value must be globalized.
224 if (auto *CSI = CGF.CapturedStmtInfo) {
225 if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
226 // Check if need to capture the variable that was already captured by
227 // value in the outer region.
228 if (!IsForCombinedParallelRegion) {
229 if (!FD->hasAttrs())
230 return;
231 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
232 if (!Attr)
233 return;
234 if (((Attr->getCaptureKind() != OMPC_map) &&
235 !isOpenMPPrivate(Attr->getCaptureKind())) ||
236 ((Attr->getCaptureKind() == OMPC_map) &&
237 !FD->getType()->isAnyPointerType()))
238 return;
239 }
240 if (!FD->getType()->isReferenceType()) {
241 assert(!VD->getType()->isVariablyModifiedType() &&
242        "Parameter captured by value with variably modified type");
243 EscapedParameters.insert(VD);
244 } else if (!IsForCombinedParallelRegion) {
245 return;
246 }
247 }
248 }
249 if ((!CGF.CapturedStmtInfo ||
250 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
251 VD->getType()->isReferenceType())
252 // Do not globalize variables with reference type.
253 return;
254 if (VD->getType()->isVariablyModifiedType())
255 EscapedVariableLengthDecls.insert(VD);
256 else
257 EscapedDecls.insert(VD);
258 }
259
260 void VisitValueDecl(const ValueDecl *VD) {
261 if (VD->getType()->isLValueReferenceType())
262 markAsEscaped(VD);
263 if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
264 if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
265 const bool SavedAllEscaped = AllEscaped;
266 AllEscaped = VD->getType()->isLValueReferenceType();
267 Visit(VarD->getInit());
268 AllEscaped = SavedAllEscaped;
269 }
270 }
271 }
272 void VisitOpenMPCapturedStmt(const CapturedStmt *S,
273 ArrayRef<OMPClause *> Clauses,
274 bool IsCombinedParallelRegion) {
275 if (!S)
276 return;
277 for (const CapturedStmt::Capture &C : S->captures()) {
278 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
279 const ValueDecl *VD = C.getCapturedVar();
280 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
281 if (IsCombinedParallelRegion) {
282 // Check if the variable is privatized in the combined construct and
283 // those private copies must be shared in the inner parallel
284 // directive.
285 IsForCombinedParallelRegion = false;
286 for (const OMPClause *C : Clauses) {
287 if (!isOpenMPPrivate(C->getClauseKind()) ||
288 C->getClauseKind() == OMPC_reduction ||
289 C->getClauseKind() == OMPC_linear ||
290 C->getClauseKind() == OMPC_private)
291 continue;
292 ArrayRef<const Expr *> Vars;
293 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
294 Vars = PC->getVarRefs();
295 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
296 Vars = PC->getVarRefs();
297 else
298 llvm_unreachable("Unexpected clause.");
299 for (const auto *E : Vars) {
300 const Decl *D =
301 cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
302 if (D == VD->getCanonicalDecl()) {
303 IsForCombinedParallelRegion = true;
304 break;
305 }
306 }
307 if (IsForCombinedParallelRegion)
308 break;
309 }
310 }
311 markAsEscaped(VD);
312 if (isa<OMPCapturedExprDecl>(VD))
313 VisitValueDecl(VD);
314 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
315 }
316 }
317 }
318
319 void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
320 assert(!GlobalizedRD &&
321        "Record for globalized variables is built already.");
322 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
323 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
324 if (IsInTTDRegion)
325 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
326 else
327 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
328 GlobalizedRD = ::buildRecordForGlobalizedVars(
329 CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
330 MappedDeclsFields, WarpSize);
331 }
332
333public:
334 CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
335 ArrayRef<const ValueDecl *> TeamsReductions)
336 : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
337 }
338 virtual ~CheckVarsEscapingDeclContext() = default;
339 void VisitDeclStmt(const DeclStmt *S) {
340 if (!S)
341 return;
342 for (const Decl *D : S->decls())
343 if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
344 VisitValueDecl(VD);
345 }
346 void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
347 if (!D)
348 return;
349 if (!D->hasAssociatedStmt())
350 return;
351 if (const auto *S =
352 dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
353 // Do not analyze directives that do not actually require capturing,
354 // like `omp for` or `omp simd` directives.
355 llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
356 getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
357 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
358 VisitStmt(S->getCapturedStmt());
359 return;
360 }
361 VisitOpenMPCapturedStmt(
362 S, D->clauses(),
363 CaptureRegions.back() == OMPD_parallel &&
364 isOpenMPDistributeDirective(D->getDirectiveKind()));
365 }
366 }
367 void VisitCapturedStmt(const CapturedStmt *S) {
368 if (!S)
369 return;
370 for (const CapturedStmt::Capture &C : S->captures()) {
371 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
372 const ValueDecl *VD = C.getCapturedVar();
373 markAsEscaped(VD);
374 if (isa<OMPCapturedExprDecl>(VD))
375 VisitValueDecl(VD);
376 }
377 }
378 }
379 void VisitLambdaExpr(const LambdaExpr *E) {
380 if (!E)
381 return;
382 for (const LambdaCapture &C : E->captures()) {
383 if (C.capturesVariable()) {
384 if (C.getCaptureKind() == LCK_ByRef) {
385 const ValueDecl *VD = C.getCapturedVar();
386 markAsEscaped(VD);
387 if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
388 VisitValueDecl(VD);
389 }
390 }
391 }
392 }
393 void VisitBlockExpr(const BlockExpr *E) {
394 if (!E)
395 return;
396 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
397 if (C.isByRef()) {
398 const VarDecl *VD = C.getVariable();
399 markAsEscaped(VD);
400 if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
401 VisitValueDecl(VD);
402 }
403 }
404 }
405 void VisitCallExpr(const CallExpr *E) {
406 if (!E)
407 return;
408 for (const Expr *Arg : E->arguments()) {
409 if (!Arg)
410 continue;
411 if (Arg->isLValue()) {
412 const bool SavedAllEscaped = AllEscaped;
413 AllEscaped = true;
414 Visit(Arg);
415 AllEscaped = SavedAllEscaped;
416 } else {
417 Visit(Arg);
418 }
419 }
420 Visit(E->getCallee());
421 }
422 void VisitDeclRefExpr(const DeclRefExpr *E) {
423 if (!E)
424 return;
425 const ValueDecl *VD = E->getDecl();
426 if (AllEscaped)
427 markAsEscaped(VD);
428 if (isa<OMPCapturedExprDecl>(VD))
429 VisitValueDecl(VD);
430 else if (VD->isInitCapture())
431 VisitValueDecl(VD);
432 }
433 void VisitUnaryOperator(const UnaryOperator *E) {
434 if (!E)
435 return;
436 if (E->getOpcode() == UO_AddrOf) {
437 const bool SavedAllEscaped = AllEscaped;
438 AllEscaped = true;
439 Visit(E->getSubExpr());
440 AllEscaped = SavedAllEscaped;
441 } else {
442 Visit(E->getSubExpr());
443 }
444 }
445 void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
446 if (!E)
447 return;
448 if (E->getCastKind() == CK_ArrayToPointerDecay) {
449 const bool SavedAllEscaped = AllEscaped;
450 AllEscaped = true;
451 Visit(E->getSubExpr());
452 AllEscaped = SavedAllEscaped;
453 } else {
454 Visit(E->getSubExpr());
455 }
456 }
457 void VisitExpr(const Expr *E) {
458 if (!E)
459 return;
460 bool SavedAllEscaped = AllEscaped;
461 if (!E->isLValue())
462 AllEscaped = false;
463 for (const Stmt *Child : E->children())
464 if (Child)
465 Visit(Child);
466 AllEscaped = SavedAllEscaped;
467 }
468 void VisitStmt(const Stmt *S) {
469 if (!S)
470 return;
471 for (const Stmt *Child : S->children())
472 if (Child)
473 Visit(Child);
474 }
475
476 /// Returns the record that handles all the escaped local variables and used
477 /// instead of their original storage.
478 const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
479 if (!GlobalizedRD)
480 buildRecordForGlobalizedVars(IsInTTDRegion);
481 return GlobalizedRD;
482 }
483
484 /// Returns the field in the globalized record for the escaped variable.
485 const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
486 assert(GlobalizedRD &&
487        "Record for globalized variables must be generated already.");
488 auto I = MappedDeclsFields.find(VD);
489 if (I == MappedDeclsFields.end())
490 return nullptr;
491 return I->getSecond();
492 }
493
494 /// Returns the list of the escaped local variables/parameters.
495 ArrayRef<const ValueDecl *> getEscapedDecls() const {
496 return EscapedDecls.getArrayRef();
497 }
498
499 /// Checks if the escaped local variable is actually a parameter passed by
500 /// value.
501 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
502 return EscapedParameters;
503 }
504
505 /// Returns the list of the escaped variables with the variably modified
506 /// types.
507 ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
508 return EscapedVariableLengthDecls.getArrayRef();
509 }
510};
511} // anonymous namespace
512
513/// Get the id of the warp in the block.
514/// We assume that the warp size is 32, which is always the case
515/// on the NVPTX device, to generate more efficient code.
516static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
517 CGBuilderTy &Bld = CGF.Builder;
518 unsigned LaneIDBits =
519 llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
520 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
521 return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
522}
523
524/// Get the id of the current lane in the Warp.
525/// We assume that the warp size is 32, which is always the case
526/// on the NVPTX device, to generate more efficient code.
527static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
528 CGBuilderTy &Bld = CGF.Builder;
529 unsigned LaneIDBits =
530 llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
531 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
[path note 14] The result of the right shift is undefined due to shifting by '33', which is greater or equal to the width of type 'unsigned int'
532 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
533 return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
534 "nvptx_lane_id");
535}
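The flagged shift count of 33 at line 531 arises when LaneIDBits wraps to UINT_MAX, which is what llvm::Log2_32 returns for an input of 0 (it computes 31 - countLeadingZeros(Value), and countLeadingZeros(0) is the bit width, 32); 32u - UINT_MAX then wraps back to 33. The standalone sketch below reproduces that arithmetic under the assumption that the analyzer explores a path where GV_Warp_Size is 0; countLeadingZeros32 and log2_32 are illustrative stand-ins for the LLVM routines, not the real ones.

#include <cstdio>

// Illustrative stand-in for llvm::countLeadingZeros on a 32-bit value;
// defined to return 32 for an input of 0, matching LLVM's behavior.
static unsigned countLeadingZeros32(unsigned V) {
  unsigned N = 0;
  for (unsigned Bit = 1u << 31; Bit != 0 && (V & Bit) == 0; Bit >>= 1)
    ++N;
  return N;
}

// Illustrative stand-in for llvm::Log2_32: 31 - countLeadingZeros(V).
// For V == 0 this is 31 - 32, which wraps to UINT_MAX (4294967295).
static unsigned log2_32(unsigned V) { return 31 - countLeadingZeros32(V); }

int main() {
  unsigned WarpSize = 0;                    // hypothetical zero grid value
  unsigned LaneIDBits = log2_32(WarpSize);  // 4294967295
  unsigned ShiftAmount = 32u - LaneIDBits;  // wraps to 33
  std::printf("LaneIDBits = %u, shift amount = %u\n", LaneIDBits, ShiftAmount);
  // unsigned LaneIDMask = ~0u >> ShiftAmount; // undefined: shift count 33 is
  //                                           // >= width of unsigned int (32)
  return 0;
}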
536
537CGOpenMPRuntimeGPU::ExecutionMode
538CGOpenMPRuntimeGPU::getExecutionMode() const {
539 return CurrentExecutionMode;
540}
541
542static CGOpenMPRuntimeGPU::DataSharingMode
543getDataSharingMode(CodeGenModule &CGM) {
544 return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
545 : CGOpenMPRuntimeGPU::Generic;
546}
547
548/// Check for inner (nested) SPMD construct, if any
549static bool hasNestedSPMDDirective(ASTContext &Ctx,
550 const OMPExecutableDirective &D) {
551 const auto *CS = D.getInnermostCapturedStmt();
552 const auto *Body =
553 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
554 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
555
556 if (const auto *NestedDir =
557 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
558 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
559 switch (D.getDirectiveKind()) {
560 case OMPD_target:
561 if (isOpenMPParallelDirective(DKind))
562 return true;
563 if (DKind == OMPD_teams) {
564 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
565 /*IgnoreCaptured=*/true);
566 if (!Body)
567 return false;
568 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
569 if (const auto *NND =
570 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
571 DKind = NND->getDirectiveKind();
572 if (isOpenMPParallelDirective(DKind))
573 return true;
574 }
575 }
576 return false;
577 case OMPD_target_teams:
578 return isOpenMPParallelDirective(DKind);
579 case OMPD_target_simd:
580 case OMPD_target_parallel:
581 case OMPD_target_parallel_for:
582 case OMPD_target_parallel_for_simd:
583 case OMPD_target_teams_distribute:
584 case OMPD_target_teams_distribute_simd:
585 case OMPD_target_teams_distribute_parallel_for:
586 case OMPD_target_teams_distribute_parallel_for_simd:
587 case OMPD_parallel:
588 case OMPD_for:
589 case OMPD_parallel_for:
590 case OMPD_parallel_master:
591 case OMPD_parallel_sections:
592 case OMPD_for_simd:
593 case OMPD_parallel_for_simd:
594 case OMPD_cancel:
595 case OMPD_cancellation_point:
596 case OMPD_ordered:
597 case OMPD_threadprivate:
598 case OMPD_allocate:
599 case OMPD_task:
600 case OMPD_simd:
601 case OMPD_sections:
602 case OMPD_section:
603 case OMPD_single:
604 case OMPD_master:
605 case OMPD_critical:
606 case OMPD_taskyield:
607 case OMPD_barrier:
608 case OMPD_taskwait:
609 case OMPD_taskgroup:
610 case OMPD_atomic:
611 case OMPD_flush:
612 case OMPD_depobj:
613 case OMPD_scan:
614 case OMPD_teams:
615 case OMPD_target_data:
616 case OMPD_target_exit_data:
617 case OMPD_target_enter_data:
618 case OMPD_distribute:
619 case OMPD_distribute_simd:
620 case OMPD_distribute_parallel_for:
621 case OMPD_distribute_parallel_for_simd:
622 case OMPD_teams_distribute:
623 case OMPD_teams_distribute_simd:
624 case OMPD_teams_distribute_parallel_for:
625 case OMPD_teams_distribute_parallel_for_simd:
626 case OMPD_target_update:
627 case OMPD_declare_simd:
628 case OMPD_declare_variant:
629 case OMPD_begin_declare_variant:
630 case OMPD_end_declare_variant:
631 case OMPD_declare_target:
632 case OMPD_end_declare_target:
633 case OMPD_declare_reduction:
634 case OMPD_declare_mapper:
635 case OMPD_taskloop:
636 case OMPD_taskloop_simd:
637 case OMPD_master_taskloop:
638 case OMPD_master_taskloop_simd:
639 case OMPD_parallel_master_taskloop:
640 case OMPD_parallel_master_taskloop_simd:
641 case OMPD_requires:
642 case OMPD_unknown:
643 default:
644 llvm_unreachable("Unexpected directive.");
645 }
646 }
647
648 return false;
649}
650
651static bool supportsSPMDExecutionMode(ASTContext &Ctx,
652 const OMPExecutableDirective &D) {
653 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
654 switch (DirectiveKind) {
655 case OMPD_target:
656 case OMPD_target_teams:
657 return hasNestedSPMDDirective(Ctx, D);
658 case OMPD_target_parallel:
659 case OMPD_target_parallel_for:
660 case OMPD_target_parallel_for_simd:
661 case OMPD_target_teams_distribute_parallel_for:
662 case OMPD_target_teams_distribute_parallel_for_simd:
663 case OMPD_target_simd:
664 case OMPD_target_teams_distribute_simd:
665 return true;
666 case OMPD_target_teams_distribute:
667 return false;
668 case OMPD_parallel:
669 case OMPD_for:
670 case OMPD_parallel_for:
671 case OMPD_parallel_master:
672 case OMPD_parallel_sections:
673 case OMPD_for_simd:
674 case OMPD_parallel_for_simd:
675 case OMPD_cancel:
676 case OMPD_cancellation_point:
677 case OMPD_ordered:
678 case OMPD_threadprivate:
679 case OMPD_allocate:
680 case OMPD_task:
681 case OMPD_simd:
682 case OMPD_sections:
683 case OMPD_section:
684 case OMPD_single:
685 case OMPD_master:
686 case OMPD_critical:
687 case OMPD_taskyield:
688 case OMPD_barrier:
689 case OMPD_taskwait:
690 case OMPD_taskgroup:
691 case OMPD_atomic:
692 case OMPD_flush:
693 case OMPD_depobj:
694 case OMPD_scan:
695 case OMPD_teams:
696 case OMPD_target_data:
697 case OMPD_target_exit_data:
698 case OMPD_target_enter_data:
699 case OMPD_distribute:
700 case OMPD_distribute_simd:
701 case OMPD_distribute_parallel_for:
702 case OMPD_distribute_parallel_for_simd:
703 case OMPD_teams_distribute:
704 case OMPD_teams_distribute_simd:
705 case OMPD_teams_distribute_parallel_for:
706 case OMPD_teams_distribute_parallel_for_simd:
707 case OMPD_target_update:
708 case OMPD_declare_simd:
709 case OMPD_declare_variant:
710 case OMPD_begin_declare_variant:
711 case OMPD_end_declare_variant:
712 case OMPD_declare_target:
713 case OMPD_end_declare_target:
714 case OMPD_declare_reduction:
715 case OMPD_declare_mapper:
716 case OMPD_taskloop:
717 case OMPD_taskloop_simd:
718 case OMPD_master_taskloop:
719 case OMPD_master_taskloop_simd:
720 case OMPD_parallel_master_taskloop:
721 case OMPD_parallel_master_taskloop_simd:
722 case OMPD_requires:
723 case OMPD_unknown:
724 default:
725 break;
726 }
727 llvm_unreachable(
728     "Unknown programming model for OpenMP directive on NVPTX target.");
729}
730
731void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
732 StringRef ParentName,
733 llvm::Function *&OutlinedFn,
734 llvm::Constant *&OutlinedFnID,
735 bool IsOffloadEntry,
736 const RegionCodeGenTy &CodeGen) {
737 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
738 EntryFunctionState EST;
739 WrapperFunctionsMap.clear();
740
741 // Emit target region as a standalone region.
742 class NVPTXPrePostActionTy : public PrePostActionTy {
743 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
744
745 public:
746 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
747 : EST(EST) {}
748 void Enter(CodeGenFunction &CGF) override {
749 auto &RT =
750 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
751 RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
752 // Skip target region initialization.
753 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
754 }
755 void Exit(CodeGenFunction &CGF) override {
756 auto &RT =
757 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
758 RT.clearLocThreadIdInsertPt(CGF);
759 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
760 }
761 } Action(EST);
762 CodeGen.setAction(Action);
763 IsInTTDRegion = true;
764 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
765 IsOffloadEntry, CodeGen);
766 IsInTTDRegion = false;
767}
768
769void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF,
770 EntryFunctionState &EST, bool IsSPMD) {
771 CGBuilderTy &Bld = CGF.Builder;
772 Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD));
773 if (!IsSPMD)
774 emitGenericVarsProlog(CGF, EST.Loc);
775}
776
777void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
778 EntryFunctionState &EST,
779 bool IsSPMD) {
780 if (!IsSPMD)
781 emitGenericVarsEpilog(CGF);
782
783 CGBuilderTy &Bld = CGF.Builder;
784 OMPBuilder.createTargetDeinit(Bld, IsSPMD);
785}
786
787void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
788 StringRef ParentName,
789 llvm::Function *&OutlinedFn,
790 llvm::Constant *&OutlinedFnID,
791 bool IsOffloadEntry,
792 const RegionCodeGenTy &CodeGen) {
793 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
794 EntryFunctionState EST;
795
796 // Emit target region as a standalone region.
797 class NVPTXPrePostActionTy : public PrePostActionTy {
798 CGOpenMPRuntimeGPU &RT;
799 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
800
801 public:
802 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
803 CGOpenMPRuntimeGPU::EntryFunctionState &EST)
804 : RT(RT), EST(EST) {}
805 void Enter(CodeGenFunction &CGF) override {
806 RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
807 // Skip target region initialization.
808 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
809 }
810 void Exit(CodeGenFunction &CGF) override {
811 RT.clearLocThreadIdInsertPt(CGF);
812 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
813 }
814 } Action(*this, EST);
815 CodeGen.setAction(Action);
816 IsInTTDRegion = true;
817 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
818 IsOffloadEntry, CodeGen);
819 IsInTTDRegion = false;
820}
821
822// Create a unique global variable to indicate the execution mode of this target
823// region. The execution mode is either 'generic', or 'spmd' depending on the
824// target directive. This variable is picked up by the offload library to setup
825// the device appropriately before kernel launch. If the execution mode is
826// 'generic', the runtime reserves one warp for the master, otherwise, all
827// warps participate in parallel work.
828static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
829 bool Mode) {
830 auto *GVMode = new llvm::GlobalVariable(
831 CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
832 llvm::GlobalValue::WeakAnyLinkage,
833 llvm::ConstantInt::get(CGM.Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD
834 : OMP_TGT_EXEC_MODE_GENERIC),
835 Twine(Name, "_exec_mode"));
836 GVMode->setVisibility(llvm::GlobalVariable::ProtectedVisibility);
837 CGM.addCompilerUsedGlobal(GVMode);
838}
839
840void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
841 const OMPExecutableDirective &D, StringRef ParentName,
842 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
843 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
844 if (!IsOffloadEntry) // Nothing to do.
845 return;
846
847 assert(!ParentName.empty() && "Invalid target region parent name!");
848
849 bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
850 if (Mode)
851 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
852 CodeGen);
853 else
854 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
855 CodeGen);
856
857 setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
858}
859
860CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
861 : CGOpenMPRuntime(CGM) {
862 llvm::OpenMPIRBuilderConfig Config(CGM.getLangOpts().OpenMPIsDevice, true,
863 hasRequiresUnifiedSharedMemory(),
864 CGM.getLangOpts().OpenMPOffloadMandatory);
865 OMPBuilder.setConfig(Config);
866
867 if (!CGM.getLangOpts().OpenMPIsDevice)
868 llvm_unreachable("OpenMP can only handle device code.");
869
870 llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
871 if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
872 return;
873
874 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
875 "__omp_rtl_debug_kind");
876 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
877 "__omp_rtl_assume_teams_oversubscription");
878 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
879 "__omp_rtl_assume_threads_oversubscription");
880 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
881 "__omp_rtl_assume_no_thread_state");
882 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
883 "__omp_rtl_assume_no_nested_parallelism");
884}
885
886void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
887 ProcBindKind ProcBind,
888 SourceLocation Loc) {
889 // Do nothing in case of SPMD mode and L0 parallel.
890 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
891 return;
892
893 CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
894}
895
896void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
897 llvm::Value *NumThreads,
898 SourceLocation Loc) {
899 // Nothing to do.
900}
901
902void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
903 const Expr *NumTeams,
904 const Expr *ThreadLimit,
905 SourceLocation Loc) {}
906
907llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
908 CodeGenFunction &CGF, const OMPExecutableDirective &D,
909 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
910 const RegionCodeGenTy &CodeGen) {
911 // Emit target region as a standalone region.
912 bool PrevIsInTTDRegion = IsInTTDRegion;
913 IsInTTDRegion = false;
914 auto *OutlinedFun =
915 cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
916 CGF, D, ThreadIDVar, InnermostKind, CodeGen));
917 IsInTTDRegion = PrevIsInTTDRegion;
918 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
919 llvm::Function *WrapperFun =
920 createParallelDataSharingWrapper(OutlinedFun, D);
921 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
922 }
923
924 return OutlinedFun;
925}
926
927/// Get list of lastprivate variables from the teams distribute ... or
928/// teams {distribute ...} directives.
929static void
930getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
931 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
932 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
933        "expected teams directive.");
934 const OMPExecutableDirective *Dir = &D;
935 if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
936 if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
937 Ctx,
938 D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
939 /*IgnoreCaptured=*/true))) {
940 Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
941 if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
942 Dir = nullptr;
943 }
944 }
945 if (!Dir)
946 return;
947 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
948 for (const Expr *E : C->getVarRefs())
949 Vars.push_back(getPrivateItem(E));
950 }
951}
952
953/// Get list of reduction variables from the teams ... directives.
954static void
955getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
956 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
957 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
958        "expected teams directive.");
959 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
960 for (const Expr *E : C->privates())
961 Vars.push_back(getPrivateItem(E));
962 }
963}
964
965llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
966 CodeGenFunction &CGF, const OMPExecutableDirective &D,
967 const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
968 const RegionCodeGenTy &CodeGen) {
969 SourceLocation Loc = D.getBeginLoc();
970
971 const RecordDecl *GlobalizedRD = nullptr;
972 llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
973 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
974 unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
975 // Globalize team reductions variable unconditionally in all modes.
976 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
977 getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
978 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
979 getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
980 if (!LastPrivatesReductions.empty()) {
981 GlobalizedRD = ::buildRecordForGlobalizedVars(
982 CGM.getContext(), std::nullopt, LastPrivatesReductions,
983 MappedDeclsFields, WarpSize);
984 }
985 } else if (!LastPrivatesReductions.empty()) {
986 assert(!TeamAndReductions.first &&
987        "Previous team declaration is not expected.");
988 TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
989 std::swap(TeamAndReductions.second, LastPrivatesReductions);
990 }
991
992 // Emit target region as a standalone region.
993 class NVPTXPrePostActionTy : public PrePostActionTy {
994 SourceLocation &Loc;
995 const RecordDecl *GlobalizedRD;
996 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
997 &MappedDeclsFields;
998
999 public:
1000 NVPTXPrePostActionTy(
1001 SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1002 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1003 &MappedDeclsFields)
1004 : Loc(Loc), GlobalizedRD(GlobalizedRD),
1005 MappedDeclsFields(MappedDeclsFields) {}
1006 void Enter(CodeGenFunction &CGF) override {
1007 auto &Rt =
1008 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1009 if (GlobalizedRD) {
1010 auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1011 I->getSecond().MappedParams =
1012 std::make_unique<CodeGenFunction::OMPMapVars>();
1013 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1014 for (const auto &Pair : MappedDeclsFields) {
1015 assert(Pair.getFirst()->isCanonicalDecl() &&
1016        "Expected canonical declaration");
1017 Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
1018 }
1019 }
1020 Rt.emitGenericVarsProlog(CGF, Loc);
1021 }
1022 void Exit(CodeGenFunction &CGF) override {
1023 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1024 .emitGenericVarsEpilog(CGF);
1025 }
1026 } Action(Loc, GlobalizedRD, MappedDeclsFields);
1027 CodeGen.setAction(Action);
1028 llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1029 CGF, D, ThreadIDVar, InnermostKind, CodeGen);
1030
1031 return OutlinedFun;
1032}
1033
1034void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1035 SourceLocation Loc,
1036 bool WithSPMDCheck) {
1037 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
1038 getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1039 return;
1040
1041 CGBuilderTy &Bld = CGF.Builder;
1042
1043 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1044 if (I == FunctionGlobalizedDecls.end())
1045 return;
1046
1047 for (auto &Rec : I->getSecond().LocalVarData) {
1048 const auto *VD = cast<VarDecl>(Rec.first);
1049 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1050 QualType VarTy = VD->getType();
1051
1052 // Get the local allocation of a firstprivate variable before sharing
1053 llvm::Value *ParValue;
1054 if (EscapedParam) {
1055 LValue ParLVal =
1056 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1057 ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1058 }
1059
1060 // Allocate space for the variable to be globalized
1061 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1062 llvm::CallBase *VoidPtr =
1063 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1064 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1065 AllocArgs, VD->getName());
1066 // FIXME: We should use the variables actual alignment as an argument.
1067 VoidPtr->addRetAttr(llvm::Attribute::get(
1068 CGM.getLLVMContext(), llvm::Attribute::Alignment,
1069 CGM.getContext().getTargetInfo().getNewAlign() / 8));
1070
1071 // Cast the void pointer and get the address of the globalized variable.
1072 llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
1073 llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1074 VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
1075 LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
1076 Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1077 Rec.second.GlobalizedVal = VoidPtr;
1078
1079 // Assign the local allocation to the newly globalized location.
1080 if (EscapedParam) {
1081 CGF.EmitStoreOfScalar(ParValue, VarAddr);
1082 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
1083 }
1084 if (auto *DI = CGF.getDebugInfo())
1085 VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
1086 }
1087 for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
1088 // Use actual memory size of the VLA object including the padding
1089 // for alignment purposes.
1090 llvm::Value *Size = CGF.getTypeSize(VD->getType());
1091 CharUnits Align = CGM.getContext().getDeclAlign(VD);
1092 Size = Bld.CreateNUWAdd(
1093 Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1094 llvm::Value *AlignVal =
1095 llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1096
1097 Size = Bld.CreateUDiv(Size, AlignVal);
1098 Size = Bld.CreateNUWMul(Size, AlignVal);
1099
1100 // Allocate space for this VLA object to be globalized.
1101 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1102 llvm::CallBase *VoidPtr =
1103 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1104 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1105 AllocArgs, VD->getName());
1106 VoidPtr->addRetAttr(
1107 llvm::Attribute::get(CGM.getLLVMContext(), llvm::Attribute::Alignment,
1108 CGM.getContext().getTargetInfo().getNewAlign()));
1109
1110 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(
1111 std::pair<llvm::Value *, llvm::Value *>(
1112 {VoidPtr, CGF.getTypeSize(VD->getType())}));
1113 LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(),
1114 CGM.getContext().getDeclAlign(VD),
1115 AlignmentSource::Decl);
1116 I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
1117 Base.getAddress(CGF));
1118 }
1119 I->getSecond().MappedParams->apply(CGF);
1120}
1121
1122void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
1123 bool WithSPMDCheck) {
1124 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
1125 getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1126 return;
1127
1128 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1129 if (I != FunctionGlobalizedDecls.end()) {
1130 // Deallocate the memory for each globalized VLA object
1131 for (const auto &AddrSizePair :
1132 llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1133 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1134 CGM.getModule(), OMPRTL___kmpc_free_shared),
1135 {AddrSizePair.first, AddrSizePair.second});
1136 }
1137 // Deallocate the memory for each globalized value
1138 for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
1139 const auto *VD = cast<VarDecl>(Rec.first);
1140 I->getSecond().MappedParams->restore(CGF);
1141
1142 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1143 CGF.getTypeSize(VD->getType())};
1144 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1145 CGM.getModule(), OMPRTL___kmpc_free_shared),
1146 FreeArgs);
1147 }
1148 }
1149}
1150
1151void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1152 const OMPExecutableDirective &D,
1153 SourceLocation Loc,
1154 llvm::Function *OutlinedFn,
1155 ArrayRef<llvm::Value *> CapturedVars) {
1156 if (!CGF.HaveInsertPoint())
1157 return;
1158
1159 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1160 /*Name=*/".zero.addr");
1161 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1162 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1163 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
1164 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1165 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1166 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1167}
1168
1169void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1170 SourceLocation Loc,
1171 llvm::Function *OutlinedFn,
1172 ArrayRef<llvm::Value *> CapturedVars,
1173 const Expr *IfCond,
1174 llvm::Value *NumThreads) {
1175 if (!CGF.HaveInsertPoint())
1176 return;
1177
1178 auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1179 NumThreads](CodeGenFunction &CGF,
1180 PrePostActionTy &Action) {
1181 CGBuilderTy &Bld = CGF.Builder;
1182 llvm::Value *NumThreadsVal = NumThreads;
1183 llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1184 llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
1185 if (WFn)
1186 ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1187 llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
1188
1189 // Create a private scope that will globalize the arguments
1190 // passed from the outside of the target region.
1191 // TODO: Is that needed?
1192 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1193
1194 Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1195 llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
1196 "captured_vars_addrs");
1197 // There's something to share.
1198 if (!CapturedVars.empty()) {
1199 // Prepare for parallel region. Indicate the outlined function.
1200 ASTContext &Ctx = CGF.getContext();
1201 unsigned Idx = 0;
1202 for (llvm::Value *V : CapturedVars) {
1203 Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
1204 llvm::Value *PtrV;
1205 if (V->getType()->isIntegerTy())
1206 PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1207 else
1208 PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
1209 CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1210 Ctx.getPointerType(Ctx.VoidPtrTy));
1211 ++Idx;
1212 }
1213 }
1214
1215 llvm::Value *IfCondVal = nullptr;
1216 if (IfCond)
1217 IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
1218 /* isSigned */ false);
1219 else
1220 IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1221
1222 if (!NumThreadsVal)
1223 NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
1224 else
1225 NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty),
1226
1227 assert(IfCondVal && "Expected a value");
1228 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1229 llvm::Value *Args[] = {
1230 RTLoc,
1231 getThreadID(CGF, Loc),
1232 IfCondVal,
1233 NumThreadsVal,
1234 llvm::ConstantInt::get(CGF.Int32Ty, -1),
1235 FnPtr,
1236 ID,
1237 Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
1238 CGF.VoidPtrPtrTy),
1239 llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1240 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1241 CGM.getModule(), OMPRTL___kmpc_parallel_51),
1242 Args);
1243 };
1244
1245 RegionCodeGenTy RCG(ParallelGen);
1246 RCG(CGF);
1247}
1248
1249void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1250 // Always emit simple barriers!
1251 if (!CGF.HaveInsertPoint())
1252 return;
1253 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1254 // This function does not use parameters, so we can emit just default values.
1255 llvm::Value *Args[] = {
1256 llvm::ConstantPointerNull::get(
1257 cast<llvm::PointerType>(getIdentTyPointerTy())),
1258 llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
1259 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1260 CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
1261 Args);
1262}
1263
1264void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1265 SourceLocation Loc,
1266 OpenMPDirectiveKind Kind, bool,
1267 bool) {
1268 // Always emit simple barriers!
1269 if (!CGF.HaveInsertPoint())
1270 return;
1271 // Build call __kmpc_cancel_barrier(loc, thread_id);
1272 unsigned Flags = getDefaultFlagsForBarriers(Kind);
1273 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1274 getThreadID(CGF, Loc)};
1275
1276 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1277 CGM.getModule(), OMPRTL___kmpc_barrier),
1278 Args);
1279}
1280
1281void CGOpenMPRuntimeGPU::emitCriticalRegion(
1282 CodeGenFunction &CGF, StringRef CriticalName,
1283 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1284 const Expr *Hint) {
1285 llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
1286 llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
1287 llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
1288 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
1289 llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
1290
1291 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1292
1293 // Get the mask of active threads in the warp.
1294 llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1295 CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
1296 // Fetch team-local id of the thread.
1297 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1298
1299 // Get the width of the team.
1300 llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1301
1302 // Initialize the counter variable for the loop.
1303 QualType Int32Ty =
1304 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1305 Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
1306 LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
1307 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
1308 /*isInit=*/true);
1309
1310 // Block checks if loop counter exceeds upper bound.
1311 CGF.EmitBlock(LoopBB);
1312 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1313 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
1314 CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
1315
1316 // Block tests which single thread should execute region, and which threads
1317 // should go straight to synchronisation point.
1318 CGF.EmitBlock(TestBB);
1319 CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1320 llvm::Value *CmpThreadToCounter =
1321 CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
1322 CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
1323
1324 // Block emits the body of the critical region.
1325 CGF.EmitBlock(BodyBB);
1326
1327 // Output the critical statement.
1328 CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1329 Hint);
1330
1331 // After the body surrounded by the critical region, the single executing
1332 // thread will jump to the synchronisation point.
1333 // Block waits for all threads in current team to finish then increments the
1334 // counter variable and returns to the loop.
1335 CGF.EmitBlock(SyncBB);
1336 // Reconverge active threads in the warp.
1337 (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1338 CGM.getModule(), OMPRTL___kmpc_syncwarp),
1339 Mask);
1340
1341 llvm::Value *IncCounterVal =
1342 CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
1343 CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
1344 CGF.EmitBranch(LoopBB);
1345
1346 // Block that is reached when all threads in the team complete the region.
1347 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
1348}
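
A minimal C++ sketch (editor's addition) of the control flow this routine emits: every thread in the team walks a counter from 0 to the team width, exactly one thread enters the body per iteration, and the warp reconverges on __kmpc_syncwarp before the counter advances. The prototypes below are assumptions standing in for the runtime calls named above.

#include <cstdint>

extern "C" uint64_t __kmpc_warp_active_thread_mask(); // assumed prototype
extern "C" void __kmpc_syncwarp(uint64_t Mask);        // assumed prototype

inline void criticalRegionLowering(int ThreadId, int TeamWidth,
                                   void (*Body)()) {
  uint64_t Mask = __kmpc_warp_active_thread_mask();
  for (int Counter = 0; Counter < TeamWidth; ++Counter) { // omp.critical.loop
    if (ThreadId == Counter)                              // omp.critical.test
      Body(); // omp.critical.body, still guarded by the named critical lock
    __kmpc_syncwarp(Mask);                                // omp.critical.sync
  }
}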
1349
1350/// Cast value to the specified type.
1351static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1352 QualType ValTy, QualType CastTy,
1353 SourceLocation Loc) {
1354   assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1355          "Cast type must sized.");
1356   assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1357          "Val type must sized.");
1358 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
1359 if (ValTy == CastTy)
1360 return Val;
1361 if (CGF.getContext().getTypeSizeInChars(ValTy) ==
1362 CGF.getContext().getTypeSizeInChars(CastTy))
1363 return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
1364 if (CastTy->isIntegerType() && ValTy->isIntegerType())
1365 return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
1366 CastTy->hasSignedIntegerRepresentation());
1367 Address CastItem = CGF.CreateMemTemp(CastTy);
1368 Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1369 CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()),
1370 Val->getType());
1371 CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
1372 LValueBaseInfo(AlignmentSource::Type),
1373 TBAAAccessInfo());
1374 return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
1375 LValueBaseInfo(AlignmentSource::Type),
1376 TBAAAccessInfo());
1377}
1378
1379/// This function creates calls to one of two shuffle functions to copy
1380/// variables between lanes in a warp.
1381static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
1382 llvm::Value *Elem,
1383 QualType ElemType,
1384 llvm::Value *Offset,
1385 SourceLocation Loc) {
1386 CodeGenModule &CGM = CGF.CGM;
1387 CGBuilderTy &Bld = CGF.Builder;
1388 CGOpenMPRuntimeGPU &RT =
1389 *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
1390 llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
1391
1392 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1393   assert(Size.getQuantity() <= 8 &&
1394          "Unsupported bitwidth in shuffle instruction.");
1395
1396 RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
1397 ? OMPRTL___kmpc_shuffle_int32
1398 : OMPRTL___kmpc_shuffle_int64;
1399
1400 // Cast all types to 32- or 64-bit values before calling shuffle routines.
1401 QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
1402 Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
1403 llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
1404 llvm::Value *WarpSize =
1405 Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
1406
1407 llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
1408 OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
1409 {ElemCast, Offset, WarpSize});
1410
1411 return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
1412}
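
A hedged sketch of the two shuffle entry points selected above; the argument list mirrors the {ElemCast, Offset, WarpSize} triple passed to EmitRuntimeCall, with the width truncated to 16 bits as in the code.

#include <cstdint>

// Assumed C-level shapes; values up to 4 bytes go through the 32-bit entry,
// 5..8 bytes through the 64-bit one.
extern "C" int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta,
                                        int16_t Width);
extern "C" int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta,
                                        int16_t Width);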
1413
1414static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
1415 Address DestAddr, QualType ElemType,
1416 llvm::Value *Offset, SourceLocation Loc) {
1417 CGBuilderTy &Bld = CGF.Builder;
1418
1419 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1420 // Create the loop over the big sized data.
1421 // ptr = (void*)Elem;
1422 // ptrEnd = (void*) Elem + 1;
1423 // Step = 8;
1424 // while (ptr + Step < ptrEnd)
1425 // shuffle((int64_t)*ptr);
1426 // Step = 4;
1427 // while (ptr + Step < ptrEnd)
1428 // shuffle((int32_t)*ptr);
1429 // ...
1430 Address ElemPtr = DestAddr;
1431 Address Ptr = SrcAddr;
1432 Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
1433 Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty);
1434 for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
1435 if (Size < CharUnits::fromQuantity(IntSize))
1436 continue;
1437 QualType IntType = CGF.getContext().getIntTypeForBitwidth(
1438 CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
1439 /*Signed=*/1);
1440 llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
1441 Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(),
1442 IntTy);
1443 ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1444 ElemPtr, IntTy->getPointerTo(), IntTy);
1445 if (Size.getQuantity() / IntSize > 1) {
1446 llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
1447 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
1448 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
1449 llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
1450 CGF.EmitBlock(PreCondBB);
1451 llvm::PHINode *PhiSrc =
1452 Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
1453 PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
1454 llvm::PHINode *PhiDest =
1455 Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
1456 PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
1457 Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
1458 ElemPtr =
1459 Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
1460 llvm::Value *PtrDiff = Bld.CreatePtrDiff(
1461 CGF.Int8Ty, PtrEnd.getPointer(),
1462 Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr.getPointer(),
1463 CGF.VoidPtrTy));
1464 Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
1465 ThenBB, ExitBB);
1466 CGF.EmitBlock(ThenBB);
1467 llvm::Value *Res = createRuntimeShuffleFunction(
1468 CGF,
1469 CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1470 LValueBaseInfo(AlignmentSource::Type),
1471 TBAAAccessInfo()),
1472 IntType, Offset, Loc);
1473 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1474 LValueBaseInfo(AlignmentSource::Type),
1475 TBAAAccessInfo());
1476 Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
1477 Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1478 PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
1479 PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
1480 CGF.EmitBranch(PreCondBB);
1481 CGF.EmitBlock(ExitBB);
1482 } else {
1483 llvm::Value *Res = createRuntimeShuffleFunction(
1484 CGF,
1485 CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1486 LValueBaseInfo(AlignmentSource::Type),
1487 TBAAAccessInfo()),
1488 IntType, Offset, Loc);
1489 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1490 LValueBaseInfo(AlignmentSource::Type),
1491 TBAAAccessInfo());
1492 Ptr = Bld.CreateConstGEP(Ptr, 1);
1493 ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1494 }
1495 Size = Size % IntSize;
1496 }
1497}
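
A host-side sketch (editor's addition) of the chunking policy implemented above: the element is carved into the largest power-of-two pieces that still fit, so a 7-byte element becomes one 4-byte, one 2-byte and one 1-byte transfer, while a 16-byte element takes the pre-condition loop and issues two 8-byte transfers. shuffleChunk is a hypothetical stand-in for the emitted __kmpc_shuffle_* call.

#include <cstddef>
#include <cstring>

// Hypothetical stand-in: the real code shuffles the chunk across warp lanes;
// here it merely copies bytes so the sketch compiles.
static void shuffleChunk(char *Dst, const char *Src, size_t Bytes) {
  std::memcpy(Dst, Src, Bytes);
}

static void chunkedShuffle(char *Dst, const char *Src, size_t Size) {
  size_t Off = 0;
  for (size_t IntSize = 8; IntSize >= 1; IntSize /= 2) {
    // Transfer floor(remaining / IntSize) chunks of this width, then fall
    // through to the next smaller width, mirroring 'Size = Size % IntSize'.
    while (Size - Off >= IntSize) {
      shuffleChunk(Dst + Off, Src + Off, IntSize);
      Off += IntSize;
    }
  }
}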
1498
1499namespace {
1500enum CopyAction : unsigned {
1501 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1502 // the warp using shuffle instructions.
1503 RemoteLaneToThread,
1504 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1505 ThreadCopy,
1506 // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
1507 ThreadToScratchpad,
1508 // ScratchpadToThread: Copy from a scratchpad array in global memory
1509 // containing team-reduced data to a thread's stack.
1510 ScratchpadToThread,
1511};
1512} // namespace
1513
1514struct CopyOptionsTy {
1515 llvm::Value *RemoteLaneOffset;
1516 llvm::Value *ScratchpadIndex;
1517 llvm::Value *ScratchpadWidth;
1518};
1519
1520/// Emit instructions to copy a Reduce list, which contains partially
1521/// aggregated values, in the specified direction.
1522static void emitReductionListCopy(
1523 CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
1524 ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
1525 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
1526
1527 CodeGenModule &CGM = CGF.CGM;
1528 ASTContext &C = CGM.getContext();
1529 CGBuilderTy &Bld = CGF.Builder;
1530
1531 llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1532 llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
1533 llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
1534
1535   // Iterate, element-by-element, through the source Reduce list and
1536   // make a copy.
1537 unsigned Idx = 0;
1538 unsigned Size = Privates.size();
1539 for (const Expr *Private : Privates) {
1540 Address SrcElementAddr = Address::invalid();
1541 Address DestElementAddr = Address::invalid();
1542 Address DestElementPtrAddr = Address::invalid();
1543 // Should we shuffle in an element from a remote lane?
1544 bool ShuffleInElement = false;
1545 // Set to true to update the pointer in the dest Reduce list to a
1546 // newly created element.
1547 bool UpdateDestListPtr = false;
1548 // Increment the src or dest pointer to the scratchpad, for each
1549 // new element.
1550 bool IncrScratchpadSrc = false;
1551 bool IncrScratchpadDest = false;
1552 QualType PrivatePtrType = C.getPointerType(Private->getType());
1553 llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType);
1554
1555 switch (Action) {
1556 case RemoteLaneToThread: {
1557 // Step 1.1: Get the address for the src element in the Reduce list.
1558 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1559 SrcElementAddr =
1560 CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1561 SrcElementPtrAddr, PrivateLlvmPtrType),
1562 PrivatePtrType->castAs<PointerType>());
1563
1564 // Step 1.2: Create a temporary to store the element in the destination
1565 // Reduce list.
1566 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1567 DestElementAddr =
1568 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1569 ShuffleInElement = true;
1570 UpdateDestListPtr = true;
1571 break;
1572 }
1573 case ThreadCopy: {
1574 // Step 1.1: Get the address for the src element in the Reduce list.
1575 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1576 SrcElementAddr =
1577 CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1578 SrcElementPtrAddr, PrivateLlvmPtrType),
1579 PrivatePtrType->castAs<PointerType>());
1580
1581 // Step 1.2: Get the address for dest element. The destination
1582 // element has already been created on the thread's stack.
1583 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1584 DestElementAddr =
1585 CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1586 DestElementPtrAddr, PrivateLlvmPtrType),
1587 PrivatePtrType->castAs<PointerType>());
1588 break;
1589 }
1590 case ThreadToScratchpad: {
1591 // Step 1.1: Get the address for the src element in the Reduce list.
1592 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1593 SrcElementAddr =
1594 CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1595 SrcElementPtrAddr, PrivateLlvmPtrType),
1596 PrivatePtrType->castAs<PointerType>());
1597
1598 // Step 1.2: Get the address for dest element:
1599 // address = base + index * ElementSizeInChars.
1600 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1601 llvm::Value *CurrentOffset =
1602 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1603 llvm::Value *ScratchPadElemAbsolutePtrVal =
1604 Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
1605 ScratchPadElemAbsolutePtrVal =
1606 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1607 DestElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty,
1608 C.getTypeAlignInChars(Private->getType()));
1609 IncrScratchpadDest = true;
1610 break;
1611 }
1612 case ScratchpadToThread: {
1613 // Step 1.1: Get the address for the src element in the scratchpad.
1614 // address = base + index * ElementSizeInChars.
1615 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1616 llvm::Value *CurrentOffset =
1617 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1618 llvm::Value *ScratchPadElemAbsolutePtrVal =
1619 Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
1620 ScratchPadElemAbsolutePtrVal =
1621 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1622 SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty,
1623 C.getTypeAlignInChars(Private->getType()));
1624 IncrScratchpadSrc = true;
1625
1626 // Step 1.2: Create a temporary to store the element in the destination
1627 // Reduce list.
1628 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1629 DestElementAddr =
1630 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1631 UpdateDestListPtr = true;
1632 break;
1633 }
1634 }
1635
1636 // Regardless of src and dest of copy, we emit the load of src
1637 // element as this is required in all directions
1638 SrcElementAddr = Bld.CreateElementBitCast(
1639 SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
1640 DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
1641 SrcElementAddr.getElementType());
1642
1643 // Now that all active lanes have read the element in the
1644 // Reduce list, shuffle over the value from the remote lane.
1645 if (ShuffleInElement) {
1646 shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
1647 RemoteLaneOffset, Private->getExprLoc());
1648 } else {
1649 switch (CGF.getEvaluationKind(Private->getType())) {
1650 case TEK_Scalar: {
1651 llvm::Value *Elem = CGF.EmitLoadOfScalar(
1652 SrcElementAddr, /*Volatile=*/false, Private->getType(),
1653 Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
1654 TBAAAccessInfo());
1655 // Store the source element value to the dest element address.
1656 CGF.EmitStoreOfScalar(
1657 Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
1658 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
1659 break;
1660 }
1661 case TEK_Complex: {
1662 CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
1663 CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1664 Private->getExprLoc());
1665 CGF.EmitStoreOfComplex(
1666 Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1667 /*isInit=*/false);
1668 break;
1669 }
1670 case TEK_Aggregate:
1671 CGF.EmitAggregateCopy(
1672 CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1673 CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1674 Private->getType(), AggValueSlot::DoesNotOverlap);
1675 break;
1676 }
1677 }
1678
1679 // Step 3.1: Modify reference in dest Reduce list as needed.
1680 // Modifying the reference in Reduce list to point to the newly
1681 // created element. The element is live in the current function
1682 // scope and that of functions it invokes (i.e., reduce_function).
1683 // RemoteReduceData[i] = (void*)&RemoteElem
1684 if (UpdateDestListPtr) {
1685 CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
1686 DestElementAddr.getPointer(), CGF.VoidPtrTy),
1687 DestElementPtrAddr, /*Volatile=*/false,
1688 C.VoidPtrTy);
1689 }
1690
1691 // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
1692 // address of the next element in scratchpad memory, unless we're currently
1693 // processing the last one. Memory alignment is also taken care of here.
1694 if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
1695 // FIXME: This code doesn't make any sense, it's trying to perform
1696 // integer arithmetic on pointers.
1697 llvm::Value *ScratchpadBasePtr =
1698 IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
1699 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1700 ScratchpadBasePtr = Bld.CreateNUWAdd(
1701 ScratchpadBasePtr,
1702 Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
1703
1704 // Take care of global memory alignment for performance
1705 ScratchpadBasePtr = Bld.CreateNUWSub(
1706 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
1707 ScratchpadBasePtr = Bld.CreateUDiv(
1708 ScratchpadBasePtr,
1709 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
1710 ScratchpadBasePtr = Bld.CreateNUWAdd(
1711 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
1712 ScratchpadBasePtr = Bld.CreateNUWMul(
1713 ScratchpadBasePtr,
1714 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
1715
1716 if (IncrScratchpadDest)
1717 DestBase =
1718 Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign());
1719 else /* IncrScratchpadSrc = true */
1720 SrcBase =
1721 Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign());
1722 }
1723
1724 ++Idx;
1725 }
1726}
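
A hedged sketch of the RemoteLaneToThread direction for a single scalar entry, the path relied on by emitShuffleAndReduceFunction below: the source pointer is loaded from the Reduce list, the value is shuffled into a fresh temporary, and the destination list is repointed at that temporary. shuffleFromRemoteLane is a hypothetical stand-in for the shuffleAndStore call.

template <typename T>
T shuffleFromRemoteLane(const T &Val, int RemoteLaneOffset); // hypothetical

template <typename T>
void remoteLaneToThread(void *const *SrcReduceList, void **DstReduceList,
                        unsigned Idx, int RemoteLaneOffset, T &Tmp) {
  const T *Src = static_cast<const T *>(SrcReduceList[Idx]); // Step 1.1
  Tmp = shuffleFromRemoteLane(*Src, RemoteLaneOffset);       // ShuffleInElement
  DstReduceList[Idx] = &Tmp;                                 // UpdateDestListPtr
}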
1727
1728/// This function emits a helper that gathers Reduce lists from the first
1729/// lane of every active warp to lanes in the first warp.
1730///
1731/// void inter_warp_copy_func(void* reduce_data, num_warps)
1732/// shared smem[warp_size];
1733/// For all data entries D in reduce_data:
1734/// sync
1735/// If (I am the first lane in each warp)
1736/// Copy my local D to smem[warp_id]
1737/// sync
1738/// if (I am the first warp)
1739/// Copy smem[thread_id] to my local D
1740static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
1741 ArrayRef<const Expr *> Privates,
1742 QualType ReductionArrayTy,
1743 SourceLocation Loc) {
1744 ASTContext &C = CGM.getContext();
1745 llvm::Module &M = CGM.getModule();
1746
1747 // ReduceList: thread local Reduce list.
1748 // At the stage of the computation when this function is called, partially
1749 // aggregated values reside in the first lane of every active warp.
1750 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1751 C.VoidPtrTy, ImplicitParamDecl::Other);
1752 // NumWarps: number of warps active in the parallel region. This could
1753 // be smaller than 32 (max warps in a CTA) for partial block reduction.
1754 ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1755 C.getIntTypeForBitwidth(32, /* Signed */ true),
1756 ImplicitParamDecl::Other);
1757 FunctionArgList Args;
1758 Args.push_back(&ReduceListArg);
1759 Args.push_back(&NumWarpsArg);
1760
1761 const CGFunctionInfo &CGFI =
1762 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1763 auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
1764 llvm::GlobalValue::InternalLinkage,
1765 "_omp_reduction_inter_warp_copy_func", &M);
1766 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
1767 Fn->setDoesNotRecurse();
1768 CodeGenFunction CGF(CGM);
1769 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
1770
1771 CGBuilderTy &Bld = CGF.Builder;
1772
1773 // This array is used as a medium to transfer, one reduce element at a time,
1774 // the data from the first lane of every warp to lanes in the first warp
1775 // in order to perform the final step of a reduction in a parallel region
1776 // (reduction across warps). The array is placed in NVPTX __shared__ memory
1777 // for reduced latency, as well as to have a distinct copy for concurrently
1778 // executing target regions. The array is declared with common linkage so
1779 // as to be shared across compilation units.
1780 StringRef TransferMediumName =
1781 "__openmp_nvptx_data_transfer_temporary_storage";
1782 llvm::GlobalVariable *TransferMedium =
1783 M.getGlobalVariable(TransferMediumName);
1784 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
1785 if (!TransferMedium) {
11. Assuming 'TransferMedium' is non-null
12. Taking false branch
1786 auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
1787 unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
1788 TransferMedium = new llvm::GlobalVariable(
1789 M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
1790 llvm::UndefValue::get(Ty), TransferMediumName,
1791 /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
1792 SharedAddressSpace);
1793 CGM.addCompilerUsedGlobal(TransferMedium);
1794 }
1795
1796 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1797 // Get the CUDA thread id of the current OpenMP thread on the GPU.
1798 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1799 // nvptx_lane_id = nvptx_id % warpsize
1800 llvm::Value *LaneID = getNVPTXLaneID(CGF);
13. Calling 'getNVPTXLaneID'
1801 // nvptx_warp_id = nvptx_id / warpsize
1802 llvm::Value *WarpID = getNVPTXWarpID(CGF);
1803
1804 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1805 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
1806 Address LocalReduceList(
1807 Bld.CreatePointerBitCastOrAddrSpaceCast(
1808 CGF.EmitLoadOfScalar(
1809 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
1810 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
1811 ElemTy->getPointerTo()),
1812 ElemTy, CGF.getPointerAlign());
1813
1814 unsigned Idx = 0;
1815 for (const Expr *Private : Privates) {
1816 //
1817 // Warp master copies reduce element to transfer medium in __shared__
1818 // memory.
1819 //
1820 unsigned RealTySize =
1821 C.getTypeSizeInChars(Private->getType())
1822 .alignTo(C.getTypeAlignInChars(Private->getType()))
1823 .getQuantity();
1824 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
1825 unsigned NumIters = RealTySize / TySize;
1826 if (NumIters == 0)
1827 continue;
1828 QualType CType = C.getIntTypeForBitwidth(
1829 C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
1830 llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
1831 CharUnits Align = CharUnits::fromQuantity(TySize);
1832 llvm::Value *Cnt = nullptr;
1833 Address CntAddr = Address::invalid();
1834 llvm::BasicBlock *PrecondBB = nullptr;
1835 llvm::BasicBlock *ExitBB = nullptr;
1836 if (NumIters > 1) {
1837 CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
1838 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
1839 /*Volatile=*/false, C.IntTy);
1840 PrecondBB = CGF.createBasicBlock("precond");
1841 ExitBB = CGF.createBasicBlock("exit");
1842 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
1843 // There is no need to emit line number for unconditional branch.
1844 (void)ApplyDebugLocation::CreateEmpty(CGF);
1845 CGF.EmitBlock(PrecondBB);
1846 Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
1847 llvm::Value *Cmp =
1848 Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
1849 Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
1850 CGF.EmitBlock(BodyBB);
1851 }
1852 // kmpc_barrier.
1853 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1854 /*EmitChecks=*/false,
1855 /*ForceSimpleCall=*/true);
1856 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
1857 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
1858 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
1859
1860 // if (lane_id == 0)
1861 llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
1862 Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
1863 CGF.EmitBlock(ThenBB);
1864
1865 // Reduce element = LocalReduceList[i]
1866 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
1867 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
1868 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
1869 // elemptr = ((CopyType*)(elemptrptr)) + I
1870 Address ElemPtr(ElemPtrPtr, CGF.Int8Ty, Align);
1871 ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
1872 if (NumIters > 1)
1873 ElemPtr = Bld.CreateGEP(ElemPtr, Cnt);
1874
1875 // Get pointer to location in transfer medium.
1876 // MediumPtr = &medium[warp_id]
1877 llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
1878 TransferMedium->getValueType(), TransferMedium,
1879 {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
1880 // Casting to actual data type.
1881 // MediumPtr = (CopyType*)MediumPtrAddr;
1882 Address MediumPtr(
1883 Bld.CreateBitCast(
1884 MediumPtrVal,
1885 CopyType->getPointerTo(
1886 MediumPtrVal->getType()->getPointerAddressSpace())),
1887 CopyType, Align);
1888
1889 // elem = *elemptr
1890 //*MediumPtr = elem
1891 llvm::Value *Elem = CGF.EmitLoadOfScalar(
1892 ElemPtr, /*Volatile=*/false, CType, Loc,
1893 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
1894 // Store the source element value to the dest element address.
1895 CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
1896 LValueBaseInfo(AlignmentSource::Type),
1897 TBAAAccessInfo());
1898
1899 Bld.CreateBr(MergeBB);
1900
1901 CGF.EmitBlock(ElseBB);
1902 Bld.CreateBr(MergeBB);
1903
1904 CGF.EmitBlock(MergeBB);
1905
1906 // kmpc_barrier.
1907 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1908 /*EmitChecks=*/false,
1909 /*ForceSimpleCall=*/true);
1910
1911 //
1912 // Warp 0 copies reduce element from transfer medium.
1913 //
1914 llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
1915 llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
1916 llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
1917
1918 Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
1919 llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
1920 AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
1921
1922 // Up to 32 threads in warp 0 are active.
1923 llvm::Value *IsActiveThread =
1924 Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
1925 Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
1926
1927 CGF.EmitBlock(W0ThenBB);
1928
1929 // SrcMediumPtr = &medium[tid]
1930 llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
1931 TransferMedium->getValueType(), TransferMedium,
1932 {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
1933 // SrcMediumVal = *SrcMediumPtr;
1934 Address SrcMediumPtr(
1935 Bld.CreateBitCast(
1936 SrcMediumPtrVal,
1937 CopyType->getPointerTo(
1938 SrcMediumPtrVal->getType()->getPointerAddressSpace())),
1939 CopyType, Align);
1940
1941 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
1942 Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
1943 llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
1944 TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
1945 Address TargetElemPtr(TargetElemPtrVal, CGF.Int8Ty, Align);
1946 TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
1947 if (NumIters > 1)
1948 TargetElemPtr = Bld.CreateGEP(TargetElemPtr, Cnt);
1949
1950 // *TargetElemPtr = SrcMediumVal;
1951 llvm::Value *SrcMediumValue =
1952 CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
1953 CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
1954 CType);
1955 Bld.CreateBr(W0MergeBB);
1956
1957 CGF.EmitBlock(W0ElseBB);
1958 Bld.CreateBr(W0MergeBB);
1959
1960 CGF.EmitBlock(W0MergeBB);
1961
1962 if (NumIters > 1) {
1963 Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
1964 CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
1965 CGF.EmitBranch(PrecondBB);
1966 (void)ApplyDebugLocation::CreateEmpty(CGF);
1967 CGF.EmitBlock(ExitBB);
1968 }
1969 RealTySize %= TySize;
1970 }
1971 ++Idx;
1972 }
1973
1974 CGF.FinishFunction();
1975 return Fn;
1976}
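
A hedged sketch of the function body this helper emits, specialised to a single 32-bit reduce element. Smem stands for the __openmp_nvptx_data_transfer_temporary_storage array declared above and barrier() for the emitted __kmpc_barrier calls; all names are illustrative, not the generated symbols.

#include <cstdint>

extern "C" void barrier(); // stand-in for the emitted __kmpc_barrier call

static void interWarpCopySketch(void **ReduceList, int NumWarps,
                                volatile int32_t *Smem /* [WarpSize] */,
                                int ThreadId, int LaneId, int WarpId) {
  int32_t *Elem = static_cast<int32_t *>(ReduceList[0]);
  barrier();
  if (LaneId == 0)          // warp master publishes its partial value
    Smem[WarpId] = *Elem;
  barrier();
  if (ThreadId < NumWarps)  // the first NumWarps threads of warp 0 collect
    *Elem = Smem[ThreadId];
}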
1977
1978/// Emit a helper that reduces data across two OpenMP threads (lanes)
1979/// in the same warp. It uses shuffle instructions to copy over data from
1980/// a remote lane's stack. The reduction algorithm performed is specified
1981/// by the fourth parameter.
1982///
1983/// Algorithm Versions.
1984/// Full Warp Reduce (argument value 0):
1985/// This algorithm assumes that all 32 lanes are active and gathers
1986/// data from these 32 lanes, producing a single resultant value.
1987/// Contiguous Partial Warp Reduce (argument value 1):
1988/// This algorithm assumes that only a *contiguous* subset of lanes
1989/// are active. This happens for the last warp in a parallel region
1990/// when the user specified num_threads is not an integer multiple of
1991/// 32. This contiguous subset always starts with the zeroth lane.
1992/// Partial Warp Reduce (argument value 2):
1993/// This algorithm gathers data from any number of lanes at any position.
1994/// All reduced values are stored in the lowest possible lane. The set
1995/// of problems every algorithm addresses is a super set of those
1996/// addressable by algorithms with a lower version number. Overhead
1997/// increases as algorithm version increases.
1998///
1999/// Terminology
2000/// Reduce element:
2001/// Reduce element refers to the individual data field with primitive
2002/// data types to be combined and reduced across threads.
2003/// Reduce list:
2004/// Reduce list refers to a collection of local, thread-private
2005/// reduce elements.
2006/// Remote Reduce list:
2007/// Remote Reduce list refers to a collection of remote (relative to
2008/// the current thread) reduce elements.
2009///
2010/// We distinguish between three states of threads that are important to
2011/// the implementation of this function.
2012/// Alive threads:
2013/// Threads in a warp executing the SIMT instruction, as distinguished from
2014/// threads that are inactive due to divergent control flow.
2015/// Active threads:
2016/// The minimal set of threads that has to be alive upon entry to this
2017/// function. The computation is correct iff active threads are alive.
2018/// Some threads are alive but they are not active because they do not
2019/// contribute to the computation in any useful manner. Turning them off
2020/// may introduce control flow overheads without any tangible benefits.
2021/// Effective threads:
2022/// In order to comply with the argument requirements of the shuffle
2023/// function, we must keep all lanes holding data alive. But at most
2024/// half of them perform value aggregation; we refer to this half of
2025/// threads as effective. The other half is simply handing off their
2026/// data.
2027///
2028/// Procedure
2029/// Value shuffle:
2030/// In this step active threads transfer data from higher lane positions
2031/// in the warp to lower lane positions, creating Remote Reduce list.
2032/// Value aggregation:
2033/// In this step, effective threads combine their thread local Reduce list
2034/// with Remote Reduce list and store the result in the thread local
2035/// Reduce list.
2036/// Value copy:
2037/// In this step, we deal with the assumption made by algorithm 2
2038/// (i.e. contiguity assumption). When we have an odd number of lanes
2039/// active, say 2k+1, only k threads will be effective and therefore k
2040/// new values will be produced. However, the Reduce list owned by the
2041/// (2k+1)th thread is ignored in the value aggregation. Therefore
2042/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
2043/// that the contiguity assumption still holds.
2044static llvm::Function *emitShuffleAndReduceFunction(
2045 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2046 QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
2047 ASTContext &C = CGM.getContext();
2048
2049 // Thread local Reduce list used to host the values of data to be reduced.
2050 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2051 C.VoidPtrTy, ImplicitParamDecl::Other);
2052 // Current lane id; could be logical.
2053 ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2054 ImplicitParamDecl::Other);
2055 // Offset of the remote source lane relative to the current lane.
2056 ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2057 C.ShortTy, ImplicitParamDecl::Other);
2058 // Algorithm version. This is expected to be known at compile time.
2059 ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2060 C.ShortTy, ImplicitParamDecl::Other);
2061 FunctionArgList Args;
2062 Args.push_back(&ReduceListArg);
2063 Args.push_back(&LaneIDArg);
2064 Args.push_back(&RemoteLaneOffsetArg);
2065 Args.push_back(&AlgoVerArg);
2066
2067 const CGFunctionInfo &CGFI =
2068 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2069 auto *Fn = llvm::Function::Create(
2070 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2071 "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2072 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2073 Fn->setDoesNotRecurse();
2074
2075 CodeGenFunction CGF(CGM);
2076 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2077
2078 CGBuilderTy &Bld = CGF.Builder;
2079
2080 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2081 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2082 Address LocalReduceList(
2083 Bld.CreatePointerBitCastOrAddrSpaceCast(
2084 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2085 C.VoidPtrTy, SourceLocation()),
2086 ElemTy->getPointerTo()),
2087 ElemTy, CGF.getPointerAlign());
2088
2089 Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2090 llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2091 AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2092
2093 Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2094 llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2095 AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2096
2097 Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2098 llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2099 AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2100
2101 // Create a local thread-private variable to host the Reduce list
2102 // from a remote lane.
2103 Address RemoteReduceList =
2104 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2105
2106 // This loop iterates through the list of reduce elements and copies,
2107 // element by element, from a remote lane in the warp to RemoteReduceList,
2108 // hosted on the thread's stack.
2109 emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
2110 LocalReduceList, RemoteReduceList,
2111 {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
2112 /*ScratchpadIndex=*/nullptr,
2113 /*ScratchpadWidth=*/nullptr});
2114
2115   // The action to be performed on the Remote Reduce list depends on the
2116   // algorithm version.
2117 //
2118 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2119 // LaneId % 2 == 0 && Offset > 0):
2120 // do the reduction value aggregation
2121 //
2122 // The thread local variable Reduce list is mutated in place to host the
2123 // reduced data, which is the aggregated value produced from local and
2124 // remote lanes.
2125 //
2126 // Note that AlgoVer is expected to be a constant integer known at compile
2127 // time.
2128   // When AlgoVer==0, the first conjunction evaluates to true, making
2129   // the entire predicate true at compile time.
2130   // When AlgoVer==1, the second conjunction has only its second part to be
2131   // evaluated at runtime. The other conjunctions evaluate to false
2132   // at compile time.
2133   // When AlgoVer==2, the third conjunction has only its second part to be
2134   // evaluated at runtime. The other conjunctions evaluate to false
2135   // at compile time.
2136 llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
2137
2138 llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2139 llvm::Value *CondAlgo1 = Bld.CreateAnd(
2140 Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
2141
2142 llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
2143 llvm::Value *CondAlgo2 = Bld.CreateAnd(
2144 Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
2145 CondAlgo2 = Bld.CreateAnd(
2146 CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
2147
2148 llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
2149 CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
2150
2151 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2152 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2153 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2154 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
2155
2156 CGF.EmitBlock(ThenBB);
2157 // reduce_function(LocalReduceList, RemoteReduceList)
2158 llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2159 LocalReduceList.getPointer(), CGF.VoidPtrTy);
2160 llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2161 RemoteReduceList.getPointer(), CGF.VoidPtrTy);
2162 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2163 CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
2164 Bld.CreateBr(MergeBB);
2165
2166 CGF.EmitBlock(ElseBB);
2167 Bld.CreateBr(MergeBB);
2168
2169 CGF.EmitBlock(MergeBB);
2170
2171 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2172 // Reduce list.
2173 Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2174 llvm::Value *CondCopy = Bld.CreateAnd(
2175 Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
2176
2177 llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
2178 llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
2179 llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
2180 Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2181
2182 CGF.EmitBlock(CpyThenBB);
2183 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
2184 RemoteReduceList, LocalReduceList);
2185 Bld.CreateBr(CpyMergeBB);
2186
2187 CGF.EmitBlock(CpyElseBB);
2188 Bld.CreateBr(CpyMergeBB);
2189
2190 CGF.EmitBlock(CpyMergeBB);
2191
2192 CGF.FinishFunction();
2193 return Fn;
2194}
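
A hedged sketch of the two predicates assembled above. AlgoVer is a compile-time constant at every call site, so after inlining only one arm of each disjunction survives; the comparison signedness mirrors the ICmpULT/ICmpSGT/ICmpUGE choices in the code.

#include <cstdint>

inline bool shouldReduce(uint16_t AlgoVer, uint16_t LaneId, int16_t Offset) {
  return AlgoVer == 0 ||
         (AlgoVer == 1 && LaneId < static_cast<uint16_t>(Offset)) ||
         (AlgoVer == 2 && (LaneId & 1) == 0 && Offset > 0);
}

inline bool shouldCopyRemoteToLocal(uint16_t AlgoVer, uint16_t LaneId,
                                    int16_t Offset) {
  return AlgoVer == 1 && LaneId >= static_cast<uint16_t>(Offset);
}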
2195
2196/// This function emits a helper that copies all the reduction variables from
2197/// the team into the provided global buffer for the reduction variables.
2198///
2199/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
2200/// For all data entries D in reduce_data:
2201/// Copy local D to buffer.D[Idx]
2202static llvm::Value *emitListToGlobalCopyFunction(
2203 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2204 QualType ReductionArrayTy, SourceLocation Loc,
2205 const RecordDecl *TeamReductionRec,
2206 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2207 &VarFieldMap) {
2208 ASTContext &C = CGM.getContext();
2209
2210 // Buffer: global reduction buffer.
2211 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2212 C.VoidPtrTy, ImplicitParamDecl::Other);
2213 // Idx: index of the buffer.
2214 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2215 ImplicitParamDecl::Other);
2216 // ReduceList: thread local Reduce list.
2217 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2218 C.VoidPtrTy, ImplicitParamDecl::Other);
2219 FunctionArgList Args;
2220 Args.push_back(&BufferArg);
2221 Args.push_back(&IdxArg);
2222 Args.push_back(&ReduceListArg);
2223
2224 const CGFunctionInfo &CGFI =
2225 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2226 auto *Fn = llvm::Function::Create(
2227 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2228 "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
2229 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2230 Fn->setDoesNotRecurse();
2231 CodeGenFunction CGF(CGM);
2232 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2233
2234 CGBuilderTy &Bld = CGF.Builder;
2235
2236 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2237 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2238 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2239 Address LocalReduceList(
2240 Bld.CreatePointerBitCastOrAddrSpaceCast(
2241 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2242 C.VoidPtrTy, Loc),
2243 ElemTy->getPointerTo()),
2244 ElemTy, CGF.getPointerAlign());
2245 QualType StaticTy = C.getRecordType(TeamReductionRec);
2246 llvm::Type *LLVMReductionsBufferTy =
2247 CGM.getTypes().ConvertTypeForMem(StaticTy);
2248 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2249 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2250 LLVMReductionsBufferTy->getPointerTo());
2251 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2252 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2253 /*Volatile=*/false, C.IntTy,
2254 Loc)};
2255 unsigned Idx = 0;
2256 for (const Expr *Private : Privates) {
2257 // Reduce element = LocalReduceList[i]
2258 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2259 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2260 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2261 // elemptr = ((CopyType*)(elemptrptr)) + I
2262 ElemTy = CGF.ConvertTypeForMem(Private->getType());
2263 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2264 ElemPtrPtr, ElemTy->getPointerTo());
2265 Address ElemPtr =
2266 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2267 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2268 // Global = Buffer.VD[Idx];
2269 const FieldDecl *FD = VarFieldMap.lookup(VD);
2270 LValue GlobLVal = CGF.EmitLValueForField(
2271 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2272 Address GlobAddr = GlobLVal.getAddress(CGF);
2273 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(),
2274 GlobAddr.getPointer(), Idxs);
2275 GlobLVal.setAddress(Address(BufferPtr,
2276 CGF.ConvertTypeForMem(Private->getType()),
2277 GlobAddr.getAlignment()));
2278 switch (CGF.getEvaluationKind(Private->getType())) {
2279 case TEK_Scalar: {
2280 llvm::Value *V = CGF.EmitLoadOfScalar(
2281 ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
2282 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2283 CGF.EmitStoreOfScalar(V, GlobLVal);
2284 break;
2285 }
2286 case TEK_Complex: {
2287 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
2288 CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
2289 CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
2290 break;
2291 }
2292 case TEK_Aggregate:
2293 CGF.EmitAggregateCopy(GlobLVal,
2294 CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2295 Private->getType(), AggValueSlot::DoesNotOverlap);
2296 break;
2297 }
2298 ++Idx;
2299 }
2300
2301 CGF.FinishFunction();
2302 return Fn;
2303}
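
A hedged sketch of the emitted list_to_global_copy_func for a single scalar variable. The record layout (one per-slot array per reduction variable) is illustrative, inferred from the {0, Idx} GEP above; the field and function names are not the generated ones.

struct ReductionsBufferTy { // illustrative layout: one array field per variable
  double D0[1 /* number of buffer slots */];
};

extern "C" void listToGlobalCopySketch(void *Buffer, int Idx,
                                       void **ReduceList) {
  auto *Buf = static_cast<ReductionsBufferTy *>(Buffer);
  Buf->D0[Idx] = *static_cast<double *>(ReduceList[0]); // buffer.D0[Idx] = local D0
}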
2304
2305/// This function emits a helper that reduces all the reduction variables from
2306/// the team into the provided global buffer for the reduction variables.
2307///
2308/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
2309/// void *GlobPtrs[];
2310/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2311/// ...
2312/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2313/// reduce_function(GlobPtrs, reduce_data);
2314static llvm::Value *emitListToGlobalReduceFunction(
2315 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2316 QualType ReductionArrayTy, SourceLocation Loc,
2317 const RecordDecl *TeamReductionRec,
2318 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2319 &VarFieldMap,
2320 llvm::Function *ReduceFn) {
2321 ASTContext &C = CGM.getContext();
2322
2323 // Buffer: global reduction buffer.
2324 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2325 C.VoidPtrTy, ImplicitParamDecl::Other);
2326 // Idx: index of the buffer.
2327 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2328 ImplicitParamDecl::Other);
2329 // ReduceList: thread local Reduce list.
2330 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2331 C.VoidPtrTy, ImplicitParamDecl::Other);
2332 FunctionArgList Args;
2333 Args.push_back(&BufferArg);
2334 Args.push_back(&IdxArg);
2335 Args.push_back(&ReduceListArg);
2336
2337 const CGFunctionInfo &CGFI =
2338 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2339 auto *Fn = llvm::Function::Create(
2340 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2341 "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
2342 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2343 Fn->setDoesNotRecurse();
2344 CodeGenFunction CGF(CGM);
2345 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2346
2347 CGBuilderTy &Bld = CGF.Builder;
2348
2349 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2350 QualType StaticTy = C.getRecordType(TeamReductionRec);
2351 llvm::Type *LLVMReductionsBufferTy =
2352 CGM.getTypes().ConvertTypeForMem(StaticTy);
2353 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2354 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2355 LLVMReductionsBufferTy->getPointerTo());
2356
2357 // 1. Build a list of reduction variables.
2358 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2359 Address ReductionList =
2360 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2361 auto IPriv = Privates.begin();
2362 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2363 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2364 /*Volatile=*/false, C.IntTy,
2365 Loc)};
2366 unsigned Idx = 0;
2367 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2368 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2369 // Global = Buffer.VD[Idx];
2370 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2371 const FieldDecl *FD = VarFieldMap.lookup(VD);
2372 LValue GlobLVal = CGF.EmitLValueForField(
2373 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2374 Address GlobAddr = GlobLVal.getAddress(CGF);
2375 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2376 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2377 llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
2378 CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
2379 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2380 // Store array size.
2381 ++Idx;
2382 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2383 llvm::Value *Size = CGF.Builder.CreateIntCast(
2384 CGF.getVLASize(
2385 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2386 .NumElts,
2387 CGF.SizeTy, /*isSigned=*/false);
2388 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2389 Elem);
2390 }
2391 }
2392
2393 // Call reduce_function(GlobalReduceList, ReduceList)
2394 llvm::Value *GlobalReduceList =
2395 CGF.EmitCastToVoidPtr(ReductionList.getPointer());
2396 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2397 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2398 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2399 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2400 CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
2401 CGF.FinishFunction();
2402 return Fn;
2403}
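
A hedged companion sketch for list_to_global_reduce_func: it builds a pointer list into the global slots and calls reduce_function with the global list first, the operand order noted in the doc comment above (the global_to_list variant further below swaps the two operands).

extern "C" void listToGlobalReduceSketch(void *Buffer, int Idx,
                                         void **ReduceList,
                                         void (*ReduceFn)(void *, void *)) {
  // Same illustrative ReductionsBufferTy layout as in the previous sketch.
  auto *Buf = static_cast<ReductionsBufferTy *>(Buffer);
  void *GlobPtrs[1] = {&Buf->D0[Idx]};
  ReduceFn(GlobPtrs, ReduceList); // reduce_function(GlobPtrs, reduce_data)
}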
2404
2405 /// This function emits a helper that copies the reduction variables from the
2406 /// provided global buffer back into the team's local reduce list.
2407 ///
2408 /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
2409/// For all data entries D in reduce_data:
2410/// Copy buffer.D[Idx] to local D;
2411static llvm::Value *emitGlobalToListCopyFunction(
2412 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2413 QualType ReductionArrayTy, SourceLocation Loc,
2414 const RecordDecl *TeamReductionRec,
2415 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2416 &VarFieldMap) {
2417 ASTContext &C = CGM.getContext();
2418
2419 // Buffer: global reduction buffer.
2420 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2421 C.VoidPtrTy, ImplicitParamDecl::Other);
2422 // Idx: index of the buffer.
2423 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2424 ImplicitParamDecl::Other);
2425 // ReduceList: thread local Reduce list.
2426 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2427 C.VoidPtrTy, ImplicitParamDecl::Other);
2428 FunctionArgList Args;
2429 Args.push_back(&BufferArg);
2430 Args.push_back(&IdxArg);
2431 Args.push_back(&ReduceListArg);
2432
2433 const CGFunctionInfo &CGFI =
2434 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2435 auto *Fn = llvm::Function::Create(
2436 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2437 "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
2438 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2439 Fn->setDoesNotRecurse();
2440 CodeGenFunction CGF(CGM);
2441 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2442
2443 CGBuilderTy &Bld = CGF.Builder;
2444
2445 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2446 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2447 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2448 Address LocalReduceList(
2449 Bld.CreatePointerBitCastOrAddrSpaceCast(
2450 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2451 C.VoidPtrTy, Loc),
2452 ElemTy->getPointerTo()),
2453 ElemTy, CGF.getPointerAlign());
2454 QualType StaticTy = C.getRecordType(TeamReductionRec);
2455 llvm::Type *LLVMReductionsBufferTy =
2456 CGM.getTypes().ConvertTypeForMem(StaticTy);
2457 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2458 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2459 LLVMReductionsBufferTy->getPointerTo());
2460
2461 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2462 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2463 /*Volatile=*/false, C.IntTy,
2464 Loc)};
2465 unsigned Idx = 0;
2466 for (const Expr *Private : Privates) {
2467 // Reduce element = LocalReduceList[i]
2468 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2469 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2470 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2471 // elemptr = ((CopyType*)(elemptrptr)) + I
2472 ElemTy = CGF.ConvertTypeForMem(Private->getType());
2473 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2474 ElemPtrPtr, ElemTy->getPointerTo());
2475 Address ElemPtr =
2476 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2477 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2478 // Global = Buffer.VD[Idx];
2479 const FieldDecl *FD = VarFieldMap.lookup(VD);
2480 LValue GlobLVal = CGF.EmitLValueForField(
2481 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2482 Address GlobAddr = GlobLVal.getAddress(CGF);
2483 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(),
2484 GlobAddr.getPointer(), Idxs);
2485 GlobLVal.setAddress(Address(BufferPtr,
2486 CGF.ConvertTypeForMem(Private->getType()),
2487 GlobAddr.getAlignment()));
2488 switch (CGF.getEvaluationKind(Private->getType())) {
2489 case TEK_Scalar: {
2490 llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
2491 CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
2492 LValueBaseInfo(AlignmentSource::Type),
2493 TBAAAccessInfo());
2494 break;
2495 }
2496 case TEK_Complex: {
2497 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
2498 CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2499 /*isInit=*/false);
2500 break;
2501 }
2502 case TEK_Aggregate:
2503 CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2504 GlobLVal, Private->getType(),
2505 AggValueSlot::DoesNotOverlap);
2506 break;
2507 }
2508 ++Idx;
2509 }
2510
2511 CGF.FinishFunction();
2512 return Fn;
2513}
2514
2515 /// This function emits a helper that reduces the values in the provided global
2516 /// buffer into the team's local reduce list via reduce_function.
2517///
2518/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
2519/// void *GlobPtrs[];
2520/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2521/// ...
2522/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2523/// reduce_function(reduce_data, GlobPtrs);
2524static llvm::Value *emitGlobalToListReduceFunction(
2525 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2526 QualType ReductionArrayTy, SourceLocation Loc,
2527 const RecordDecl *TeamReductionRec,
2528 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2529 &VarFieldMap,
2530 llvm::Function *ReduceFn) {
2531 ASTContext &C = CGM.getContext();
2532
2533 // Buffer: global reduction buffer.
2534 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2535 C.VoidPtrTy, ImplicitParamDecl::Other);
2536 // Idx: index of the buffer.
2537 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2538 ImplicitParamDecl::Other);
2539 // ReduceList: thread local Reduce list.
2540 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2541 C.VoidPtrTy, ImplicitParamDecl::Other);
2542 FunctionArgList Args;
2543 Args.push_back(&BufferArg);
2544 Args.push_back(&IdxArg);
2545 Args.push_back(&ReduceListArg);
2546
2547 const CGFunctionInfo &CGFI =
2548 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2549 auto *Fn = llvm::Function::Create(
2550 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2551 "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
2552 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2553 Fn->setDoesNotRecurse();
2554 CodeGenFunction CGF(CGM);
2555 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2556
2557 CGBuilderTy &Bld = CGF.Builder;
2558
2559 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2560 QualType StaticTy = C.getRecordType(TeamReductionRec);
2561 llvm::Type *LLVMReductionsBufferTy =
2562 CGM.getTypes().ConvertTypeForMem(StaticTy);
2563 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2564 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2565 LLVMReductionsBufferTy->getPointerTo());
2566
2567 // 1. Build a list of reduction variables.
2568 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2569 Address ReductionList =
2570 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2571 auto IPriv = Privates.begin();
2572 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2573 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2574 /*Volatile=*/false, C.IntTy,
2575 Loc)};
2576 unsigned Idx = 0;
2577 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2578 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2579 // Global = Buffer.VD[Idx];
2580 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2581 const FieldDecl *FD = VarFieldMap.lookup(VD);
2582 LValue GlobLVal = CGF.EmitLValueForField(
2583 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2584 Address GlobAddr = GlobLVal.getAddress(CGF);
2585 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2586 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2587 llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
2588 CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
2589 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2590 // Store array size.
2591 ++Idx;
2592 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2593 llvm::Value *Size = CGF.Builder.CreateIntCast(
2594 CGF.getVLASize(
2595 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2596 .NumElts,
2597 CGF.SizeTy, /*isSigned=*/false);
2598 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2599 Elem);
2600 }
2601 }
2602
2603 // Call reduce_function(ReduceList, GlobalReduceList)
2604 llvm::Value *GlobalReduceList =
2605 CGF.EmitCastToVoidPtr(ReductionList.getPointer());
2606 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2607 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2608 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2609 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2610 CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
2611 CGF.FinishFunction();
2612 return Fn;
2613}
2614
2615///
2616/// Design of OpenMP reductions on the GPU
2617///
2618/// Consider a typical OpenMP program with one or more reduction
2619/// clauses:
2620///
2621/// float foo;
2622/// double bar;
2623/// #pragma omp target teams distribute parallel for \
2624/// reduction(+:foo) reduction(*:bar)
2625/// for (int i = 0; i < N; i++) {
2626/// foo += A[i]; bar *= B[i];
2627/// }
2628///
2629/// where 'foo' and 'bar' are reduced across all OpenMP threads in
2630/// all teams. In our OpenMP implementation on the NVPTX device an
2631/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
2632/// within a team are mapped to CUDA threads within a threadblock.
2633/// Our goal is to efficiently aggregate values across all OpenMP
2634/// threads such that:
2635///
2636/// - the compiler and runtime are logically concise, and
2637/// - the reduction is performed efficiently in a hierarchical
2638/// manner as follows: within OpenMP threads in the same warp,
2639/// across warps in a threadblock, and finally across teams on
2640/// the NVPTX device.
2641///
2642/// Introduction to Decoupling
2643///
2644/// We would like to decouple the compiler and the runtime so that the
2645/// latter is ignorant of the reduction variables (number, data types)
2646/// and the reduction operators. This allows a simpler interface
2647/// and implementation while still attaining good performance.
2648///
2649/// Pseudocode for the aforementioned OpenMP program generated by the
2650/// compiler is as follows:
2651///
2652/// 1. Create private copies of reduction variables on each OpenMP
2653/// thread: 'foo_private', 'bar_private'
2654/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
2655/// to it and writes the result in 'foo_private' and 'bar_private'
2656/// respectively.
2657/// 3. Call the OpenMP runtime on the GPU to reduce within a team
2658/// and store the result on the team master:
2659///
2660/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
2661/// reduceData, shuffleReduceFn, interWarpCpyFn)
2662///
2663/// where:
2664/// struct ReduceData {
2665/// float *foo;
2666/// double *bar;
2667/// } reduceData
2668/// reduceData.foo = &foo_private
2669/// reduceData.bar = &bar_private
2670///
2671/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
2672/// auxiliary functions generated by the compiler that operate on
2673/// variables of type 'ReduceData'. They aid the runtime perform
2674/// algorithmic steps in a data agnostic manner.
2675///
2676/// 'shuffleReduceFn' is a pointer to a function that reduces data
2677/// of type 'ReduceData' across two OpenMP threads (lanes) in the
2678/// same warp. It takes the following arguments as input:
2679///
2680/// a. variable of type 'ReduceData' on the calling lane,
2681/// b. its lane_id,
2682/// c. an offset relative to the current lane_id to generate a
2683/// remote_lane_id. The remote lane contains the second
2684/// variable of type 'ReduceData' that is to be reduced.
2685/// d. an algorithm version parameter determining which reduction
2686/// algorithm to use.
2687///
2688/// 'shuffleReduceFn' retrieves data from the remote lane using
2689/// efficient GPU shuffle intrinsics and reduces, using the
2690/// algorithm specified by the 4th parameter, the two operands
2691/// element-wise. The result is written to the first operand.
2692///
2693/// Different reduction algorithms are implemented in different
2694/// runtime functions, all calling 'shuffleReduceFn' to perform
2695/// the essential reduction step. Therefore, based on the 4th
2696/// parameter, this function behaves slightly differently to
2697/// cooperate with the runtime to ensure correctness under
2698/// different circumstances.
2699///
2700/// 'InterWarpCpyFn' is a pointer to a function that transfers
2701/// reduced variables across warps. It tunnels, through CUDA
2702/// shared memory, the thread-private data of type 'ReduceData'
2703/// from lane 0 of each warp to a lane in the first warp.
2704/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
2705/// The last team writes the global reduced value to memory.
2706///
2707/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
2708/// reduceData, shuffleReduceFn, interWarpCpyFn,
2709/// scratchpadCopyFn, loadAndReduceFn)
2710///
2711/// 'scratchpadCopyFn' is a helper that stores reduced
2712/// data from the team master to a scratchpad array in
2713/// global memory.
2714///
2715/// 'loadAndReduceFn' is a helper that loads data from
2716/// the scratchpad array and reduces it with the input
2717/// operand.
2718///
2719/// These compiler generated functions hide address
2720/// calculation and alignment information from the runtime.
2721/// 5. if ret == 1:
2722/// The team master of the last team stores the reduced
2723/// result to the globals in memory.
2724/// foo += reduceData.foo; bar *= reduceData.bar
2725///
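///
/// As a concrete illustration of this decoupling, a hand-written sketch of
/// what 'ReduceData' and 'shuffleReduceFn' amount to for the example above
/// (the compiler actually emits LLVM IR; the algo_version/lane_id handling
/// described below is omitted for brevity):
///
///   struct ReduceData { float *foo; double *bar; };
///
///   __device__ void shuffleReduceFn(void *reduce_data, int16_t lane_id,
///                                   int16_t lane_offset, int16_t algo_ver) {
///     ReduceData *rd = (ReduceData *)reduce_data;
///     // Fetch the remote lane's elements with warp shuffles and combine
///     // them element-wise into the first operand.
///     *rd->foo += __shfl_down_sync(0xffffffffu, *rd->foo, lane_offset);
///     *rd->bar *= __shfl_down_sync(0xffffffffu, *rd->bar, lane_offset);
///   }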
2726///
2727/// Warp Reduction Algorithms
2728///
2729/// On the warp level, we have three algorithms implemented in the
2730/// OpenMP runtime depending on the number of active lanes:
2731///
2732/// Full Warp Reduction
2733///
2734/// The reduce algorithm within a warp where all lanes are active
2735/// is implemented in the runtime as follows:
2736///
2737/// full_warp_reduce(void *reduce_data,
2738/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2739/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
2740/// ShuffleReduceFn(reduce_data, 0, offset, 0);
2741/// }
2742///
2743/// The algorithm completes in log(2, WARPSIZE) steps.
2744///
2745/// 'ShuffleReduceFn' is called here with lane_id set to 0 because it is
2746/// not used; this saves instructions by not retrieving lane_id
2747/// from the corresponding special registers. The 4th parameter, which
2748/// represents the version of the algorithm being used, is set to 0 to
2749/// signify full warp reduction.
2750///
2751/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2752///
2753/// #reduce_elem refers to an element in the local lane's data structure
2754/// #remote_elem is retrieved from a remote lane
2755/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2756/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
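///
/// For a single float this per-element step is the classic warp-reduction
/// idiom; an illustrative CUDA sketch (not the generated code):
///
///   __device__ float full_warp_sum(float val) {
///     for (int offset = 16; offset > 0; offset /= 2)
///       val += __shfl_down_sync(0xffffffffu, val, offset);
///     return val; // lane 0 now holds the sum over all 32 lanes
///   }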
2757///
2758/// Contiguous Partial Warp Reduction
2759///
2760/// This reduce algorithm is used within a warp where only the first
2761/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
2762/// number of OpenMP threads in a parallel region is not a multiple of
2763/// WARPSIZE. The algorithm is implemented in the runtime as follows:
2764///
2765/// void
2766/// contiguous_partial_reduce(void *reduce_data,
2767/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
2768/// int size, int lane_id) {
2769/// int curr_size;
2770/// int offset;
2771/// curr_size = size;
2772/// offset = curr_size/2;
2773/// while (offset>0) {
2774/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
2775/// curr_size = (curr_size+1)/2;
2776/// offset = curr_size/2;
2777/// }
2778/// }
2779///
2780/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2781///
2782/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2783/// if (lane_id < offset)
2784/// reduce_elem = reduce_elem REDUCE_OP remote_elem
2785/// else
2786/// reduce_elem = remote_elem
2787///
2788/// This algorithm assumes that the data to be reduced are located in a
2789/// contiguous subset of lanes starting from the first. When there is
2790/// an odd number of active lanes, the data in the last lane is not
2791/// aggregated with any other lane's data but is instead copied over.
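///
/// A scalar CUDA sketch of one step of this version (illustrative only;
/// lanes whose source lane is out of range receive a garbage value that is
/// never consumed):
///
///   __device__ float contiguous_step(float val, int lane_id, int offset) {
///     float remote = __shfl_down_sync(0xffffffffu, val, offset);
///     return lane_id < offset ? val + remote : remote;
///   }
///
/// For example, with size == 5 the offsets visited are 2, 1 and 1; after the
/// last step lane 0 holds the value reduced over all five active lanes.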
2792///
2793/// Dispersed Partial Warp Reduction
2794///
2795/// This algorithm is used within a warp when any discontiguous subset of
2796/// lanes is active. It is used to implement the reduction operation
2797/// across lanes in an OpenMP simd region or in a nested parallel region.
2798///
2799/// void
2800/// dispersed_partial_reduce(void *reduce_data,
2801/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2802/// int size, remote_id;
2803/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
2804/// do {
2805/// remote_id = next_active_lane_id_right_after_me();
2806/// # the above function returns 0 if no active lane
2807/// # is present right after the current lane.
2808/// size = number_of_active_lanes_in_this_warp();
2809/// logical_lane_id /= 2;
2810/// ShuffleReduceFn(reduce_data, logical_lane_id,
2811/// remote_id-1-threadIdx.x, 2);
2812/// } while (logical_lane_id % 2 == 0 && size > 1);
2813/// }
2814///
2815/// There is no assumption made about the initial state of the reduction.
2816/// Any number of lanes (>=1) could be active at any position. The reduction
2817/// result is returned in the first active lane.
2818///
2819/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2820///
2821/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2822/// if (lane_id % 2 == 0 && offset > 0)
2823/// reduce_elem = reduce_elem REDUCE_OP remote_elem
2824/// else
2825/// reduce_elem = remote_elem
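///
/// 'number_of_active_lanes_before_me()' can be derived from the warp's
/// active mask; a CUDA sketch assuming a 1-D block (not the runtime's
/// actual implementation):
///
///   __device__ int logical_lane_id() {
///     unsigned mask = __activemask();                 // currently active lanes
///     unsigned lane = threadIdx.x % 32;
///     unsigned before_me = mask & ((1u << lane) - 1); // active lanes below us
///     return 2 * __popc(before_me);
///   }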
2826///
2827///
2828/// Intra-Team Reduction
2829///
2830/// This function, as implemented in the runtime call
2831/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
2832/// threads in a team. It first reduces within a warp using the
2833/// aforementioned algorithms. We then proceed to gather all such
2834/// reduced values at the first warp.
2835///
2836/// The runtime makes use of the function 'InterWarpCpyFn', which copies
2837/// data from each of the "warp masters" (zeroth lane of each warp, where
2838/// warp-reduced data is held) to the zeroth warp. This step reduces (in
2839/// a mathematical sense) the problem of reduction across warp masters in
2840/// a block to the problem of warp reduction.
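///
/// The overall intra-team shape is the familiar two-level pattern; a CUDA
/// sketch of the idea (reusing the 'full_warp_sum' sketch above, assuming at
/// most 32 warps per block; not the generated code):
///
///   __device__ float block_sum(float val) {
///     __shared__ float warp_results[32];       // one slot per warp master
///     int lane = threadIdx.x % 32, warp = threadIdx.x / 32;
///     val = full_warp_sum(val);                // reduce within each warp
///     if (lane == 0)
///       warp_results[warp] = val;              // warp masters publish
///     __syncthreads();
///     if (warp == 0) {                         // warp 0 reduces the masters
///       int num_warps = (blockDim.x + 31) / 32;
///       val = lane < num_warps ? warp_results[lane] : 0.0f;
///       val = full_warp_sum(val);
///     }
///     return val;                              // lane 0 of warp 0 has it
///   }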
2841///
2842///
2843/// Inter-Team Reduction
2844///
2845/// Once a team has reduced its data to a single value, it is stored in
2846/// a global scratchpad array. Since each team has a distinct slot, this
2847/// can be done without locking.
2848///
2849/// The last team to write to the scratchpad array proceeds to reduce the
2850/// scratchpad array. One or more workers in the last team use the helper
2851/// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
2852/// the k'th worker reduces every k'th element.
2853///
2854/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
2855/// reduce across workers and compute a globally reduced value.
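///
/// The "last team" detection behind this step typically relies on a global
/// counter; a CUDA sketch of that pattern, executed by each team master
/// (not the runtime's code):
///
///   __device__ unsigned teams_done = 0;
///
///   __device__ bool publish_and_check_last(float *scratchpad, float v) {
///     scratchpad[blockIdx.x] = v;              // distinct slot per team
///     __threadfence();                         // make the store visible
///     unsigned prev = atomicInc(&teams_done, gridDim.x - 1);
///     return prev == gridDim.x - 1;            // true only in the last team
///   }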
2856///
2857void CGOpenMPRuntimeGPU::emitReduction(
2858 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
2859 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
2860 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
2861 if (!CGF.HaveInsertPoint())
1
Taking false branch
2862 return;
2863
2864 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
2865#ifndef NDEBUG
2866 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
2867#endif
2868
2869 if (Options.SimpleReduction) {
2
Assuming field 'SimpleReduction' is false
2870 assert(!TeamsReduction && !ParallelReduction &&(static_cast <bool> (!TeamsReduction && !ParallelReduction
&& "Invalid reduction selection in emitReduction.") ?
void (0) : __assert_fail ("!TeamsReduction && !ParallelReduction && \"Invalid reduction selection in emitReduction.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 2871, __extension__
__PRETTY_FUNCTION__))
2871 "Invalid reduction selection in emitReduction.")(static_cast <bool> (!TeamsReduction && !ParallelReduction
&& "Invalid reduction selection in emitReduction.") ?
void (0) : __assert_fail ("!TeamsReduction && !ParallelReduction && \"Invalid reduction selection in emitReduction.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 2871, __extension__
__PRETTY_FUNCTION__))
;
2872 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
2873 ReductionOps, Options);
2874 return;
2875 }
2876
2877 assert((TeamsReduction || ParallelReduction) &&(static_cast <bool> ((TeamsReduction || ParallelReduction
) && "Invalid reduction selection in emitReduction.")
? void (0) : __assert_fail ("(TeamsReduction || ParallelReduction) && \"Invalid reduction selection in emitReduction.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 2878, __extension__
__PRETTY_FUNCTION__))
3
Taking false branch
4
Assuming 'TeamsReduction' is false
5
Assuming 'ParallelReduction' is true
6
'?' condition is true
2878 "Invalid reduction selection in emitReduction.")(static_cast <bool> ((TeamsReduction || ParallelReduction
) && "Invalid reduction selection in emitReduction.")
? void (0) : __assert_fail ("(TeamsReduction || ParallelReduction) && \"Invalid reduction selection in emitReduction.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 2878, __extension__
__PRETTY_FUNCTION__))
;
2879
2880 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
2881 // RedList, shuffle_reduce_func, interwarp_copy_func);
2882 // or
2883 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
2884 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2885 llvm::Value *ThreadId = getThreadID(CGF, Loc);
2886
2887 llvm::Value *Res;
2888 ASTContext &C = CGM.getContext();
2889 // 1. Build a list of reduction variables.
2890 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2891 auto Size = RHSExprs.size();
2892 for (const Expr *E : Privates) {
7
Assuming '__begin1' is equal to '__end1'
2893 if (E->getType()->isVariablyModifiedType())
2894 // Reserve place for array size.
2895 ++Size;
2896 }
2897 llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
2898 QualType ReductionArrayTy =
2899 C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
2900 /*IndexTypeQuals=*/0);
2901 Address ReductionList =
2902 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2903 auto IPriv = Privates.begin();
2904 unsigned Idx = 0;
2905 for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
8
Assuming 'I' is >= 'E'
9
Loop condition is false. Execution continues on line 2925
2906 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2907 CGF.Builder.CreateStore(
2908 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2909 CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
2910 Elem);
2911 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2912 // Store array size.
2913 ++Idx;
2914 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2915 llvm::Value *Size = CGF.Builder.CreateIntCast(
2916 CGF.getVLASize(
2917 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2918 .NumElts,
2919 CGF.SizeTy, /*isSigned=*/false);
2920 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2921 Elem);
2922 }
2923 }
2924
2925 llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2926 ReductionList.getPointer(), CGF.VoidPtrTy);
2927 llvm::Function *ReductionFn = emitReductionFunction(
2928 CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
2929 Privates, LHSExprs, RHSExprs, ReductionOps);
2930 llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
2931 llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
2932 CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
2933 llvm::Value *InterWarpCopyFn =
2934 emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
10
Calling 'emitInterWarpCopyFunction'
2935
2936 if (ParallelReduction) {
2937 llvm::Value *Args[] = {RTLoc,
2938 ThreadId,
2939 CGF.Builder.getInt32(RHSExprs.size()),
2940 ReductionArrayTySize,
2941 RL,
2942 ShuffleAndReduceFn,
2943 InterWarpCopyFn};
2944
2945 Res = CGF.EmitRuntimeCall(
2946 OMPBuilder.getOrCreateRuntimeFunction(
2947 CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
2948 Args);
2949 } else {
2950 assert(TeamsReduction && "expected teams reduction.")(static_cast <bool> (TeamsReduction && "expected teams reduction."
) ? void (0) : __assert_fail ("TeamsReduction && \"expected teams reduction.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 2950, __extension__
__PRETTY_FUNCTION__))
;
2951 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
2952 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
2953 int Cnt = 0;
2954 for (const Expr *DRE : Privates) {
2955 PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
2956 ++Cnt;
2957 }
2958 const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
2959 CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap,
2960 C.getLangOpts().OpenMPCUDAReductionBufNum);
2961 TeamsReductions.push_back(TeamReductionRec);
2962 if (!KernelTeamsReductionPtr) {
2963 KernelTeamsReductionPtr = new llvm::GlobalVariable(
2964 CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
2965 llvm::GlobalValue::InternalLinkage, nullptr,
2966 "_openmp_teams_reductions_buffer_$_$ptr");
2967 }
2968 llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
2969 Address(KernelTeamsReductionPtr, CGF.VoidPtrTy, CGM.getPointerAlign()),
2970 /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
2971 llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
2972 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
2973 llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
2974 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
2975 ReductionFn);
2976 llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
2977 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
2978 llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
2979 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
2980 ReductionFn);
2981
2982 llvm::Value *Args[] = {
2983 RTLoc,
2984 ThreadId,
2985 GlobalBufferPtr,
2986 CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
2987 RL,
2988 ShuffleAndReduceFn,
2989 InterWarpCopyFn,
2990 GlobalToBufferCpyFn,
2991 GlobalToBufferRedFn,
2992 BufferToGlobalCpyFn,
2993 BufferToGlobalRedFn};
2994
2995 Res = CGF.EmitRuntimeCall(
2996 OMPBuilder.getOrCreateRuntimeFunction(
2997 CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
2998 Args);
2999 }
3000
3001 // 5. Build if (res == 1)
3002 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
3003 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
3004 llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
3005 Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
3006 CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3007
3008 // 6. Build then branch: where we have reduced values in the master
3009 // thread in each team.
3010 // __kmpc_end_reduce{_nowait}(<gtid>);
3011 // break;
3012 CGF.EmitBlock(ThenBB);
3013
3014 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3015 auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
3016 this](CodeGenFunction &CGF, PrePostActionTy &Action) {
3017 auto IPriv = Privates.begin();
3018 auto ILHS = LHSExprs.begin();
3019 auto IRHS = RHSExprs.begin();
3020 for (const Expr *E : ReductionOps) {
3021 emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
3022 cast<DeclRefExpr>(*IRHS));
3023 ++IPriv;
3024 ++ILHS;
3025 ++IRHS;
3026 }
3027 };
3028 llvm::Value *EndArgs[] = {ThreadId};
3029 RegionCodeGenTy RCG(CodeGen);
3030 NVPTXActionTy Action(
3031 nullptr, std::nullopt,
3032 OMPBuilder.getOrCreateRuntimeFunction(
3033 CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
3034 EndArgs);
3035 RCG.setAction(Action);
3036 RCG(CGF);
3037 // There is no need to emit line number for unconditional branch.
3038 (void)ApplyDebugLocation::CreateEmpty(CGF);
3039 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
3040}
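// For a parallel reduction over a single scalar the code emitted above
// corresponds roughly to the following sketch (not literal output; names are
// illustrative):
//
//   void *red_list[1] = {&foo_rhs};
//   int32_t res = __kmpc_nvptx_parallel_reduce_nowait_v2(
//       loc, tid, /*num_vars=*/1, sizeof(red_list), red_list,
//       shuffle_and_reduce_fn, inter_warp_copy_fn);
//   if (res == 1) {
//     foo += foo_rhs;                      // combiner from ReductionOps
//     __kmpc_nvptx_end_reduce_nowait(tid);
//   }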
3041
3042const VarDecl *
3043CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
3044 const VarDecl *NativeParam) const {
3045 if (!NativeParam->getType()->isReferenceType())
3046 return NativeParam;
3047 QualType ArgType = NativeParam->getType();
3048 QualifierCollector QC;
3049 const Type *NonQualTy = QC.strip(ArgType);
3050 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3051 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
3052 if (Attr->getCaptureKind() == OMPC_map) {
3053 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3054 LangAS::opencl_global);
3055 }
3056 }
3057 ArgType = CGM.getContext().getPointerType(PointeeTy);
3058 QC.addRestrict();
3059 enum { NVPTX_local_addr = 5 };
3060 QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
3061 ArgType = QC.apply(CGM.getContext(), ArgType);
3062 if (isa<ImplicitParamDecl>(NativeParam))
3063 return ImplicitParamDecl::Create(
3064 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3065 NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
3066 return ParmVarDecl::Create(
3067 CGM.getContext(),
3068 const_cast<DeclContext *>(NativeParam->getDeclContext()),
3069 NativeParam->getBeginLoc(), NativeParam->getLocation(),
3070 NativeParam->getIdentifier(), ArgType,
3071 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3072}
3073
3074Address
3075CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
3076 const VarDecl *NativeParam,
3077 const VarDecl *TargetParam) const {
3078 assert(NativeParam != TargetParam &&(static_cast <bool> (NativeParam != TargetParam &&
NativeParam->getType()->isReferenceType() && "Native arg must not be the same as target arg."
) ? void (0) : __assert_fail ("NativeParam != TargetParam && NativeParam->getType()->isReferenceType() && \"Native arg must not be the same as target arg.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3080, __extension__
__PRETTY_FUNCTION__))
3079 NativeParam->getType()->isReferenceType() &&(static_cast <bool> (NativeParam != TargetParam &&
NativeParam->getType()->isReferenceType() && "Native arg must not be the same as target arg."
) ? void (0) : __assert_fail ("NativeParam != TargetParam && NativeParam->getType()->isReferenceType() && \"Native arg must not be the same as target arg.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3080, __extension__
__PRETTY_FUNCTION__))
3080 "Native arg must not be the same as target arg.")(static_cast <bool> (NativeParam != TargetParam &&
NativeParam->getType()->isReferenceType() && "Native arg must not be the same as target arg."
) ? void (0) : __assert_fail ("NativeParam != TargetParam && NativeParam->getType()->isReferenceType() && \"Native arg must not be the same as target arg.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3080, __extension__
__PRETTY_FUNCTION__))
;
3081 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
3082 QualType NativeParamType = NativeParam->getType();
3083 QualifierCollector QC;
3084 const Type *NonQualTy = QC.strip(NativeParamType);
3085 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3086 unsigned NativePointeeAddrSpace =
3087 CGF.getTypes().getTargetAddressSpace(NativePointeeTy);
3088 QualType TargetTy = TargetParam->getType();
3089 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
3090 LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
3091 // First cast to generic.
3092 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3093 TargetAddr, llvm::PointerType::getWithSamePointeeType(
3094 cast<llvm::PointerType>(TargetAddr->getType()), /*AddrSpace=*/0));
3095 // Cast from generic to native address space.
3096 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3097 TargetAddr, llvm::PointerType::getWithSamePointeeType(
3098 cast<llvm::PointerType>(TargetAddr->getType()),
3099 NativePointeeAddrSpace));
3100 Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3101 CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
3102 NativeParamType);
3103 return NativeParamAddr;
3104}
3105
3106void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3107 CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3108 ArrayRef<llvm::Value *> Args) const {
3109 SmallVector<llvm::Value *, 4> TargetArgs;
3110 TargetArgs.reserve(Args.size());
3111 auto *FnType = OutlinedFn.getFunctionType();
3112 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3113 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3114 TargetArgs.append(std::next(Args.begin(), I), Args.end());
3115 break;
3116 }
3117 llvm::Type *TargetType = FnType->getParamType(I);
3118 llvm::Value *NativeArg = Args[I];
3119 if (!TargetType->isPointerTy()) {
3120 TargetArgs.emplace_back(NativeArg);
3121 continue;
3122 }
3123 llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3124 NativeArg, llvm::PointerType::getWithSamePointeeType(
3125 cast<llvm::PointerType>(NativeArg->getType()), /*AddrSpace*/ 0));
3126 TargetArgs.emplace_back(
3127 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
3128 }
3129 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
3130}
3131
3132/// Emit function which wraps the outline parallel region
3133/// and controls the arguments which are passed to this function.
3134/// The wrapper ensures that the outlined function is called
3135/// with the correct arguments when data is shared.
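/// Conceptually the emitted wrapper looks like this (a sketch, not the
/// literal IR; the trailing arguments are the values loaded from the
/// shared-variable list returned by the runtime):
///
///   void outlined_fn_wrapper(uint16_t parallel_level, uint32_t tid) {
///     uint32_t zero = 0;
///     void **shared_args;
///     __kmpc_get_shared_variables(&shared_args);
///     outlined_fn(&tid, &zero, shared_args[0] /*, ... */);
///   }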
3136llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3137 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
3138 ASTContext &Ctx = CGM.getContext();
3139 const auto &CS = *D.getCapturedStmt(OMPD_parallel);
3140
3141 // Create a function that takes as argument the source thread.
3142 FunctionArgList WrapperArgs;
3143 QualType Int16QTy =
3144 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
3145 QualType Int32QTy =
3146 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
3147 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3148 /*Id=*/nullptr, Int16QTy,
3149 ImplicitParamDecl::Other);
3150 ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3151 /*Id=*/nullptr, Int32QTy,
3152 ImplicitParamDecl::Other);
3153 WrapperArgs.emplace_back(&ParallelLevelArg);
3154 WrapperArgs.emplace_back(&WrapperArg);
3155
3156 const CGFunctionInfo &CGFI =
3157 CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
3158
3159 auto *Fn = llvm::Function::Create(
3160 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3161 Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
3162
3163 // Ensure we do not inline the function. This is trivially true for the ones
3164// passed to __kmpc_fork_call but the ones called in serialized regions
3165// could be inlined. This is not perfect, but it is closer to the invariant
3166 // we want, namely, every data environment starts with a new function.
3167 // TODO: We should pass the if condition to the runtime function and do the
3168 // handling there. Much cleaner code.
3169 Fn->addFnAttr(llvm::Attribute::NoInline);
3170
3171 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3172 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
3173 Fn->setDoesNotRecurse();
3174
3175 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
3176 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
3177 D.getBeginLoc(), D.getBeginLoc());
3178
3179 const auto *RD = CS.getCapturedRecordDecl();
3180 auto CurField = RD->field_begin();
3181
3182 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
3183 /*Name=*/".zero.addr");
3184 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
3185 // Get the array of arguments.
3186 SmallVector<llvm::Value *, 8> Args;
3187
3188 Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
3189 Args.emplace_back(ZeroAddr.getPointer());
3190
3191 CGBuilderTy &Bld = CGF.Builder;
3192 auto CI = CS.capture_begin();
3193
3194 // Use global memory for data sharing.
3195 // Handle passing of global args to workers.
3196 Address GlobalArgs =
3197 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
3198 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
3199 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3200 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3201 CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
3202 DataSharingArgs);
3203
3204 // Retrieve the shared variables from the list of references returned
3205 // by the runtime. Pass the variables to the outlined function.
3206 Address SharedArgListAddress = Address::invalid();
3207 if (CS.capture_size() > 0 ||
3208 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3209 SharedArgListAddress = CGF.EmitLoadOfPointer(
3210 GlobalArgs, CGF.getContext()
3211 .getPointerType(CGF.getContext().VoidPtrTy)
3212 .castAs<PointerType>());
3213 }
3214 unsigned Idx = 0;
3215 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3216 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3217 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3218 Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3219 llvm::Value *LB = CGF.EmitLoadOfScalar(
3220 TypedAddress,
3221 /*Volatile=*/false,
3222 CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
3223 cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
3224 Args.emplace_back(LB);
3225 ++Idx;
3226 Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3227 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3228 Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3229 llvm::Value *UB = CGF.EmitLoadOfScalar(
3230 TypedAddress,
3231 /*Volatile=*/false,
3232 CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
3233 cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
3234 Args.emplace_back(UB);
3235 ++Idx;
3236 }
3237 if (CS.capture_size() > 0) {
3238 ASTContext &CGFContext = CGF.getContext();
3239 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3240 QualType ElemTy = CurField->getType();
3241 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
3242 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3243 Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
3244 CGF.ConvertTypeForMem(ElemTy));
3245 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3246 /*Volatile=*/false,
3247 CGFContext.getPointerType(ElemTy),
3248 CI->getLocation());
3249 if (CI->capturesVariableByCopy() &&
3250 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
3251 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3252 CI->getLocation());
3253 }
3254 Args.emplace_back(Arg);
3255 }
3256 }
3257
3258 emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
3259 CGF.FinishFunction();
3260 return Fn;
3261}
3262
3263void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
3264 const Decl *D) {
3265 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3266 return;
3267
3268 assert(D && "Expected function or captured|block decl.")(static_cast <bool> (D && "Expected function or captured|block decl."
) ? void (0) : __assert_fail ("D && \"Expected function or captured|block decl.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3268, __extension__
__PRETTY_FUNCTION__))
;
3269 assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&(static_cast <bool> (FunctionGlobalizedDecls.count(CGF.
CurFn) == 0 && "Function is registered already.") ? void
(0) : __assert_fail ("FunctionGlobalizedDecls.count(CGF.CurFn) == 0 && \"Function is registered already.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3270, __extension__
__PRETTY_FUNCTION__))
3270 "Function is registered already.")(static_cast <bool> (FunctionGlobalizedDecls.count(CGF.
CurFn) == 0 && "Function is registered already.") ? void
(0) : __assert_fail ("FunctionGlobalizedDecls.count(CGF.CurFn) == 0 && \"Function is registered already.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3270, __extension__
__PRETTY_FUNCTION__))
;
3271 assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&(static_cast <bool> ((!TeamAndReductions.first || TeamAndReductions
.first == D) && "Team is set but not processed.") ? void
(0) : __assert_fail ("(!TeamAndReductions.first || TeamAndReductions.first == D) && \"Team is set but not processed.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3272, __extension__
__PRETTY_FUNCTION__))
3272 "Team is set but not processed.")(static_cast <bool> ((!TeamAndReductions.first || TeamAndReductions
.first == D) && "Team is set but not processed.") ? void
(0) : __assert_fail ("(!TeamAndReductions.first || TeamAndReductions.first == D) && \"Team is set but not processed.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3272, __extension__
__PRETTY_FUNCTION__))
;
3273 const Stmt *Body = nullptr;
3274 bool NeedToDelayGlobalization = false;
3275 if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
3276 Body = FD->getBody();
3277 } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
3278 Body = BD->getBody();
3279 } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
3280 Body = CD->getBody();
3281 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
3282 if (NeedToDelayGlobalization &&
3283 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
3284 return;
3285 }
3286 if (!Body)
3287 return;
3288 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
3289 VarChecker.Visit(Body);
3290 const RecordDecl *GlobalizedVarsRecord =
3291 VarChecker.getGlobalizedRecord(IsInTTDRegion);
3292 TeamAndReductions.first = nullptr;
3293 TeamAndReductions.second.clear();
3294 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
3295 VarChecker.getEscapedVariableLengthDecls();
3296 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
3297 return;
3298 auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
3299 I->getSecond().MappedParams =
3300 std::make_unique<CodeGenFunction::OMPMapVars>();
3301 I->getSecond().EscapedParameters.insert(
3302 VarChecker.getEscapedParameters().begin(),
3303 VarChecker.getEscapedParameters().end());
3304 I->getSecond().EscapedVariableLengthDecls.append(
3305 EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
3306 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3307 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3308 assert(VD->isCanonicalDecl() && "Expected canonical declaration")(static_cast <bool> (VD->isCanonicalDecl() &&
"Expected canonical declaration") ? void (0) : __assert_fail
("VD->isCanonicalDecl() && \"Expected canonical declaration\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3308, __extension__
__PRETTY_FUNCTION__))
;
3309 Data.insert(std::make_pair(VD, MappedVarData()));
3310 }
3311 if (!NeedToDelayGlobalization) {
3312 emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
3313 struct GlobalizationScope final : EHScopeStack::Cleanup {
3314 GlobalizationScope() = default;
3315
3316 void Emit(CodeGenFunction &CGF, Flags flags) override {
3317 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
3318 .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
3319 }
3320 };
3321 CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
3322 }
3323}
3324
3325Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
3326 const VarDecl *VD) {
3327 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
3328 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3329 auto AS = LangAS::Default;
3330 switch (A->getAllocatorType()) {
3331 // Use the default allocator here as by default local vars are
3332 // threadlocal.
3333 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3334 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3335 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3336 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3337 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3338 // Follow the user decision - use default allocation.
3339 return Address::invalid();
3340 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3341 // TODO: implement support for user-defined allocators.
3342 return Address::invalid();
3343 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3344 AS = LangAS::cuda_constant;
3345 break;
3346 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3347 AS = LangAS::cuda_shared;
3348 break;
3349 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3350 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3351 break;
3352 }
3353 llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
3354 auto *GV = new llvm::GlobalVariable(
3355 CGM.getModule(), VarTy, /*isConstant=*/false,
3356 llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy),
3357 VD->getName(),
3358 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
3359 CGM.getContext().getTargetAddressSpace(AS));
3360 CharUnits Align = CGM.getContext().getDeclAlign(VD);
3361 GV->setAlignment(Align.getAsAlign());
3362 return Address(
3363 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3364 GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
3365 VD->getType().getAddressSpace()))),
3366 VarTy, Align);
3367 }
3368
3369 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3370 return Address::invalid();
3371
3372 VD = VD->getCanonicalDecl();
3373 auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
3374 if (I == FunctionGlobalizedDecls.end())
3375 return Address::invalid();
3376 auto VDI = I->getSecond().LocalVarData.find(VD);
3377 if (VDI != I->getSecond().LocalVarData.end())
3378 return VDI->second.PrivateAddr;
3379 if (VD->hasAttrs()) {
3380 for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
3381 E(VD->attr_end());
3382 IT != E; ++IT) {
3383 auto VDI = I->getSecond().LocalVarData.find(
3384 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3385 ->getCanonicalDecl());
3386 if (VDI != I->getSecond().LocalVarData.end())
3387 return VDI->second.PrivateAddr;
3388 }
3389 }
3390
3391 return Address::invalid();
3392}
3393
3394void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
3395 FunctionGlobalizedDecls.erase(CGF.CurFn);
3396 CGOpenMPRuntime::functionFinished(CGF);
3397}
3398
3399void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
3400 CodeGenFunction &CGF, const OMPLoopDirective &S,
3401 OpenMPDistScheduleClauseKind &ScheduleKind,
3402 llvm::Value *&Chunk) const {
3403 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
3404 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
3405 ScheduleKind = OMPC_DIST_SCHEDULE_static;
3406 Chunk = CGF.EmitScalarConversion(
3407 RT.getGPUNumThreads(CGF),
3408 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3409 S.getIterationVariable()->getType(), S.getBeginLoc());
3410 return;
3411 }
3412 CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
3413 CGF, S, ScheduleKind, Chunk);
3414}
3415
3416void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
3417 CodeGenFunction &CGF, const OMPLoopDirective &S,
3418 OpenMPScheduleClauseKind &ScheduleKind,
3419 const Expr *&ChunkExpr) const {
3420 ScheduleKind = OMPC_SCHEDULE_static;
3421 // Chunk size is 1 in this case.
3422 llvm::APInt ChunkSize(32, 1);
3423 ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
3424 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3425 SourceLocation());
3426}
3427
3428void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
3429 CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
3430 assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&(static_cast <bool> (isOpenMPTargetExecutionDirective(D
.getDirectiveKind()) && " Expected target-based directive."
) ? void (0) : __assert_fail ("isOpenMPTargetExecutionDirective(D.getDirectiveKind()) && \" Expected target-based directive.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3431, __extension__
__PRETTY_FUNCTION__))
3431 " Expected target-based directive.")(static_cast <bool> (isOpenMPTargetExecutionDirective(D
.getDirectiveKind()) && " Expected target-based directive."
) ? void (0) : __assert_fail ("isOpenMPTargetExecutionDirective(D.getDirectiveKind()) && \" Expected target-based directive.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3431, __extension__
__PRETTY_FUNCTION__))
;
3432 const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
3433 for (const CapturedStmt::Capture &C : CS->captures()) {
3434 // Capture variables captured by reference in lambdas for target-based
3435 // directives.
3436 if (!C.capturesVariable())
3437 continue;
3438 const VarDecl *VD = C.getCapturedVar();
3439 const auto *RD = VD->getType()
3440 .getCanonicalType()
3441 .getNonReferenceType()
3442 ->getAsCXXRecordDecl();
3443 if (!RD || !RD->isLambda())
3444 continue;
3445 Address VDAddr = CGF.GetAddrOfLocalVar(VD);
3446 LValue VDLVal;
3447 if (VD->getType().getCanonicalType()->isReferenceType())
3448 VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
3449 else
3450 VDLVal = CGF.MakeAddrLValue(
3451 VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
3452 llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
3453 FieldDecl *ThisCapture = nullptr;
3454 RD->getCaptureFields(Captures, ThisCapture);
3455 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
3456 LValue ThisLVal =
3457 CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
3458 llvm::Value *CXXThis = CGF.LoadCXXThis();
3459 CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
3460 }
3461 for (const LambdaCapture &LC : RD->captures()) {
3462 if (LC.getCaptureKind() != LCK_ByRef)
3463 continue;
3464 const ValueDecl *VD = LC.getCapturedVar();
3465 // FIXME: For now VD is always a VarDecl because OpenMP does not support
3466 // capturing structured bindings in lambdas yet.
3467 if (!CS->capturesVariable(cast<VarDecl>(VD)))
3468 continue;
3469 auto It = Captures.find(VD);
3470 assert(It != Captures.end() && "Found lambda capture without field.")(static_cast <bool> (It != Captures.end() && "Found lambda capture without field."
) ? void (0) : __assert_fail ("It != Captures.end() && \"Found lambda capture without field.\""
, "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3470, __extension__
__PRETTY_FUNCTION__))
;
3471 LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
3472 Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
3473 if (VD->getType().getCanonicalType()->isReferenceType())
3474 VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
3475 VD->getType().getCanonicalType())
3476 .getAddress(CGF);
3477 CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
3478 }
3479 }
3480}
3481
3482bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
3483 LangAS &AS) {
3484 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
3485 return false;
3486 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3487 switch(A->getAllocatorType()) {
3488 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3489 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3490 // Not supported, fallback to the default mem space.
3491 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3492 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3493 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3494 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3495 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3496 AS = LangAS::Default;
3497 return true;
3498 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3499 AS = LangAS::cuda_constant;
3500 return true;
3501 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3502 AS = LangAS::cuda_shared;
3503 return true;
3504 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3505 llvm_unreachable("Expected predefined allocator for the variables with the "::llvm::llvm_unreachable_internal("Expected predefined allocator for the variables with the "
"static storage.", "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp"
, 3506)
3506 "static storage.")::llvm::llvm_unreachable_internal("Expected predefined allocator for the variables with the "
"static storage.", "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp"
, 3506)
;
3507 }
3508 return false;
3509}
3510
3511// Get the current CudaArch and ignore any unknown values.
3512static CudaArch getCudaArch(CodeGenModule &CGM) {
3513 if (!CGM.getTarget().hasFeature("ptx"))
3514 return CudaArch::UNKNOWN;
3515 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
3516 if (Feature.getValue()) {
3517 CudaArch Arch = StringToCudaArch(Feature.getKey());
3518 if (Arch != CudaArch::UNKNOWN)
3519 return Arch;
3520 }
3521 }
3522 return CudaArch::UNKNOWN;
3523}
3524
3525/// Check whether the target architecture supports unified addressing, which
3526/// is required by the OpenMP "requires" clause "unified_shared_memory".
3527void CGOpenMPRuntimeGPU::processRequiresDirective(
3528 const OMPRequiresDecl *D) {
3529 for (const OMPClause *Clause : D->clauselists()) {
3530 if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
3531 CudaArch Arch = getCudaArch(CGM);
3532 switch (Arch) {
3533 case CudaArch::SM_20:
3534 case CudaArch::SM_21:
3535 case CudaArch::SM_30:
3536 case CudaArch::SM_32:
3537 case CudaArch::SM_35:
3538 case CudaArch::SM_37:
3539 case CudaArch::SM_50:
3540 case CudaArch::SM_52:
3541 case CudaArch::SM_53: {
3542 SmallString<256> Buffer;
3543 llvm::raw_svector_ostream Out(Buffer);
3544 Out << "Target architecture " << CudaArchToString(Arch)
3545 << " does not support unified addressing";
3546 CGM.Error(Clause->getBeginLoc(), Out.str());
3547 return;
3548 }
3549 case CudaArch::SM_60:
3550 case CudaArch::SM_61:
3551 case CudaArch::SM_62:
3552 case CudaArch::SM_70:
3553 case CudaArch::SM_72:
3554 case CudaArch::SM_75:
3555 case CudaArch::SM_80:
3556 case CudaArch::SM_86:
3557 case CudaArch::SM_87:
3558 case CudaArch::SM_89:
3559 case CudaArch::SM_90:
3560 case CudaArch::GFX600:
3561 case CudaArch::GFX601:
3562 case CudaArch::GFX602:
3563 case CudaArch::GFX700:
3564 case CudaArch::GFX701:
3565 case CudaArch::GFX702:
3566 case CudaArch::GFX703:
3567 case CudaArch::GFX704:
3568 case CudaArch::GFX705:
3569 case CudaArch::GFX801:
3570 case CudaArch::GFX802:
3571 case CudaArch::GFX803:
3572 case CudaArch::GFX805:
3573 case CudaArch::GFX810:
3574 case CudaArch::GFX900:
3575 case CudaArch::GFX902:
3576 case CudaArch::GFX904:
3577 case CudaArch::GFX906:
3578 case CudaArch::GFX908:
3579 case CudaArch::GFX909:
3580 case CudaArch::GFX90a:
3581 case CudaArch::GFX90c:
3582 case CudaArch::GFX940:
3583 case CudaArch::GFX1010:
3584 case CudaArch::GFX1011:
3585 case CudaArch::GFX1012:
3586 case CudaArch::GFX1013:
3587 case CudaArch::GFX1030:
3588 case CudaArch::GFX1031:
3589 case CudaArch::GFX1032:
3590 case CudaArch::GFX1033:
3591 case CudaArch::GFX1034:
3592 case CudaArch::GFX1035:
3593 case CudaArch::GFX1036:
3594 case CudaArch::GFX1100:
3595 case CudaArch::GFX1101:
3596 case CudaArch::GFX1102:
3597 case CudaArch::GFX1103:
3598 case CudaArch::Generic:
3599 case CudaArch::UNUSED:
3600 case CudaArch::UNKNOWN:
3601 break;
3602 case CudaArch::LAST:
3603 llvm_unreachable("Unexpected Cuda arch.")::llvm::llvm_unreachable_internal("Unexpected Cuda arch.", "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp"
, 3603)
;
3604 }
3605 }
3606 }
3607 CGOpenMPRuntime::processRequiresDirective(D);
3608}
3609
3610void CGOpenMPRuntimeGPU::clear() {
3611
3612 if (!TeamsReductions.empty()) {
3613 ASTContext &C = CGM.getContext();
3614 RecordDecl *StaticRD = C.buildImplicitRecord(
3615 "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
3616 StaticRD->startDefinition();
3617 for (const RecordDecl *TeamReductionRec : TeamsReductions) {
3618 QualType RecTy = C.getRecordType(TeamReductionRec);
3619 auto *Field = FieldDecl::Create(
3620 C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
3621 C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
3622 /*BW=*/nullptr, /*Mutable=*/false,
3623 /*InitStyle=*/ICIS_NoInit);
3624 Field->setAccess(AS_public);
3625 StaticRD->addDecl(Field);
3626 }
3627 StaticRD->completeDefinition();
3628 QualType StaticTy = C.getRecordType(StaticRD);
3629 llvm::Type *LLVMReductionsBufferTy =
3630 CGM.getTypes().ConvertTypeForMem(StaticTy);
3631 // FIXME: nvlink does not handle weak linkage correctly (objects with
3632 // different sizes are reported as erroneous).
3633 // Restore CommonLinkage as soon as nvlink is fixed.
3634 auto *GV = new llvm::GlobalVariable(
3635 CGM.getModule(), LLVMReductionsBufferTy,
3636 /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
3637 llvm::Constant::getNullValue(LLVMReductionsBufferTy),
3638 "_openmp_teams_reductions_buffer_$_");
3639 KernelTeamsReductionPtr->setInitializer(
3640 llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
3641 CGM.VoidPtrTy));
3642 }
3643 CGOpenMPRuntime::clear();
3644}
3645
3646llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
3647 CGBuilderTy &Bld = CGF.Builder;
3648 llvm::Module *M = &CGF.CGM.getModule();
3649 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
3650 llvm::Function *F = M->getFunction(LocSize);
3651 if (!F) {
3652 F = llvm::Function::Create(
3653 llvm::FunctionType::get(CGF.Int32Ty, std::nullopt, false),
3654 llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
3655 }
3656 return Bld.CreateCall(F, std::nullopt, "nvptx_num_threads");
3657}
3658
3659llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
3660 ArrayRef<llvm::Value *> Args{};
3661 return CGF.EmitRuntimeCall(
3662 OMPBuilder.getOrCreateRuntimeFunction(
3663 CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
3664 Args);
3665}
3666
3667llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
3668 ArrayRef<llvm::Value *> Args{};
3669 return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3670 CGM.getModule(), OMPRTL___kmpc_get_warp_size),
3671 Args);
3672}