Bug Summary

File: /build/source/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
Warning: line 531, column 29
The result of the right shift is undefined due to shifting by '33', which is greater or equal to the width of type 'unsigned int'
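For context, here is a minimal standalone C++ sketch of why the analyzer flags this shift; it is illustrative only and not part of CGOpenMPRuntimeGPU.cpp. llvm::Log2_32 is defined as 31 - countLeadingZeros(Value), so if the grid's GV_Warp_Size were 0 the result would wrap to UINT_MAX when held in an unsigned, and the shift amount 32u - LaneIDBits would then wrap to 33, which is at or above the 32-bit width of unsigned int and therefore undefined behavior.

#include <climits>
#include <cstdio>

int main() {
  // Degenerate warp size assumed on the analyzer's path; the names mirror
  // the report, but this program is only a sketch of the arithmetic.
  unsigned WarpSize = 0;
  unsigned LaneIDBits = (WarpSize == 0)
                            ? UINT_MAX                       // 31 - countLeadingZeros(0) wraps to UINT_MAX
                            : 31u - __builtin_clz(WarpSize); // Log2_32 for a non-zero warp size
  unsigned ShiftAmt = 32u - LaneIDBits;                      // wraps to 33 in the degenerate case
  std::printf("shift amount = %u\n", ShiftAmt);              // prints 33
  // unsigned LaneIDMask = ~0u >> ShiftAmt;                  // undefined: shift count >= width of unsigned int
  return 0;
}

A possible hardening, again only a sketch, would be to assert that GV_Warp_Size is a power of two greater than 1 (or to clamp LaneIDBits) in getNVPTXLaneID and getNVPTXWarpID before forming the mask. The listing below shows the code exactly as the analyzer saw it.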

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name CGOpenMPRuntimeGPU.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -relaxed-aliasing -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm -resource-dir /usr/lib/llvm-17/lib/clang/17 -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I tools/clang/lib/CodeGen -I /build/source/clang/lib/CodeGen -I /build/source/clang/include -I tools/clang/include -I include -I /build/source/llvm/include -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm=build-llvm -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm=build-llvm -fcoverage-prefix-map=/build/source/= -source-date-epoch 1679915782 -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-03-27-130437-16335-1 -x c++ /build/source/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
1//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This provides a generalized class for OpenMP runtime code generation
10// specialized by GPU targets NVPTX and AMDGCN.
11//
12//===----------------------------------------------------------------------===//
13
14#include "CGOpenMPRuntimeGPU.h"
15#include "CodeGenFunction.h"
16#include "clang/AST/Attr.h"
17#include "clang/AST/DeclOpenMP.h"
18#include "clang/AST/OpenMPClause.h"
19#include "clang/AST/StmtOpenMP.h"
20#include "clang/AST/StmtVisitor.h"
21#include "clang/Basic/Cuda.h"
22#include "llvm/ADT/SmallPtrSet.h"
23#include "llvm/Frontend/OpenMP/OMPGridValues.h"
24#include "llvm/Support/MathExtras.h"
25
26using namespace clang;
27using namespace CodeGen;
28using namespace llvm::omp;
29
30namespace {
31/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
32class NVPTXActionTy final : public PrePostActionTy {
33 llvm::FunctionCallee EnterCallee = nullptr;
34 ArrayRef<llvm::Value *> EnterArgs;
35 llvm::FunctionCallee ExitCallee = nullptr;
36 ArrayRef<llvm::Value *> ExitArgs;
37 bool Conditional = false;
38 llvm::BasicBlock *ContBlock = nullptr;
39
40public:
41 NVPTXActionTy(llvm::FunctionCallee EnterCallee,
42 ArrayRef<llvm::Value *> EnterArgs,
43 llvm::FunctionCallee ExitCallee,
44 ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
45 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
46 ExitArgs(ExitArgs), Conditional(Conditional) {}
47 void Enter(CodeGenFunction &CGF) override {
48 llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
49 if (Conditional) {
50 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
51 auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
52 ContBlock = CGF.createBasicBlock("omp_if.end");
53 // Generate the branch (If-stmt)
54 CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
55 CGF.EmitBlock(ThenBlock);
56 }
57 }
58 void Done(CodeGenFunction &CGF) {
59 // Emit the rest of blocks/branches
60 CGF.EmitBranch(ContBlock);
61 CGF.EmitBlock(ContBlock, true);
62 }
63 void Exit(CodeGenFunction &CGF) override {
64 CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
65 }
66};
67
68/// A class to track the execution mode when codegening directives within
69/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
70/// to the target region and used by containing directives such as 'parallel'
71/// to emit optimized code.
72class ExecutionRuntimeModesRAII {
73private:
74 CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
75 CGOpenMPRuntimeGPU::EM_Unknown;
76 CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
77
78public:
79 ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
80 CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
81 : ExecMode(ExecMode) {
82 SavedExecMode = ExecMode;
83 ExecMode = EntryMode;
84 }
85 ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
86};
87
88/// GPU Configuration: This information can be derived from cuda registers,
89/// however, providing compile time constants helps generate more efficient
90/// code. For all practical purposes this is fine because the configuration
91/// is the same for all known NVPTX architectures.
92enum MachineConfiguration : unsigned {
93 /// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
94 /// specific Grid Values like GV_Warp_Size, GV_Slot_Size
95
96 /// Global memory alignment for performance.
97 GlobalMemoryAlignment = 128,
98};
99
100static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
101 RefExpr = RefExpr->IgnoreParens();
102 if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
103 const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
104 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
105 Base = TempASE->getBase()->IgnoreParenImpCasts();
106 RefExpr = Base;
107 } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
108 const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
109 while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
110 Base = TempOASE->getBase()->IgnoreParenImpCasts();
111 while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
112 Base = TempASE->getBase()->IgnoreParenImpCasts();
113 RefExpr = Base;
114 }
115 RefExpr = RefExpr->IgnoreParenImpCasts();
116 if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
117 return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
118 const auto *ME = cast<MemberExpr>(RefExpr);
119 return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
120}
121
122
123static RecordDecl *buildRecordForGlobalizedVars(
124 ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
125 ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
126 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
127 &MappedDeclsFields, int BufSize) {
128 using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
129 if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
130 return nullptr;
131 SmallVector<VarsDataTy, 4> GlobalizedVars;
132 for (const ValueDecl *D : EscapedDecls)
133 GlobalizedVars.emplace_back(
134 CharUnits::fromQuantity(std::max(
135 C.getDeclAlign(D).getQuantity(),
136 static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
137 D);
138 for (const ValueDecl *D : EscapedDeclsForTeams)
139 GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
140 llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
141 return L.first > R.first;
142 });
143
144 // Build struct _globalized_locals_ty {
145 // /* globalized vars */[WarpSize] align (max(decl_align,
146 // GlobalMemoryAlignment))
147 // /* globalized vars */ for EscapedDeclsForTeams
148 // };
149 RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
150 GlobalizedRD->startDefinition();
151 llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
152 EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
153 for (const auto &Pair : GlobalizedVars) {
154 const ValueDecl *VD = Pair.second;
155 QualType Type = VD->getType();
156 if (Type->isLValueReferenceType())
157 Type = C.getPointerType(Type.getNonReferenceType());
158 else
159 Type = Type.getNonReferenceType();
160 SourceLocation Loc = VD->getLocation();
161 FieldDecl *Field;
162 if (SingleEscaped.count(VD)) {
163 Field = FieldDecl::Create(
164 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
165 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
166 /*BW=*/nullptr, /*Mutable=*/false,
167 /*InitStyle=*/ICIS_NoInit);
168 Field->setAccess(AS_public);
169 if (VD->hasAttrs()) {
170 for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
171 E(VD->getAttrs().end());
172 I != E; ++I)
173 Field->addAttr(*I);
174 }
175 } else {
176 llvm::APInt ArraySize(32, BufSize);
177 Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
178 0);
179 Field = FieldDecl::Create(
180 C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
181 C.getTrivialTypeSourceInfo(Type, SourceLocation()),
182 /*BW=*/nullptr, /*Mutable=*/false,
183 /*InitStyle=*/ICIS_NoInit);
184 Field->setAccess(AS_public);
185 llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
186 static_cast<CharUnits::QuantityType>(
187 GlobalMemoryAlignment)));
188 Field->addAttr(AlignedAttr::CreateImplicit(
189 C, /*IsAlignmentExpr=*/true,
190 IntegerLiteral::Create(C, Align,
191 C.getIntTypeForBitwidth(32, /*Signed=*/0),
192 SourceLocation()),
193 {}, AttributeCommonInfo::AS_GNU, AlignedAttr::GNU_aligned));
194 }
195 GlobalizedRD->addDecl(Field);
196 MappedDeclsFields.try_emplace(VD, Field);
197 }
198 GlobalizedRD->completeDefinition();
199 return GlobalizedRD;
200}
201
202/// Get the list of variables that can escape their declaration context.
203class CheckVarsEscapingDeclContext final
204 : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
205 CodeGenFunction &CGF;
206 llvm::SetVector<const ValueDecl *> EscapedDecls;
207 llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
208 llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
209 RecordDecl *GlobalizedRD = nullptr;
210 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
211 bool AllEscaped = false;
212 bool IsForCombinedParallelRegion = false;
213
214 void markAsEscaped(const ValueDecl *VD) {
215 // Do not globalize declare target variables.
216 if (!isa<VarDecl>(VD) ||
217 OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
218 return;
219 VD = cast<ValueDecl>(VD->getCanonicalDecl());
220 // Use user-specified allocation.
221 if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
222 return;
223 // Variables captured by value must be globalized.
224 if (auto *CSI = CGF.CapturedStmtInfo) {
225 if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
226 // Check if need to capture the variable that was already captured by
227 // value in the outer region.
228 if (!IsForCombinedParallelRegion) {
229 if (!FD->hasAttrs())
230 return;
231 const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
232 if (!Attr)
233 return;
234 if (((Attr->getCaptureKind() != OMPC_map) &&
235 !isOpenMPPrivate(Attr->getCaptureKind())) ||
236 ((Attr->getCaptureKind() == OMPC_map) &&
237 !FD->getType()->isAnyPointerType()))
238 return;
239 }
240 if (!FD->getType()->isReferenceType()) {
242 assert(!VD->getType()->isVariablyModifiedType() &&
243 "Parameter captured by value with variably modified type");
243 EscapedParameters.insert(VD);
244 } else if (!IsForCombinedParallelRegion) {
245 return;
246 }
247 }
248 }
249 if ((!CGF.CapturedStmtInfo ||
250 (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
251 VD->getType()->isReferenceType())
252 // Do not globalize variables with reference type.
253 return;
254 if (VD->getType()->isVariablyModifiedType())
255 EscapedVariableLengthDecls.insert(VD);
256 else
257 EscapedDecls.insert(VD);
258 }
259
260 void VisitValueDecl(const ValueDecl *VD) {
261 if (VD->getType()->isLValueReferenceType())
262 markAsEscaped(VD);
263 if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
264 if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
265 const bool SavedAllEscaped = AllEscaped;
266 AllEscaped = VD->getType()->isLValueReferenceType();
267 Visit(VarD->getInit());
268 AllEscaped = SavedAllEscaped;
269 }
270 }
271 }
272 void VisitOpenMPCapturedStmt(const CapturedStmt *S,
273 ArrayRef<OMPClause *> Clauses,
274 bool IsCombinedParallelRegion) {
275 if (!S)
276 return;
277 for (const CapturedStmt::Capture &C : S->captures()) {
278 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
279 const ValueDecl *VD = C.getCapturedVar();
280 bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
281 if (IsCombinedParallelRegion) {
282 // Check if the variable is privatized in the combined construct and
283 // those private copies must be shared in the inner parallel
284 // directive.
285 IsForCombinedParallelRegion = false;
286 for (const OMPClause *C : Clauses) {
287 if (!isOpenMPPrivate(C->getClauseKind()) ||
288 C->getClauseKind() == OMPC_reduction ||
289 C->getClauseKind() == OMPC_linear ||
290 C->getClauseKind() == OMPC_private)
291 continue;
292 ArrayRef<const Expr *> Vars;
293 if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
294 Vars = PC->getVarRefs();
295 else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
296 Vars = PC->getVarRefs();
297 else
298 llvm_unreachable("Unexpected clause.");
299 for (const auto *E : Vars) {
300 const Decl *D =
301 cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
302 if (D == VD->getCanonicalDecl()) {
303 IsForCombinedParallelRegion = true;
304 break;
305 }
306 }
307 if (IsForCombinedParallelRegion)
308 break;
309 }
310 }
311 markAsEscaped(VD);
312 if (isa<OMPCapturedExprDecl>(VD))
313 VisitValueDecl(VD);
314 IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
315 }
316 }
317 }
318
319 void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
320 assert(!GlobalizedRD &&
321 "Record for globalized variables is built already.");
322 ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
323 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
324 if (IsInTTDRegion)
325 EscapedDeclsForTeams = EscapedDecls.getArrayRef();
326 else
327 EscapedDeclsForParallel = EscapedDecls.getArrayRef();
328 GlobalizedRD = ::buildRecordForGlobalizedVars(
329 CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
330 MappedDeclsFields, WarpSize);
331 }
332
333public:
334 CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
335 ArrayRef<const ValueDecl *> TeamsReductions)
336 : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
337 }
338 virtual ~CheckVarsEscapingDeclContext() = default;
339 void VisitDeclStmt(const DeclStmt *S) {
340 if (!S)
341 return;
342 for (const Decl *D : S->decls())
343 if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
344 VisitValueDecl(VD);
345 }
346 void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
347 if (!D)
348 return;
349 if (!D->hasAssociatedStmt())
350 return;
351 if (const auto *S =
352 dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
353 // Do not analyze directives that do not actually require capturing,
354 // like `omp for` or `omp simd` directives.
355 llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
356 getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
357 if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
358 VisitStmt(S->getCapturedStmt());
359 return;
360 }
361 VisitOpenMPCapturedStmt(
362 S, D->clauses(),
363 CaptureRegions.back() == OMPD_parallel &&
364 isOpenMPDistributeDirective(D->getDirectiveKind()));
365 }
366 }
367 void VisitCapturedStmt(const CapturedStmt *S) {
368 if (!S)
369 return;
370 for (const CapturedStmt::Capture &C : S->captures()) {
371 if (C.capturesVariable() && !C.capturesVariableByCopy()) {
372 const ValueDecl *VD = C.getCapturedVar();
373 markAsEscaped(VD);
374 if (isa<OMPCapturedExprDecl>(VD))
375 VisitValueDecl(VD);
376 }
377 }
378 }
379 void VisitLambdaExpr(const LambdaExpr *E) {
380 if (!E)
381 return;
382 for (const LambdaCapture &C : E->captures()) {
383 if (C.capturesVariable()) {
384 if (C.getCaptureKind() == LCK_ByRef) {
385 const ValueDecl *VD = C.getCapturedVar();
386 markAsEscaped(VD);
387 if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
388 VisitValueDecl(VD);
389 }
390 }
391 }
392 }
393 void VisitBlockExpr(const BlockExpr *E) {
394 if (!E)
395 return;
396 for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
397 if (C.isByRef()) {
398 const VarDecl *VD = C.getVariable();
399 markAsEscaped(VD);
400 if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
401 VisitValueDecl(VD);
402 }
403 }
404 }
405 void VisitCallExpr(const CallExpr *E) {
406 if (!E)
407 return;
408 for (const Expr *Arg : E->arguments()) {
409 if (!Arg)
410 continue;
411 if (Arg->isLValue()) {
412 const bool SavedAllEscaped = AllEscaped;
413 AllEscaped = true;
414 Visit(Arg);
415 AllEscaped = SavedAllEscaped;
416 } else {
417 Visit(Arg);
418 }
419 }
420 Visit(E->getCallee());
421 }
422 void VisitDeclRefExpr(const DeclRefExpr *E) {
423 if (!E)
424 return;
425 const ValueDecl *VD = E->getDecl();
426 if (AllEscaped)
427 markAsEscaped(VD);
428 if (isa<OMPCapturedExprDecl>(VD))
429 VisitValueDecl(VD);
430 else if (VD->isInitCapture())
431 VisitValueDecl(VD);
432 }
433 void VisitUnaryOperator(const UnaryOperator *E) {
434 if (!E)
435 return;
436 if (E->getOpcode() == UO_AddrOf) {
437 const bool SavedAllEscaped = AllEscaped;
438 AllEscaped = true;
439 Visit(E->getSubExpr());
440 AllEscaped = SavedAllEscaped;
441 } else {
442 Visit(E->getSubExpr());
443 }
444 }
445 void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
446 if (!E)
447 return;
448 if (E->getCastKind() == CK_ArrayToPointerDecay) {
449 const bool SavedAllEscaped = AllEscaped;
450 AllEscaped = true;
451 Visit(E->getSubExpr());
452 AllEscaped = SavedAllEscaped;
453 } else {
454 Visit(E->getSubExpr());
455 }
456 }
457 void VisitExpr(const Expr *E) {
458 if (!E)
459 return;
460 bool SavedAllEscaped = AllEscaped;
461 if (!E->isLValue())
462 AllEscaped = false;
463 for (const Stmt *Child : E->children())
464 if (Child)
465 Visit(Child);
466 AllEscaped = SavedAllEscaped;
467 }
468 void VisitStmt(const Stmt *S) {
469 if (!S)
470 return;
471 for (const Stmt *Child : S->children())
472 if (Child)
473 Visit(Child);
474 }
475
476 /// Returns the record that handles all the escaped local variables and used
477 /// instead of their original storage.
478 const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
479 if (!GlobalizedRD)
480 buildRecordForGlobalizedVars(IsInTTDRegion);
481 return GlobalizedRD;
482 }
483
484 /// Returns the field in the globalized record for the escaped variable.
485 const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
486 assert(GlobalizedRD &&
487 "Record for globalized variables must be generated already.");
488 auto I = MappedDeclsFields.find(VD);
489 if (I == MappedDeclsFields.end())
490 return nullptr;
491 return I->getSecond();
492 }
493
494 /// Returns the list of the escaped local variables/parameters.
495 ArrayRef<const ValueDecl *> getEscapedDecls() const {
496 return EscapedDecls.getArrayRef();
497 }
498
499 /// Checks if the escaped local variable is actually a parameter passed by
500 /// value.
501 const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
502 return EscapedParameters;
503 }
504
505 /// Returns the list of the escaped variables with the variably modified
506 /// types.
507 ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
508 return EscapedVariableLengthDecls.getArrayRef();
509 }
510};
511} // anonymous namespace
512
513/// Get the id of the warp in the block.
514/// We assume that the warp size is 32, which is always the case
515/// on the NVPTX device, to generate more efficient code.
516static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
517 CGBuilderTy &Bld = CGF.Builder;
518 unsigned LaneIDBits =
519 llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
520 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
521 return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
522}
523
524/// Get the id of the current lane in the Warp.
525/// We assume that the warp size is 32, which is always the case
526/// on the NVPTX device, to generate more efficient code.
527static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
528 CGBuilderTy &Bld = CGF.Builder;
529 unsigned LaneIDBits =
530 llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
531 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
Step 14: The result of the right shift is undefined due to shifting by '33', which is greater or equal to the width of type 'unsigned int'
532 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
533 return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
534 "nvptx_lane_id");
535}
536
537CGOpenMPRuntimeGPU::ExecutionMode
538CGOpenMPRuntimeGPU::getExecutionMode() const {
539 return CurrentExecutionMode;
540}
541
542static CGOpenMPRuntimeGPU::DataSharingMode
543getDataSharingMode(CodeGenModule &CGM) {
544 return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
545 : CGOpenMPRuntimeGPU::Generic;
546}
547
548/// Check for inner (nested) SPMD construct, if any
549static bool hasNestedSPMDDirective(ASTContext &Ctx,
550 const OMPExecutableDirective &D) {
551 const auto *CS = D.getInnermostCapturedStmt();
552 const auto *Body =
553 CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
554 const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
555
556 if (const auto *NestedDir =
557 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
558 OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
559 switch (D.getDirectiveKind()) {
560 case OMPD_target:
561 if (isOpenMPParallelDirective(DKind))
562 return true;
563 if (DKind == OMPD_teams) {
564 Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
565 /*IgnoreCaptured=*/true);
566 if (!Body)
567 return false;
568 ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
569 if (const auto *NND =
570 dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
571 DKind = NND->getDirectiveKind();
572 if (isOpenMPParallelDirective(DKind))
573 return true;
574 }
575 }
576 return false;
577 case OMPD_target_teams:
578 return isOpenMPParallelDirective(DKind);
579 case OMPD_target_simd:
580 case OMPD_target_parallel:
581 case OMPD_target_parallel_for:
582 case OMPD_target_parallel_for_simd:
583 case OMPD_target_teams_distribute:
584 case OMPD_target_teams_distribute_simd:
585 case OMPD_target_teams_distribute_parallel_for:
586 case OMPD_target_teams_distribute_parallel_for_simd:
587 case OMPD_parallel:
588 case OMPD_for:
589 case OMPD_parallel_for:
590 case OMPD_parallel_master:
591 case OMPD_parallel_sections:
592 case OMPD_for_simd:
593 case OMPD_parallel_for_simd:
594 case OMPD_cancel:
595 case OMPD_cancellation_point:
596 case OMPD_ordered:
597 case OMPD_threadprivate:
598 case OMPD_allocate:
599 case OMPD_task:
600 case OMPD_simd:
601 case OMPD_sections:
602 case OMPD_section:
603 case OMPD_single:
604 case OMPD_master:
605 case OMPD_critical:
606 case OMPD_taskyield:
607 case OMPD_barrier:
608 case OMPD_taskwait:
609 case OMPD_taskgroup:
610 case OMPD_atomic:
611 case OMPD_flush:
612 case OMPD_depobj:
613 case OMPD_scan:
614 case OMPD_teams:
615 case OMPD_target_data:
616 case OMPD_target_exit_data:
617 case OMPD_target_enter_data:
618 case OMPD_distribute:
619 case OMPD_distribute_simd:
620 case OMPD_distribute_parallel_for:
621 case OMPD_distribute_parallel_for_simd:
622 case OMPD_teams_distribute:
623 case OMPD_teams_distribute_simd:
624 case OMPD_teams_distribute_parallel_for:
625 case OMPD_teams_distribute_parallel_for_simd:
626 case OMPD_target_update:
627 case OMPD_declare_simd:
628 case OMPD_declare_variant:
629 case OMPD_begin_declare_variant:
630 case OMPD_end_declare_variant:
631 case OMPD_declare_target:
632 case OMPD_end_declare_target:
633 case OMPD_declare_reduction:
634 case OMPD_declare_mapper:
635 case OMPD_taskloop:
636 case OMPD_taskloop_simd:
637 case OMPD_master_taskloop:
638 case OMPD_master_taskloop_simd:
639 case OMPD_parallel_master_taskloop:
640 case OMPD_parallel_master_taskloop_simd:
641 case OMPD_requires:
642 case OMPD_unknown:
643 default:
644 llvm_unreachable("Unexpected directive.");
645 }
646 }
647
648 return false;
649}
650
651static bool supportsSPMDExecutionMode(ASTContext &Ctx,
652 const OMPExecutableDirective &D) {
653 OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
654 switch (DirectiveKind) {
655 case OMPD_target:
656 case OMPD_target_teams:
657 return hasNestedSPMDDirective(Ctx, D);
658 case OMPD_target_parallel:
659 case OMPD_target_parallel_for:
660 case OMPD_target_parallel_for_simd:
661 case OMPD_target_teams_distribute_parallel_for:
662 case OMPD_target_teams_distribute_parallel_for_simd:
663 case OMPD_target_simd:
664 case OMPD_target_teams_distribute_simd:
665 return true;
666 case OMPD_target_teams_distribute:
667 return false;
668 case OMPD_parallel:
669 case OMPD_for:
670 case OMPD_parallel_for:
671 case OMPD_parallel_master:
672 case OMPD_parallel_sections:
673 case OMPD_for_simd:
674 case OMPD_parallel_for_simd:
675 case OMPD_cancel:
676 case OMPD_cancellation_point:
677 case OMPD_ordered:
678 case OMPD_threadprivate:
679 case OMPD_allocate:
680 case OMPD_task:
681 case OMPD_simd:
682 case OMPD_sections:
683 case OMPD_section:
684 case OMPD_single:
685 case OMPD_master:
686 case OMPD_critical:
687 case OMPD_taskyield:
688 case OMPD_barrier:
689 case OMPD_taskwait:
690 case OMPD_taskgroup:
691 case OMPD_atomic:
692 case OMPD_flush:
693 case OMPD_depobj:
694 case OMPD_scan:
695 case OMPD_teams:
696 case OMPD_target_data:
697 case OMPD_target_exit_data:
698 case OMPD_target_enter_data:
699 case OMPD_distribute:
700 case OMPD_distribute_simd:
701 case OMPD_distribute_parallel_for:
702 case OMPD_distribute_parallel_for_simd:
703 case OMPD_teams_distribute:
704 case OMPD_teams_distribute_simd:
705 case OMPD_teams_distribute_parallel_for:
706 case OMPD_teams_distribute_parallel_for_simd:
707 case OMPD_target_update:
708 case OMPD_declare_simd:
709 case OMPD_declare_variant:
710 case OMPD_begin_declare_variant:
711 case OMPD_end_declare_variant:
712 case OMPD_declare_target:
713 case OMPD_end_declare_target:
714 case OMPD_declare_reduction:
715 case OMPD_declare_mapper:
716 case OMPD_taskloop:
717 case OMPD_taskloop_simd:
718 case OMPD_master_taskloop:
719 case OMPD_master_taskloop_simd:
720 case OMPD_parallel_master_taskloop:
721 case OMPD_parallel_master_taskloop_simd:
722 case OMPD_requires:
723 case OMPD_unknown:
724 default:
725 break;
726 }
727 llvm_unreachable(
728 "Unknown programming model for OpenMP directive on NVPTX target.");
729}
730
731void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
732 StringRef ParentName,
733 llvm::Function *&OutlinedFn,
734 llvm::Constant *&OutlinedFnID,
735 bool IsOffloadEntry,
736 const RegionCodeGenTy &CodeGen) {
737 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
738 EntryFunctionState EST;
739 WrapperFunctionsMap.clear();
740
741 // Emit target region as a standalone region.
742 class NVPTXPrePostActionTy : public PrePostActionTy {
743 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
744
745 public:
746 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
747 : EST(EST) {}
748 void Enter(CodeGenFunction &CGF) override {
749 auto &RT =
750 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
751 RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
752 // Skip target region initialization.
753 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
754 }
755 void Exit(CodeGenFunction &CGF) override {
756 auto &RT =
757 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
758 RT.clearLocThreadIdInsertPt(CGF);
759 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
760 }
761 } Action(EST);
762 CodeGen.setAction(Action);
763 IsInTTDRegion = true;
764 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
765 IsOffloadEntry, CodeGen);
766 IsInTTDRegion = false;
767}
768
769void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF,
770 EntryFunctionState &EST, bool IsSPMD) {
771 CGBuilderTy &Bld = CGF.Builder;
772 Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD));
773 if (!IsSPMD)
774 emitGenericVarsProlog(CGF, EST.Loc);
775}
776
777void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
778 EntryFunctionState &EST,
779 bool IsSPMD) {
780 if (!IsSPMD)
781 emitGenericVarsEpilog(CGF);
782
783 CGBuilderTy &Bld = CGF.Builder;
784 OMPBuilder.createTargetDeinit(Bld, IsSPMD);
785}
786
787void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
788 StringRef ParentName,
789 llvm::Function *&OutlinedFn,
790 llvm::Constant *&OutlinedFnID,
791 bool IsOffloadEntry,
792 const RegionCodeGenTy &CodeGen) {
793 ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
794 EntryFunctionState EST;
795
796 // Emit target region as a standalone region.
797 class NVPTXPrePostActionTy : public PrePostActionTy {
798 CGOpenMPRuntimeGPU &RT;
799 CGOpenMPRuntimeGPU::EntryFunctionState &EST;
800
801 public:
802 NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
803 CGOpenMPRuntimeGPU::EntryFunctionState &EST)
804 : RT(RT), EST(EST) {}
805 void Enter(CodeGenFunction &CGF) override {
806 RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
807 // Skip target region initialization.
808 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
809 }
810 void Exit(CodeGenFunction &CGF) override {
811 RT.clearLocThreadIdInsertPt(CGF);
812 RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
813 }
814 } Action(*this, EST);
815 CodeGen.setAction(Action);
816 IsInTTDRegion = true;
817 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
818 IsOffloadEntry, CodeGen);
819 IsInTTDRegion = false;
820}
821
822// Create a unique global variable to indicate the execution mode of this target
823// region. The execution mode is either 'generic', or 'spmd' depending on the
824// target directive. This variable is picked up by the offload library to setup
825// the device appropriately before kernel launch. If the execution mode is
826// 'generic', the runtime reserves one warp for the master, otherwise, all
827// warps participate in parallel work.
828static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
829 bool Mode) {
830 auto *GVMode = new llvm::GlobalVariable(
831 CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
832 llvm::GlobalValue::WeakAnyLinkage,
833 llvm::ConstantInt::get(CGM.Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD
834 : OMP_TGT_EXEC_MODE_GENERIC),
835 Twine(Name, "_exec_mode"));
836 GVMode->setVisibility(llvm::GlobalVariable::ProtectedVisibility);
837 CGM.addCompilerUsedGlobal(GVMode);
838}
839
840void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
841 const OMPExecutableDirective &D, StringRef ParentName,
842 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
843 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
844 if (!IsOffloadEntry) // Nothing to do.
845 return;
846
847 assert(!ParentName.empty() && "Invalid target region parent name!");
848
849 bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
850 if (Mode)
851 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
852 CodeGen);
853 else
854 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
855 CodeGen);
856
857 setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
858}
859
860CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
861 : CGOpenMPRuntime(CGM) {
862 llvm::OpenMPIRBuilderConfig Config(CGM.getLangOpts().OpenMPIsDevice, true,
863 hasRequiresUnifiedSharedMemory(),
864 CGM.getLangOpts().OpenMPOffloadMandatory);
865 OMPBuilder.setConfig(Config);
866
867 if (!CGM.getLangOpts().OpenMPIsDevice)
868 llvm_unreachable("OpenMP can only handle device code.");
869
870 llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
871 if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
872 return;
873
874 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
875 "__omp_rtl_debug_kind");
876 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
877 "__omp_rtl_assume_teams_oversubscription");
878 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
879 "__omp_rtl_assume_threads_oversubscription");
880 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
881 "__omp_rtl_assume_no_thread_state");
882 OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
883 "__omp_rtl_assume_no_nested_parallelism");
884}
885
886void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
887 ProcBindKind ProcBind,
888 SourceLocation Loc) {
889 // Do nothing in case of SPMD mode and L0 parallel.
890 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
891 return;
892
893 CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
894}
895
896void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
897 llvm::Value *NumThreads,
898 SourceLocation Loc) {
899 // Nothing to do.
900}
901
902void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
903 const Expr *NumTeams,
904 const Expr *ThreadLimit,
905 SourceLocation Loc) {}
906
907llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
908 const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
909 OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
910 // Emit target region as a standalone region.
911 bool PrevIsInTTDRegion = IsInTTDRegion;
912 IsInTTDRegion = false;
913 auto *OutlinedFun =
914 cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
915 D, ThreadIDVar, InnermostKind, CodeGen));
916 IsInTTDRegion = PrevIsInTTDRegion;
917 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
918 llvm::Function *WrapperFun =
919 createParallelDataSharingWrapper(OutlinedFun, D);
920 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
921 }
922
923 return OutlinedFun;
924}
925
926/// Get list of lastprivate variables from the teams distribute ... or
927/// teams {distribute ...} directives.
928static void
929getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
930 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
931 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
932 "expected teams directive.");
933 const OMPExecutableDirective *Dir = &D;
934 if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
935 if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
936 Ctx,
937 D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
938 /*IgnoreCaptured=*/true))) {
939 Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
940 if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
941 Dir = nullptr;
942 }
943 }
944 if (!Dir)
945 return;
946 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
947 for (const Expr *E : C->getVarRefs())
948 Vars.push_back(getPrivateItem(E));
949 }
950}
951
952/// Get list of reduction variables from the teams ... directives.
953static void
954getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
955 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
956 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
957 "expected teams directive.");
958 for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
959 for (const Expr *E : C->privates())
960 Vars.push_back(getPrivateItem(E));
961 }
962}
963
964llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
965 const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
966 OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
967 SourceLocation Loc = D.getBeginLoc();
968
969 const RecordDecl *GlobalizedRD = nullptr;
970 llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
971 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
972 unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
973 // Globalize team reductions variable unconditionally in all modes.
974 if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
975 getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
976 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
977 getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
978 if (!LastPrivatesReductions.empty()) {
979 GlobalizedRD = ::buildRecordForGlobalizedVars(
980 CGM.getContext(), std::nullopt, LastPrivatesReductions,
981 MappedDeclsFields, WarpSize);
982 }
983 } else if (!LastPrivatesReductions.empty()) {
984 assert(!TeamAndReductions.first &&
985 "Previous team declaration is not expected.");
986 TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
987 std::swap(TeamAndReductions.second, LastPrivatesReductions);
988 }
989
990 // Emit target region as a standalone region.
991 class NVPTXPrePostActionTy : public PrePostActionTy {
992 SourceLocation &Loc;
993 const RecordDecl *GlobalizedRD;
994 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
995 &MappedDeclsFields;
996
997 public:
998 NVPTXPrePostActionTy(
999 SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1000 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1001 &MappedDeclsFields)
1002 : Loc(Loc), GlobalizedRD(GlobalizedRD),
1003 MappedDeclsFields(MappedDeclsFields) {}
1004 void Enter(CodeGenFunction &CGF) override {
1005 auto &Rt =
1006 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1007 if (GlobalizedRD) {
1008 auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1009 I->getSecond().MappedParams =
1010 std::make_unique<CodeGenFunction::OMPMapVars>();
1011 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1012 for (const auto &Pair : MappedDeclsFields) {
1013 assert(Pair.getFirst()->isCanonicalDecl() &&
1014 "Expected canonical declaration");
1015 Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
1016 }
1017 }
1018 Rt.emitGenericVarsProlog(CGF, Loc);
1019 }
1020 void Exit(CodeGenFunction &CGF) override {
1021 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1022 .emitGenericVarsEpilog(CGF);
1023 }
1024 } Action(Loc, GlobalizedRD, MappedDeclsFields);
1025 CodeGen.setAction(Action);
1026 llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1027 D, ThreadIDVar, InnermostKind, CodeGen);
1028
1029 return OutlinedFun;
1030}
1031
1032void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1033 SourceLocation Loc,
1034 bool WithSPMDCheck) {
1035 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
1036 getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1037 return;
1038
1039 CGBuilderTy &Bld = CGF.Builder;
1040
1041 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1042 if (I == FunctionGlobalizedDecls.end())
1043 return;
1044
1045 for (auto &Rec : I->getSecond().LocalVarData) {
1046 const auto *VD = cast<VarDecl>(Rec.first);
1047 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1048 QualType VarTy = VD->getType();
1049
1050 // Get the local allocation of a firstprivate variable before sharing
1051 llvm::Value *ParValue;
1052 if (EscapedParam) {
1053 LValue ParLVal =
1054 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1055 ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1056 }
1057
1058 // Allocate space for the variable to be globalized
1059 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1060 llvm::CallBase *VoidPtr =
1061 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1062 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1063 AllocArgs, VD->getName());
1064 // FIXME: We should use the variables actual alignment as an argument.
1065 VoidPtr->addRetAttr(llvm::Attribute::get(
1066 CGM.getLLVMContext(), llvm::Attribute::Alignment,
1067 CGM.getContext().getTargetInfo().getNewAlign() / 8));
1068
1069 // Cast the void pointer and get the address of the globalized variable.
1070 llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
1071 llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1072 VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
1073 LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
1074 Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1075 Rec.second.GlobalizedVal = VoidPtr;
1076
1077 // Assign the local allocation to the newly globalized location.
1078 if (EscapedParam) {
1079 CGF.EmitStoreOfScalar(ParValue, VarAddr);
1080 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
1081 }
1082 if (auto *DI = CGF.getDebugInfo())
1083 VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
1084 }
1085 for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
1086 // Use actual memory size of the VLA object including the padding
1087 // for alignment purposes.
1088 llvm::Value *Size = CGF.getTypeSize(VD->getType());
1089 CharUnits Align = CGM.getContext().getDeclAlign(VD);
1090 Size = Bld.CreateNUWAdd(
1091 Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1092 llvm::Value *AlignVal =
1093 llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1094
1095 Size = Bld.CreateUDiv(Size, AlignVal);
1096 Size = Bld.CreateNUWMul(Size, AlignVal);
1097
1098 // Allocate space for this VLA object to be globalized.
1099 llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1100 llvm::CallBase *VoidPtr =
1101 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1102 CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1103 AllocArgs, VD->getName());
1104 VoidPtr->addRetAttr(
1105 llvm::Attribute::get(CGM.getLLVMContext(), llvm::Attribute::Alignment,
1106 CGM.getContext().getTargetInfo().getNewAlign()));
1107
1108 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(
1109 std::pair<llvm::Value *, llvm::Value *>(
1110 {VoidPtr, CGF.getTypeSize(VD->getType())}));
1111 LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(),
1112 CGM.getContext().getDeclAlign(VD),
1113 AlignmentSource::Decl);
1114 I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
1115 Base.getAddress(CGF));
1116 }
1117 I->getSecond().MappedParams->apply(CGF);
1118}
1119
1120void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
1121 bool WithSPMDCheck) {
1122 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
1123 getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1124 return;
1125
1126 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1127 if (I != FunctionGlobalizedDecls.end()) {
1128 // Deallocate the memory for each globalized VLA object
1129 for (auto AddrSizePair :
1130 llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1131 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1132 CGM.getModule(), OMPRTL___kmpc_free_shared),
1133 {AddrSizePair.first, AddrSizePair.second});
1134 }
1135 // Deallocate the memory for each globalized value
1136 for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
1137 const auto *VD = cast<VarDecl>(Rec.first);
1138 I->getSecond().MappedParams->restore(CGF);
1139
1140 llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1141 CGF.getTypeSize(VD->getType())};
1142 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1143 CGM.getModule(), OMPRTL___kmpc_free_shared),
1144 FreeArgs);
1145 }
1146 }
1147}
1148
1149void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1150 const OMPExecutableDirective &D,
1151 SourceLocation Loc,
1152 llvm::Function *OutlinedFn,
1153 ArrayRef<llvm::Value *> CapturedVars) {
1154 if (!CGF.HaveInsertPoint())
1155 return;
1156
1157 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1158 /*Name=*/".zero.addr");
1159 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1160 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1161 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
1162 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1163 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1164 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1165}
1166
1167void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1168 SourceLocation Loc,
1169 llvm::Function *OutlinedFn,
1170 ArrayRef<llvm::Value *> CapturedVars,
1171 const Expr *IfCond,
1172 llvm::Value *NumThreads) {
1173 if (!CGF.HaveInsertPoint())
1174 return;
1175
1176 auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1177 NumThreads](CodeGenFunction &CGF,
1178 PrePostActionTy &Action) {
1179 CGBuilderTy &Bld = CGF.Builder;
1180 llvm::Value *NumThreadsVal = NumThreads;
1181 llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1182 llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
1183 if (WFn)
1184 ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1185 llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
1186
1187 // Create a private scope that will globalize the arguments
1188 // passed from the outside of the target region.
1189 // TODO: Is that needed?
1190 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1191
1192 Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1193 llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
1194 "captured_vars_addrs");
1195 // There's something to share.
1196 if (!CapturedVars.empty()) {
1197 // Prepare for parallel region. Indicate the outlined function.
1198 ASTContext &Ctx = CGF.getContext();
1199 unsigned Idx = 0;
1200 for (llvm::Value *V : CapturedVars) {
1201 Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
1202 llvm::Value *PtrV;
1203 if (V->getType()->isIntegerTy())
1204 PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1205 else
1206 PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
1207 CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1208 Ctx.getPointerType(Ctx.VoidPtrTy));
1209 ++Idx;
1210 }
1211 }
1212
1213 llvm::Value *IfCondVal = nullptr;
1214 if (IfCond)
1215 IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
1216 /* isSigned */ false);
1217 else
1218 IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1219
1220 if (!NumThreadsVal)
1221 NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
1222 else
1223 NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty),
1224
1225 assert(IfCondVal && "Expected a value");
1226 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1227 llvm::Value *Args[] = {
1228 RTLoc,
1229 getThreadID(CGF, Loc),
1230 IfCondVal,
1231 NumThreadsVal,
1232 llvm::ConstantInt::get(CGF.Int32Ty, -1),
1233 FnPtr,
1234 ID,
1235 Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
1236 CGF.VoidPtrPtrTy),
1237 llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1238 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1239 CGM.getModule(), OMPRTL___kmpc_parallel_51),
1240 Args);
1241 };
1242
1243 RegionCodeGenTy RCG(ParallelGen);
1244 RCG(CGF);
1245}
1246
1247void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1248 // Always emit simple barriers!
1249 if (!CGF.HaveInsertPoint())
1250 return;
1251 // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1252 // This function does not use parameters, so we can emit just default values.
1253 llvm::Value *Args[] = {
1254 llvm::ConstantPointerNull::get(
1255 cast<llvm::PointerType>(getIdentTyPointerTy())),
1256 llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
1257 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1258 CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
1259 Args);
1260}
1261
1262void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1263 SourceLocation Loc,
1264 OpenMPDirectiveKind Kind, bool,
1265 bool) {
1266 // Always emit simple barriers!
1267 if (!CGF.HaveInsertPoint())
1268 return;
1269 // Build call __kmpc_cancel_barrier(loc, thread_id);
1270 unsigned Flags = getDefaultFlagsForBarriers(Kind);
1271 llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1272 getThreadID(CGF, Loc)};
1273
1274 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1275 CGM.getModule(), OMPRTL___kmpc_barrier),
1276 Args);
1277}
1278
1279void CGOpenMPRuntimeGPU::emitCriticalRegion(
1280 CodeGenFunction &CGF, StringRef CriticalName,
1281 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1282 const Expr *Hint) {
1283 llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
1284 llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
1285 llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
1286 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
1287 llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
1288
1289 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1290
1291 // Get the mask of active threads in the warp.
1292 llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1293 CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
1294 // Fetch team-local id of the thread.
1295 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1296
1297 // Get the width of the team.
1298 llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1299
1300 // Initialize the counter variable for the loop.
1301 QualType Int32Ty =
1302 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1303 Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
1304 LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
1305 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
1306 /*isInit=*/true);
1307
1308 // Block checks if loop counter exceeds upper bound.
1309 CGF.EmitBlock(LoopBB);
1310 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1311 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
1312 CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
1313
1314 // Block tests which single thread should execute region, and which threads
1315 // should go straight to synchronisation point.
1316 CGF.EmitBlock(TestBB);
1317 CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1318 llvm::Value *CmpThreadToCounter =
1319 CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
1320 CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
1321
1322 // Block emits the body of the critical region.
1323 CGF.EmitBlock(BodyBB);
1324
1325 // Output the critical statement.
1326 CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1327 Hint);
1328
1329 // After the body surrounded by the critical region, the single executing
1330 // thread will jump to the synchronisation point.
1331 // Block waits for all threads in current team to finish then increments the
1332 // counter variable and returns to the loop.
1333 CGF.EmitBlock(SyncBB);
1334 // Reconverge active threads in the warp.
1335 (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1336 CGM.getModule(), OMPRTL___kmpc_syncwarp),
1337 Mask);
1338
1339 llvm::Value *IncCounterVal =
1340 CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
1341 CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
1342 CGF.EmitBranch(LoopBB);
1343
1344 // Block that is reached when all threads in the team complete the region.
1345 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
1346}
1347
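To make the emitted control flow easier to follow: the blocks above serialize the critical region over the team. Every thread loops over counter values 0..TeamWidth-1, only the thread whose team-local id equals the counter runs the body, and the warp reconverges before the counter is bumped. The host-side C++ sketch below models that serialization; the function and parameter names are invented for illustration and are not part of Clang or the OpenMP device runtime.

#include <cstdio>
#include <functional>

// Host-side model of the omp.critical.{loop,test,body,sync} blocks emitted
// by emitCriticalRegion: one thread at a time, in team-local id order, runs
// the protected body. __kmpc_syncwarp is represented only by a comment.
static void simulateGpuCritical(int TeamWidth,
                                const std::function<void(int)> &Body) {
  for (int Counter = 0; Counter < TeamWidth; ++Counter) { // omp.critical.loop
    int ThreadID = Counter; // omp.critical.test: ThreadID == Counter holds
    Body(ThreadID);         // omp.critical.body
    // omp.critical.sync: __kmpc_syncwarp(Mask) would reconverge the warp,
    // then every thread increments its private counter and loops again.
  }
}

int main() {
  simulateGpuCritical(4, [](int Tid) {
    std::printf("thread %d executed the critical region\n", Tid);
  });
  return 0;
}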
1348/// Cast value to the specified type.
1349static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1350 QualType ValTy, QualType CastTy,
1351 SourceLocation Loc) {
1352 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1353        "Cast type must sized.");
1354 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1355        "Val type must sized.");
1356 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
1357 if (ValTy == CastTy)
1358 return Val;
1359 if (CGF.getContext().getTypeSizeInChars(ValTy) ==
1360 CGF.getContext().getTypeSizeInChars(CastTy))
1361 return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
1362 if (CastTy->isIntegerType() && ValTy->isIntegerType())
1363 return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
1364 CastTy->hasSignedIntegerRepresentation());
1365 Address CastItem = CGF.CreateMemTemp(CastTy);
1366 Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1367 CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()),
1368 Val->getType());
1369 CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
1370 LValueBaseInfo(AlignmentSource::Type),
1371 TBAAAccessInfo());
1372 return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
1373 LValueBaseInfo(AlignmentSource::Type),
1374 TBAAAccessInfo());
1375}
1376
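In summary, the dispatch above has four outcomes: identical types pass through untouched, equal-size types are bit-cast, integer-to-integer conversions use a plain integer cast, and anything else round-trips through a stack temporary. A small host-side C++ illustration of the bit-cast and through-memory paths follows; memcpy stands in for the emitted store/load pair and all names are invented for the sketch.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Equal-size path: reinterpret the bits, mirroring Builder.CreateBitCast.
static uint32_t bitsOfFloat(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return Bits;
}

// Through-memory path: store the narrow value into a temporary sized for the
// destination type, then read the whole temporary back, mirroring the
// CreateMemTemp + EmitStoreOfScalar + EmitLoadOfScalar sequence above.
static uint64_t roundTripThroughTemp(uint32_t V) {
  uint64_t Temp = 0;                 // the CastTy-sized temporary
  std::memcpy(&Temp, &V, sizeof(V)); // store the ValTy value into it
  return Temp;                       // load it back as CastTy
}

int main() {
  std::printf("bits of 1.0f = 0x%08x\n", bitsOfFloat(1.0f));
  std::printf("7 widened through memory = %llu\n",
              (unsigned long long)roundTripThroughTemp(7u));
  return 0;
}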
1377/// This function creates calls to one of two shuffle functions to copy
1378/// variables between lanes in a warp.
1379static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
1380 llvm::Value *Elem,
1381 QualType ElemType,
1382 llvm::Value *Offset,
1383 SourceLocation Loc) {
1384 CodeGenModule &CGM = CGF.CGM;
1385 CGBuilderTy &Bld = CGF.Builder;
1386 CGOpenMPRuntimeGPU &RT =
1387 *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
1388 llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
1389
1390 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1391 assert(Size.getQuantity() <= 8 &&
1392        "Unsupported bitwidth in shuffle instruction.");
1393
1394 RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
1395 ? OMPRTL___kmpc_shuffle_int32
1396 : OMPRTL___kmpc_shuffle_int64;
1397
1398 // Cast all types to 32- or 64-bit values before calling shuffle routines.
1399 QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
1400 Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
1401 llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
1402 llvm::Value *WarpSize =
1403 Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
1404
1405 llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
1406 OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
1407 {ElemCast, Offset, WarpSize});
1408
1409 return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
1410}
1411
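The decision the helper makes is simple: elements of at most four bytes are routed through the 32-bit shuffle entry point, larger elements (up to eight bytes, per the assertion) through the 64-bit one, after the value has been widened to the matching integer type. A tiny C++ restatement of that selection rule, with a made-up helper name, is shown below.

#include <cstddef>
#include <cstdio>

// Restatement of the ShuffleFn selection in createRuntimeShuffleFunction.
// Only the <= 4 / <= 8 rule comes from the code above; the helper name is
// invented for this sketch.
static const char *pickShuffleRuntimeCall(size_t ElemSizeInBytes) {
  return ElemSizeInBytes <= 4 ? "__kmpc_shuffle_int32"
                              : "__kmpc_shuffle_int64";
}

int main() {
  std::printf("float  (%zu bytes) -> %s\n", sizeof(float),
              pickShuffleRuntimeCall(sizeof(float)));
  std::printf("double (%zu bytes) -> %s\n", sizeof(double),
              pickShuffleRuntimeCall(sizeof(double)));
  return 0;
}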
1412static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
1413 Address DestAddr, QualType ElemType,
1414 llvm::Value *Offset, SourceLocation Loc) {
1415 CGBuilderTy &Bld = CGF.Builder;
1416
1417 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1418 // Create the loop over the big sized data.
1419 // ptr = (void*)Elem;
1420 // ptrEnd = (void*) Elem + 1;
1421 // Step = 8;
1422 // while (ptr + Step < ptrEnd)
1423 // shuffle((int64_t)*ptr);
1424 // Step = 4;
1425 // while (ptr + Step < ptrEnd)
1426 // shuffle((int32_t)*ptr);
1427 // ...
1428 Address ElemPtr = DestAddr;
1429 Address Ptr = SrcAddr;
1430 Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
1431 Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty);
1432 for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
1433 if (Size < CharUnits::fromQuantity(IntSize))
1434 continue;
1435 QualType IntType = CGF.getContext().getIntTypeForBitwidth(
1436 CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
1437 /*Signed=*/1);
1438 llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
1439 Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(),
1440 IntTy);
1441 ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1442 ElemPtr, IntTy->getPointerTo(), IntTy);
1443 if (Size.getQuantity() / IntSize > 1) {
1444 llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
1445 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
1446 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
1447 llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
1448 CGF.EmitBlock(PreCondBB);
1449 llvm::PHINode *PhiSrc =
1450 Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
1451 PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
1452 llvm::PHINode *PhiDest =
1453 Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
1454 PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
1455 Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
1456 ElemPtr =
1457 Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
1458 llvm::Value *PtrDiff = Bld.CreatePtrDiff(
1459 CGF.Int8Ty, PtrEnd.getPointer(),
1460 Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr.getPointer(),
1461 CGF.VoidPtrTy));
1462 Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
1463 ThenBB, ExitBB);
1464 CGF.EmitBlock(ThenBB);
1465 llvm::Value *Res = createRuntimeShuffleFunction(
1466 CGF,
1467 CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1468 LValueBaseInfo(AlignmentSource::Type),
1469 TBAAAccessInfo()),
1470 IntType, Offset, Loc);
1471 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1472 LValueBaseInfo(AlignmentSource::Type),
1473 TBAAAccessInfo());
1474 Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
1475 Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1476 PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
1477 PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
1478 CGF.EmitBranch(PreCondBB);
1479 CGF.EmitBlock(ExitBB);
1480 } else {
1481 llvm::Value *Res = createRuntimeShuffleFunction(
1482 CGF,
1483 CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1484 LValueBaseInfo(AlignmentSource::Type),
1485 TBAAAccessInfo()),
1486 IntType, Offset, Loc);
1487 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1488 LValueBaseInfo(AlignmentSource::Type),
1489 TBAAAccessInfo());
1490 Ptr = Bld.CreateConstGEP(Ptr, 1);
1491 ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1492 }
1493 Size = Size % IntSize;
1494 }
1495}
1496
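The loop above implements the pseudocode in its comment: the element is shuffled in the largest integer chunks that still fit, starting at 8 bytes and halving down to 1, with `Size % IntSize` carrying the remainder to the next pass. The following host-side C++ model reproduces just that chunking; a memcpy stands in for each per-chunk shuffle and the names are invented.

#include <cstdio>
#include <cstring>

// Model of the 8/4/2/1-byte decomposition in shuffleAndStore. Each memcpy
// below corresponds to one emitted shuffle of IntSize bytes.
static void copyInShuffleChunks(const unsigned char *Src, unsigned char *Dst,
                                size_t Size) {
  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
    while (Size >= static_cast<size_t>(IntSize)) {
      std::memcpy(Dst, Src, static_cast<size_t>(IntSize));
      Src += IntSize;
      Dst += IntSize;
      Size -= static_cast<size_t>(IntSize);
    }
  }
}

int main() {
  unsigned char In[13], Out[13] = {0};
  for (int I = 0; I < 13; ++I)
    In[I] = static_cast<unsigned char>(I + 1);
  copyInShuffleChunks(In, Out, sizeof(In)); // one 8-, one 4- and one 1-byte chunk
  std::printf("last byte copied: %d\n", Out[12]);
  return 0;
}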
1497namespace {
1498enum CopyAction : unsigned {
1499 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1500 // the warp using shuffle instructions.
1501 RemoteLaneToThread,
1502 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1503 ThreadCopy,
1504 // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
1505 ThreadToScratchpad,
1506 // ScratchpadToThread: Copy from a scratchpad array in global memory
1507 // containing team-reduced data to a thread's stack.
1508 ScratchpadToThread,
1509};
1510} // namespace
1511
1512struct CopyOptionsTy {
1513 llvm::Value *RemoteLaneOffset;
1514 llvm::Value *ScratchpadIndex;
1515 llvm::Value *ScratchpadWidth;
1516};
1517
1518/// Emit instructions to copy a Reduce list, which contains partially
1519/// aggregated values, in the specified direction.
1520static void emitReductionListCopy(
1521 CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
1522 ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
1523 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
1524
1525 CodeGenModule &CGM = CGF.CGM;
1526 ASTContext &C = CGM.getContext();
1527 CGBuilderTy &Bld = CGF.Builder;
1528
1529 llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1530 llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
1531 llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
1532
1533 // Iterate, element by element, through the source Reduce list and
1534 // make a copy of each element.
1535 unsigned Idx = 0;
1536 unsigned Size = Privates.size();
1537 for (const Expr *Private : Privates) {
1538 Address SrcElementAddr = Address::invalid();
1539 Address DestElementAddr = Address::invalid();
1540 Address DestElementPtrAddr = Address::invalid();
1541 // Should we shuffle in an element from a remote lane?
1542 bool ShuffleInElement = false;
1543 // Set to true to update the pointer in the dest Reduce list to a
1544 // newly created element.
1545 bool UpdateDestListPtr = false;
1546 // Increment the src or dest pointer to the scratchpad, for each
1547 // new element.
1548 bool IncrScratchpadSrc = false;
1549 bool IncrScratchpadDest = false;
1550 QualType PrivatePtrType = C.getPointerType(Private->getType());
1551 llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType);
1552
1553 switch (Action) {
1554 case RemoteLaneToThread: {
1555 // Step 1.1: Get the address for the src element in the Reduce list.
1556 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1557 SrcElementAddr =
1558 CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1559 SrcElementPtrAddr, PrivateLlvmPtrType),
1560 PrivatePtrType->castAs<PointerType>());
1561
1562 // Step 1.2: Create a temporary to store the element in the destination
1563 // Reduce list.
1564 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1565 DestElementAddr =
1566 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1567 ShuffleInElement = true;
1568 UpdateDestListPtr = true;
1569 break;
1570 }
1571 case ThreadCopy: {
1572 // Step 1.1: Get the address for the src element in the Reduce list.
1573 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1574 SrcElementAddr =
1575 CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1576 SrcElementPtrAddr, PrivateLlvmPtrType),
1577 PrivatePtrType->castAs<PointerType>());
1578
1579 // Step 1.2: Get the address for dest element. The destination
1580 // element has already been created on the thread's stack.
1581 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1582 DestElementAddr =
1583 CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1584 DestElementPtrAddr, PrivateLlvmPtrType),
1585 PrivatePtrType->castAs<PointerType>());
1586 break;
1587 }
1588 case ThreadToScratchpad: {
1589 // Step 1.1: Get the address for the src element in the Reduce list.
1590 Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1591 SrcElementAddr =
1592 CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1593 SrcElementPtrAddr, PrivateLlvmPtrType),
1594 PrivatePtrType->castAs<PointerType>());
1595
1596 // Step 1.2: Get the address for dest element:
1597 // address = base + index * ElementSizeInChars.
1598 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1599 llvm::Value *CurrentOffset =
1600 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1601 llvm::Value *ScratchPadElemAbsolutePtrVal =
1602 Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
1603 ScratchPadElemAbsolutePtrVal =
1604 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1605 DestElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty,
1606 C.getTypeAlignInChars(Private->getType()));
1607 IncrScratchpadDest = true;
1608 break;
1609 }
1610 case ScratchpadToThread: {
1611 // Step 1.1: Get the address for the src element in the scratchpad.
1612 // address = base + index * ElementSizeInChars.
1613 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1614 llvm::Value *CurrentOffset =
1615 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1616 llvm::Value *ScratchPadElemAbsolutePtrVal =
1617 Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
1618 ScratchPadElemAbsolutePtrVal =
1619 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1620 SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty,
1621 C.getTypeAlignInChars(Private->getType()));
1622 IncrScratchpadSrc = true;
1623
1624 // Step 1.2: Create a temporary to store the element in the destination
1625 // Reduce list.
1626 DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1627 DestElementAddr =
1628 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1629 UpdateDestListPtr = true;
1630 break;
1631 }
1632 }
1633
1634 // Regardless of the source and destination of the copy, we emit the
1635 // load of the source element, as it is required in all directions.
1636 SrcElementAddr = Bld.CreateElementBitCast(
1637 SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
1638 DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
1639 SrcElementAddr.getElementType());
1640
1641 // Now that all active lanes have read the element in the
1642 // Reduce list, shuffle over the value from the remote lane.
1643 if (ShuffleInElement) {
1644 shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
1645 RemoteLaneOffset, Private->getExprLoc());
1646 } else {
1647 switch (CGF.getEvaluationKind(Private->getType())) {
1648 case TEK_Scalar: {
1649 llvm::Value *Elem = CGF.EmitLoadOfScalar(
1650 SrcElementAddr, /*Volatile=*/false, Private->getType(),
1651 Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
1652 TBAAAccessInfo());
1653 // Store the source element value to the dest element address.
1654 CGF.EmitStoreOfScalar(
1655 Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
1656 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
1657 break;
1658 }
1659 case TEK_Complex: {
1660 CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
1661 CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1662 Private->getExprLoc());
1663 CGF.EmitStoreOfComplex(
1664 Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1665 /*isInit=*/false);
1666 break;
1667 }
1668 case TEK_Aggregate:
1669 CGF.EmitAggregateCopy(
1670 CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1671 CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1672 Private->getType(), AggValueSlot::DoesNotOverlap);
1673 break;
1674 }
1675 }
1676
1677 // Step 3.1: Modify reference in dest Reduce list as needed.
1678 // Modifying the reference in Reduce list to point to the newly
1679 // created element. The element is live in the current function
1680 // scope and that of functions it invokes (i.e., reduce_function).
1681 // RemoteReduceData[i] = (void*)&RemoteElem
1682 if (UpdateDestListPtr) {
1683 CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
1684 DestElementAddr.getPointer(), CGF.VoidPtrTy),
1685 DestElementPtrAddr, /*Volatile=*/false,
1686 C.VoidPtrTy);
1687 }
1688
1689 // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
1690 // address of the next element in scratchpad memory, unless we're currently
1691 // processing the last one. Memory alignment is also taken care of here.
1692 if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
1693 // FIXME: This code doesn't make any sense, it's trying to perform
1694 // integer arithmetic on pointers.
1695 llvm::Value *ScratchpadBasePtr =
1696 IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
1697 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1698 ScratchpadBasePtr = Bld.CreateNUWAdd(
1699 ScratchpadBasePtr,
1700 Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
1701
1702 // Take care of global memory alignment for performance
1703 ScratchpadBasePtr = Bld.CreateNUWSub(
1704 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
1705 ScratchpadBasePtr = Bld.CreateUDiv(
1706 ScratchpadBasePtr,
1707 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
1708 ScratchpadBasePtr = Bld.CreateNUWAdd(
1709 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
1710 ScratchpadBasePtr = Bld.CreateNUWMul(
1711 ScratchpadBasePtr,
1712 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
1713
1714 if (IncrScratchpadDest)
1715 DestBase =
1716 Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign());
1717 else /* IncrScratchpadSrc = true */
1718 SrcBase =
1719 Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign());
1720 }
1721
1722 ++Idx;
1723 }
1724}
1725
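Despite the FIXME, the arithmetic at the end of the loop has a recognizable shape: treat the scratchpad pointer as an integer, advance it past one full row (ScratchpadWidth * ElementSizeInChars), and round the result up to the next multiple of GlobalMemoryAlignment. Written as plain integer arithmetic, with illustrative values only (this is not Clang code):

#include <cstdint>
#include <cstdio>

// The NUWAdd / sub 1 / udiv align / add 1 / mul align sequence above is an
// advance-then-align-up computation on the pointer's integer value.
static uint64_t advanceAndAlignUp(uint64_t Base, uint64_t RowBytes,
                                  uint64_t Alignment) {
  uint64_t Advanced = Base + RowBytes;
  return ((Advanced - 1) / Alignment + 1) * Alignment;
}

int main() {
  // With a hypothetical 256-byte GlobalMemoryAlignment:
  std::printf("%llu\n", (unsigned long long)advanceAndAlignUp(1000, 40, 256)); // 1280
  std::printf("%llu\n", (unsigned long long)advanceAndAlignUp(768, 256, 256)); // 1024
  return 0;
}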
1726/// This function emits a helper that gathers Reduce lists from the first
1727/// lane of every active warp to lanes in the first warp.
1728///
1729/// void inter_warp_copy_func(void* reduce_data, num_warps)
1730/// shared smem[warp_size];
1731/// For all data entries D in reduce_data:
1732/// sync
1733/// If (I am the first lane in each warp)
1734/// Copy my local D to smem[warp_id]
1735/// sync
1736/// if (I am the first warp)
1737/// Copy smem[thread_id] to my local D
1738static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
1739 ArrayRef<const Expr *> Privates,
1740 QualType ReductionArrayTy,
1741 SourceLocation Loc) {
1742 ASTContext &C = CGM.getContext();
1743 llvm::Module &M = CGM.getModule();
1744
1745 // ReduceList: thread local Reduce list.
1746 // At the stage of the computation when this function is called, partially
1747 // aggregated values reside in the first lane of every active warp.
1748 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1749 C.VoidPtrTy, ImplicitParamDecl::Other);
1750 // NumWarps: number of warps active in the parallel region. This could
1751 // be smaller than 32 (max warps in a CTA) for partial block reduction.
1752 ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1753 C.getIntTypeForBitwidth(32, /* Signed */ true),
1754 ImplicitParamDecl::Other);
1755 FunctionArgList Args;
1756 Args.push_back(&ReduceListArg);
1757 Args.push_back(&NumWarpsArg);
1758
1759 const CGFunctionInfo &CGFI =
1760 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1761 auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
1762 llvm::GlobalValue::InternalLinkage,
1763 "_omp_reduction_inter_warp_copy_func", &M);
1764 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
1765 Fn->setDoesNotRecurse();
1766 CodeGenFunction CGF(CGM);
1767 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
1768
1769 CGBuilderTy &Bld = CGF.Builder;
1770
1771 // This array is used as a medium to transfer, one reduce element at a time,
1772 // the data from the first lane of every warp to lanes in the first warp
1773 // in order to perform the final step of a reduction in a parallel region
1774 // (reduction across warps). The array is placed in NVPTX __shared__ memory
1775 // for reduced latency, as well as to have a distinct copy for concurrently
1776 // executing target regions. The array is declared with common linkage so
1777 // as to be shared across compilation units.
1778 StringRef TransferMediumName =
1779 "__openmp_nvptx_data_transfer_temporary_storage";
1780 llvm::GlobalVariable *TransferMedium =
1781 M.getGlobalVariable(TransferMediumName);
1782 unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
1783 if (!TransferMedium) {
[11] Assuming 'TransferMedium' is non-null
[12] Taking false branch
1784 auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
1785 unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
1786 TransferMedium = new llvm::GlobalVariable(
1787 M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
1788 llvm::UndefValue::get(Ty), TransferMediumName,
1789 /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
1790 SharedAddressSpace);
1791 CGM.addCompilerUsedGlobal(TransferMedium);
1792 }
1793
1794 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1795 // Get the CUDA thread id of the current OpenMP thread on the GPU.
1796 llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1797 // nvptx_lane_id = nvptx_id % warpsize
1798 llvm::Value *LaneID = getNVPTXLaneID(CGF);
[13] Calling 'getNVPTXLaneID'
1799 // nvptx_warp_id = nvptx_id / warpsize
1800 llvm::Value *WarpID = getNVPTXWarpID(CGF);
1801
1802 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1803 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
1804 Address LocalReduceList(
1805 Bld.CreatePointerBitCastOrAddrSpaceCast(
1806 CGF.EmitLoadOfScalar(
1807 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
1808 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
1809 ElemTy->getPointerTo()),
1810 ElemTy, CGF.getPointerAlign());
1811
1812 unsigned Idx = 0;
1813 for (const Expr *Private : Privates) {
1814 //
1815 // Warp master copies reduce element to transfer medium in __shared__
1816 // memory.
1817 //
1818 unsigned RealTySize =
1819 C.getTypeSizeInChars(Private->getType())
1820 .alignTo(C.getTypeAlignInChars(Private->getType()))
1821 .getQuantity();
1822 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
1823 unsigned NumIters = RealTySize / TySize;
1824 if (NumIters == 0)
1825 continue;
1826 QualType CType = C.getIntTypeForBitwidth(
1827 C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
1828 llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
1829 CharUnits Align = CharUnits::fromQuantity(TySize);
1830 llvm::Value *Cnt = nullptr;
1831 Address CntAddr = Address::invalid();
1832 llvm::BasicBlock *PrecondBB = nullptr;
1833 llvm::BasicBlock *ExitBB = nullptr;
1834 if (NumIters > 1) {
1835 CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
1836 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
1837 /*Volatile=*/false, C.IntTy);
1838 PrecondBB = CGF.createBasicBlock("precond");
1839 ExitBB = CGF.createBasicBlock("exit");
1840 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
1841 // There is no need to emit line number for unconditional branch.
1842 (void)ApplyDebugLocation::CreateEmpty(CGF);
1843 CGF.EmitBlock(PrecondBB);
1844 Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
1845 llvm::Value *Cmp =
1846 Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
1847 Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
1848 CGF.EmitBlock(BodyBB);
1849 }
1850 // kmpc_barrier.
1851 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1852 /*EmitChecks=*/false,
1853 /*ForceSimpleCall=*/true);
1854 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
1855 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
1856 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
1857
1858 // if (lane_id == 0)
1859 llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
1860 Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
1861 CGF.EmitBlock(ThenBB);
1862
1863 // Reduce element = LocalReduceList[i]
1864 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
1865 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
1866 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
1867 // elemptr = ((CopyType*)(elemptrptr)) + I
1868 Address ElemPtr(ElemPtrPtr, CGF.Int8Ty, Align);
1869 ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
1870 if (NumIters > 1)
1871 ElemPtr = Bld.CreateGEP(ElemPtr, Cnt);
1872
1873 // Get pointer to location in transfer medium.
1874 // MediumPtr = &medium[warp_id]
1875 llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
1876 TransferMedium->getValueType(), TransferMedium,
1877 {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
1878 // Casting to actual data type.
1879 // MediumPtr = (CopyType*)MediumPtrAddr;
1880 Address MediumPtr(
1881 Bld.CreateBitCast(
1882 MediumPtrVal,
1883 CopyType->getPointerTo(
1884 MediumPtrVal->getType()->getPointerAddressSpace())),
1885 CopyType, Align);
1886
1887 // elem = *elemptr
1888 //*MediumPtr = elem
1889 llvm::Value *Elem = CGF.EmitLoadOfScalar(
1890 ElemPtr, /*Volatile=*/false, CType, Loc,
1891 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
1892 // Store the source element value to the dest element address.
1893 CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
1894 LValueBaseInfo(AlignmentSource::Type),
1895 TBAAAccessInfo());
1896
1897 Bld.CreateBr(MergeBB);
1898
1899 CGF.EmitBlock(ElseBB);
1900 Bld.CreateBr(MergeBB);
1901
1902 CGF.EmitBlock(MergeBB);
1903
1904 // kmpc_barrier.
1905 CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1906 /*EmitChecks=*/false,
1907 /*ForceSimpleCall=*/true);
1908
1909 //
1910 // Warp 0 copies reduce element from transfer medium.
1911 //
1912 llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
1913 llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
1914 llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
1915
1916 Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
1917 llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
1918 AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
1919
1920 // Up to 32 threads in warp 0 are active.
1921 llvm::Value *IsActiveThread =
1922 Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
1923 Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
1924
1925 CGF.EmitBlock(W0ThenBB);
1926
1927 // SrcMediumPtr = &medium[tid]
1928 llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
1929 TransferMedium->getValueType(), TransferMedium,
1930 {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
1931 // SrcMediumVal = *SrcMediumPtr;
1932 Address SrcMediumPtr(
1933 Bld.CreateBitCast(
1934 SrcMediumPtrVal,
1935 CopyType->getPointerTo(
1936 SrcMediumPtrVal->getType()->getPointerAddressSpace())),
1937 CopyType, Align);
1938
1939 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
1940 Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
1941 llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
1942 TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
1943 Address TargetElemPtr(TargetElemPtrVal, CGF.Int8Ty, Align);
1944 TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
1945 if (NumIters > 1)
1946 TargetElemPtr = Bld.CreateGEP(TargetElemPtr, Cnt);
1947
1948 // *TargetElemPtr = SrcMediumVal;
1949 llvm::Value *SrcMediumValue =
1950 CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
1951 CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
1952 CType);
1953 Bld.CreateBr(W0MergeBB);
1954
1955 CGF.EmitBlock(W0ElseBB);
1956 Bld.CreateBr(W0MergeBB);
1957
1958 CGF.EmitBlock(W0MergeBB);
1959
1960 if (NumIters > 1) {
1961 Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
1962 CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
1963 CGF.EmitBranch(PrecondBB);
1964 (void)ApplyDebugLocation::CreateEmpty(CGF);
1965 CGF.EmitBlock(ExitBB);
1966 }
1967 RealTySize %= TySize;
1968 }
1969 ++Idx;
1970 }
1971
1972 CGF.FinishFunction();
1973 return Fn;
1974}
1975
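The generated helper follows the pseudocode in its leading comment: for each reduce element, lane 0 of every warp deposits its partial value into a __shared__ slot indexed by warp id, all threads synchronize, and then the first NumWarps threads of warp 0 read the slot indexed by their own thread id. A host-side C++ model of that hand-off for one 32-bit element; the names and the sequential loops are illustrative stand-ins for the per-thread GPU code.

#include <cstdint>
#include <cstdio>
#include <vector>

// Model of one round of the inter-warp copy: PerWarpPartial[w] is the value
// held by lane 0 of warp w; the return value is what thread t of warp 0
// reads back out of the transfer medium after the barrier.
static std::vector<uint32_t>
interWarpCopy(const std::vector<uint32_t> &PerWarpPartial) {
  std::vector<uint32_t> TransferMedium(PerWarpPartial.size());
  // Phase 1: "if (lane_id == 0) medium[warp_id] = my partial;"
  for (size_t WarpID = 0; WarpID < PerWarpPartial.size(); ++WarpID)
    TransferMedium[WarpID] = PerWarpPartial[WarpID];
  // A __kmpc_barrier call separates the two phases on the device.
  // Phase 2: "if (thread_id < num_warps) my partial = medium[thread_id];"
  std::vector<uint32_t> Warp0View(TransferMedium.begin(), TransferMedium.end());
  return Warp0View;
}

int main() {
  std::vector<uint32_t> Partials = {10, 20, 30};
  std::vector<uint32_t> Gathered = interWarpCopy(Partials);
  std::printf("warp 0, thread 1 now holds %u\n", Gathered[1]);
  return 0;
}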
1976/// Emit a helper that reduces data across two OpenMP threads (lanes)
1977/// in the same warp. It uses shuffle instructions to copy over data from
1978/// a remote lane's stack. The reduction algorithm performed is specified
1979/// by the fourth parameter.
1980///
1981/// Algorithm Versions.
1982/// Full Warp Reduce (argument value 0):
1983/// This algorithm assumes that all 32 lanes are active and gathers
1984/// data from these 32 lanes, producing a single resultant value.
1985/// Contiguous Partial Warp Reduce (argument value 1):
1986/// This algorithm assumes that only a *contiguous* subset of lanes
1987/// are active. This happens for the last warp in a parallel region
1988/// when the user specified num_threads is not an integer multiple of
1989/// 32. This contiguous subset always starts with the zeroth lane.
1990/// Partial Warp Reduce (argument value 2):
1991/// This algorithm gathers data from any number of lanes at any position.
1992/// All reduced values are stored in the lowest possible lane. The set
1993/// of problems every algorithm addresses is a super set of those
1994/// addressable by algorithms with a lower version number. Overhead
1995/// increases as algorithm version increases.
1996///
1997/// Terminology
1998/// Reduce element:
1999/// Reduce element refers to the individual data field with primitive
2000/// data types to be combined and reduced across threads.
2001/// Reduce list:
2002/// Reduce list refers to a collection of local, thread-private
2003/// reduce elements.
2004/// Remote Reduce list:
2005/// Remote Reduce list refers to a collection of remote (relative to
2006/// the current thread) reduce elements.
2007///
2008/// We distinguish between three states of threads that are important to
2009/// the implementation of this function.
2010/// Alive threads:
2011/// Threads in a warp executing the SIMT instruction, as distinguished from
2012/// threads that are inactive due to divergent control flow.
2013/// Active threads:
2014/// The minimal set of threads that has to be alive upon entry to this
2015/// function. The computation is correct iff active threads are alive.
2016/// Some threads are alive but they are not active because they do not
2017/// contribute to the computation in any useful manner. Turning them off
2018/// may introduce control flow overheads without any tangible benefits.
2019/// Effective threads:
2020/// In order to comply with the argument requirements of the shuffle
2021/// function, we must keep all lanes holding data alive. But at most
2022/// half of them perform value aggregation; we refer to this half of
2023/// threads as effective. The other half is simply handing off their
2024/// data.
2025///
2026/// Procedure
2027/// Value shuffle:
2028/// In this step active threads transfer data from higher lane positions
2029/// in the warp to lower lane positions, creating Remote Reduce list.
2030/// Value aggregation:
2031/// In this step, effective threads combine their thread local Reduce list
2032/// with Remote Reduce list and store the result in the thread local
2033/// Reduce list.
2034/// Value copy:
2035/// In this step, we deal with the assumption made by algorithm 2
2036/// (i.e. contiguity assumption). When we have an odd number of lanes
2037/// active, say 2k+1, only k threads will be effective and therefore k
2038/// new values will be produced. However, the Reduce list owned by the
2039/// (2k+1)th thread is ignored in the value aggregation. Therefore
2040/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
2041/// that the contiguity assumption still holds.
2042static llvm::Function *emitShuffleAndReduceFunction(
2043 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2044 QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
2045 ASTContext &C = CGM.getContext();
2046
2047 // Thread local Reduce list used to host the values of data to be reduced.
2048 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2049 C.VoidPtrTy, ImplicitParamDecl::Other);
2050 // Current lane id; could be logical.
2051 ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2052 ImplicitParamDecl::Other);
2053 // Offset of the remote source lane relative to the current lane.
2054 ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2055 C.ShortTy, ImplicitParamDecl::Other);
2056 // Algorithm version. This is expected to be known at compile time.
2057 ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2058 C.ShortTy, ImplicitParamDecl::Other);
2059 FunctionArgList Args;
2060 Args.push_back(&ReduceListArg);
2061 Args.push_back(&LaneIDArg);
2062 Args.push_back(&RemoteLaneOffsetArg);
2063 Args.push_back(&AlgoVerArg);
2064
2065 const CGFunctionInfo &CGFI =
2066 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2067 auto *Fn = llvm::Function::Create(
2068 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2069 "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2070 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2071 Fn->setDoesNotRecurse();
2072
2073 CodeGenFunction CGF(CGM);
2074 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2075
2076 CGBuilderTy &Bld = CGF.Builder;
2077
2078 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2079 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2080 Address LocalReduceList(
2081 Bld.CreatePointerBitCastOrAddrSpaceCast(
2082 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2083 C.VoidPtrTy, SourceLocation()),
2084 ElemTy->getPointerTo()),
2085 ElemTy, CGF.getPointerAlign());
2086
2087 Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2088 llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2089 AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2090
2091 Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2092 llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2093 AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2094
2095 Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2096 llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2097 AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2098
2099 // Create a local thread-private variable to host the Reduce list
2100 // from a remote lane.
2101 Address RemoteReduceList =
2102 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2103
2104 // This loop iterates through the list of reduce elements and copies,
2105 // element by element, from a remote lane in the warp to RemoteReduceList,
2106 // hosted on the thread's stack.
2107 emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
2108 LocalReduceList, RemoteReduceList,
2109 {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
2110 /*ScratchpadIndex=*/nullptr,
2111 /*ScratchpadWidth=*/nullptr});
2112
2113 // The actions to be performed on the Remote Reduce list depend
2114 // on the algorithm version.
2115 //
2116 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2117 // LaneId % 2 == 0 && Offset > 0):
2118 // do the reduction value aggregation
2119 //
2120 // The thread local variable Reduce list is mutated in place to host the
2121 // reduced data, which is the aggregated value produced from local and
2122 // remote lanes.
2123 //
2124 // Note that AlgoVer is expected to be a constant integer known at compile
2125 // time.
2126 // When AlgoVer==0, the first conjunction evaluates to true, making
2127 // the entire predicate true during compile time.
2128 // When AlgoVer==1, the second conjunction has only the second part to be
2129 // evaluated at runtime; the other conjunctions evaluate to false
2130 // at compile time.
2131 // When AlgoVer==2, the third conjunction has only the second part to be
2132 // evaluated at runtime; the other conjunctions evaluate to false
2133 // at compile time.
2134 llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
2135
2136 llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2137 llvm::Value *CondAlgo1 = Bld.CreateAnd(
2138 Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
2139
2140 llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
2141 llvm::Value *CondAlgo2 = Bld.CreateAnd(
2142 Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
2143 CondAlgo2 = Bld.CreateAnd(
2144 CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
2145
2146 llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
2147 CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
2148
2149 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2150 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2151 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2152 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
2153
2154 CGF.EmitBlock(ThenBB);
2155 // reduce_function(LocalReduceList, RemoteReduceList)
2156 llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2157 LocalReduceList.getPointer(), CGF.VoidPtrTy);
2158 llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2159 RemoteReduceList.getPointer(), CGF.VoidPtrTy);
2160 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2161 CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
2162 Bld.CreateBr(MergeBB);
2163
2164 CGF.EmitBlock(ElseBB);
2165 Bld.CreateBr(MergeBB);
2166
2167 CGF.EmitBlock(MergeBB);
2168
2169 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2170 // Reduce list.
2171 Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2172 llvm::Value *CondCopy = Bld.CreateAnd(
2173 Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
2174
2175 llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
2176 llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
2177 llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
2178 Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2179
2180 CGF.EmitBlock(CpyThenBB);
2181 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
2182 RemoteReduceList, LocalReduceList);
2183 Bld.CreateBr(CpyMergeBB);
2184
2185 CGF.EmitBlock(CpyElseBB);
2186 Bld.CreateBr(CpyMergeBB);
2187
2188 CGF.EmitBlock(CpyMergeBB);
2189
2190 CGF.FinishFunction();
2191 return Fn;
2192}
2193
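The predicate that gates value aggregation combines the three algorithm versions described in the comment above, and the follow-up condition gates the contiguity-preserving copy. A stand-alone C++ restatement of both conditions, purely for illustration (the function names are invented):

#include <cstdint>
#include <cstdio>

// shouldReduce mirrors CondReduce (CondAlgo0 || CondAlgo1 || CondAlgo2);
// shouldCopyRemoteToLocal mirrors CondCopy. Comparisons keep the signedness
// used above: unsigned for lane/offset comparisons, signed for Offset > 0.
static bool shouldReduce(uint16_t AlgoVer, uint16_t LaneId, int16_t Offset) {
  bool Algo0 = AlgoVer == 0;
  bool Algo1 = AlgoVer == 1 && LaneId < static_cast<uint16_t>(Offset);
  bool Algo2 = AlgoVer == 2 && (LaneId % 2 == 0) && Offset > 0;
  return Algo0 || Algo1 || Algo2;
}

static bool shouldCopyRemoteToLocal(uint16_t AlgoVer, uint16_t LaneId,
                                    int16_t Offset) {
  return AlgoVer == 1 && LaneId >= static_cast<uint16_t>(Offset);
}

int main() {
  std::printf("algo 1, lane 3, offset 8: reduce=%d copy=%d\n",
              shouldReduce(1, 3, 8), shouldCopyRemoteToLocal(1, 3, 8));
  std::printf("algo 1, lane 9, offset 8: reduce=%d copy=%d\n",
              shouldReduce(1, 9, 8), shouldCopyRemoteToLocal(1, 9, 8));
  return 0;
}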
2194/// This function emits a helper that copies all the reduction variables from
2195/// the team into the provided global buffer for the reduction variables.
2196///
2197/// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
2198/// For all data entries D in reduce_data:
2199/// Copy local D to buffer.D[Idx]
2200static llvm::Value *emitListToGlobalCopyFunction(
2201 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2202 QualType ReductionArrayTy, SourceLocation Loc,
2203 const RecordDecl *TeamReductionRec,
2204 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2205 &VarFieldMap) {
2206 ASTContext &C = CGM.getContext();
2207
2208 // Buffer: global reduction buffer.
2209 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2210 C.VoidPtrTy, ImplicitParamDecl::Other);
2211 // Idx: index of the buffer.
2212 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2213 ImplicitParamDecl::Other);
2214 // ReduceList: thread local Reduce list.
2215 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2216 C.VoidPtrTy, ImplicitParamDecl::Other);
2217 FunctionArgList Args;
2218 Args.push_back(&BufferArg);
2219 Args.push_back(&IdxArg);
2220 Args.push_back(&ReduceListArg);
2221
2222 const CGFunctionInfo &CGFI =
2223 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2224 auto *Fn = llvm::Function::Create(
2225 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2226 "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
2227 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2228 Fn->setDoesNotRecurse();
2229 CodeGenFunction CGF(CGM);
2230 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2231
2232 CGBuilderTy &Bld = CGF.Builder;
2233
2234 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2235 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2236 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2237 Address LocalReduceList(
2238 Bld.CreatePointerBitCastOrAddrSpaceCast(
2239 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2240 C.VoidPtrTy, Loc),
2241 ElemTy->getPointerTo()),
2242 ElemTy, CGF.getPointerAlign());
2243 QualType StaticTy = C.getRecordType(TeamReductionRec);
2244 llvm::Type *LLVMReductionsBufferTy =
2245 CGM.getTypes().ConvertTypeForMem(StaticTy);
2246 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2247 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2248 LLVMReductionsBufferTy->getPointerTo());
2249 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2250 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2251 /*Volatile=*/false, C.IntTy,
2252 Loc)};
2253 unsigned Idx = 0;
2254 for (const Expr *Private : Privates) {
2255 // Reduce element = LocalReduceList[i]
2256 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2257 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2258 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2259 // elemptr = ((CopyType*)(elemptrptr)) + I
2260 ElemTy = CGF.ConvertTypeForMem(Private->getType());
2261 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2262 ElemPtrPtr, ElemTy->getPointerTo());
2263 Address ElemPtr =
2264 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2265 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2266 // Global = Buffer.VD[Idx];
2267 const FieldDecl *FD = VarFieldMap.lookup(VD);
2268 LValue GlobLVal = CGF.EmitLValueForField(
2269 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2270 Address GlobAddr = GlobLVal.getAddress(CGF);
2271 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(),
2272 GlobAddr.getPointer(), Idxs);
2273 GlobLVal.setAddress(Address(BufferPtr,
2274 CGF.ConvertTypeForMem(Private->getType()),
2275 GlobAddr.getAlignment()));
2276 switch (CGF.getEvaluationKind(Private->getType())) {
2277 case TEK_Scalar: {
2278 llvm::Value *V = CGF.EmitLoadOfScalar(
2279 ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
2280 LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2281 CGF.EmitStoreOfScalar(V, GlobLVal);
2282 break;
2283 }
2284 case TEK_Complex: {
2285 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
2286 CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
2287 CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
2288 break;
2289 }
2290 case TEK_Aggregate:
2291 CGF.EmitAggregateCopy(GlobLVal,
2292 CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2293 Private->getType(), AggValueSlot::DoesNotOverlap);
2294 break;
2295 }
2296 ++Idx;
2297 }
2298
2299 CGF.FinishFunction();
2300 return Fn;
2301}
2302
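Concretely, the helper writes each thread-local reduce element into row Idx of the matching field of the team-reduction record. A schematic C++ equivalent for a hypothetical record with two reduction variables; the layout and names are invented, and only the buffer.D[Idx] indexing comes from the comment above.

#include <cstdio>

// Invented layout standing in for the team reduction record: each field is
// an array indexed by the team's buffer slot Idx.
struct ReductionBuffer {
  float Foo[128];
  double Bar[128];
};

// Schematic equivalent of list_to_global_copy_func for two reduce elements.
static void listToGlobalCopy(ReductionBuffer *Buffer, int Idx,
                             const float *LocalFoo, const double *LocalBar) {
  Buffer->Foo[Idx] = *LocalFoo; // "Copy local D to buffer.D[Idx]"
  Buffer->Bar[Idx] = *LocalBar;
}

int main() {
  static ReductionBuffer Buf;
  float Foo = 1.5f;
  double Bar = 2.5;
  listToGlobalCopy(&Buf, 3, &Foo, &Bar);
  std::printf("Buf.Foo[3]=%g Buf.Bar[3]=%g\n", Buf.Foo[3], Buf.Bar[3]);
  return 0;
}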
2303/// This function emits a helper that reduces all the reduction variables from
2304/// the team into the provided global buffer for the reduction variables.
2305///
2306/// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
2307/// void *GlobPtrs[];
2308/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2309/// ...
2310/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2311/// reduce_function(GlobPtrs, reduce_data);
2312static llvm::Value *emitListToGlobalReduceFunction(
2313 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2314 QualType ReductionArrayTy, SourceLocation Loc,
2315 const RecordDecl *TeamReductionRec,
2316 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2317 &VarFieldMap,
2318 llvm::Function *ReduceFn) {
2319 ASTContext &C = CGM.getContext();
2320
2321 // Buffer: global reduction buffer.
2322 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2323 C.VoidPtrTy, ImplicitParamDecl::Other);
2324 // Idx: index of the buffer.
2325 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2326 ImplicitParamDecl::Other);
2327 // ReduceList: thread local Reduce list.
2328 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2329 C.VoidPtrTy, ImplicitParamDecl::Other);
2330 FunctionArgList Args;
2331 Args.push_back(&BufferArg);
2332 Args.push_back(&IdxArg);
2333 Args.push_back(&ReduceListArg);
2334
2335 const CGFunctionInfo &CGFI =
2336 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2337 auto *Fn = llvm::Function::Create(
2338 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2339 "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
2340 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2341 Fn->setDoesNotRecurse();
2342 CodeGenFunction CGF(CGM);
2343 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2344
2345 CGBuilderTy &Bld = CGF.Builder;
2346
2347 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2348 QualType StaticTy = C.getRecordType(TeamReductionRec);
2349 llvm::Type *LLVMReductionsBufferTy =
2350 CGM.getTypes().ConvertTypeForMem(StaticTy);
2351 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2352 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2353 LLVMReductionsBufferTy->getPointerTo());
2354
2355 // 1. Build a list of reduction variables.
2356 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2357 Address ReductionList =
2358 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2359 auto IPriv = Privates.begin();
2360 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2361 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2362 /*Volatile=*/false, C.IntTy,
2363 Loc)};
2364 unsigned Idx = 0;
2365 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2366 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2367 // Global = Buffer.VD[Idx];
2368 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2369 const FieldDecl *FD = VarFieldMap.lookup(VD);
2370 LValue GlobLVal = CGF.EmitLValueForField(
2371 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2372 Address GlobAddr = GlobLVal.getAddress(CGF);
2373 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2374 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2375 llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
2376 CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
2377 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2378 // Store array size.
2379 ++Idx;
2380 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2381 llvm::Value *Size = CGF.Builder.CreateIntCast(
2382 CGF.getVLASize(
2383 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2384 .NumElts,
2385 CGF.SizeTy, /*isSigned=*/false);
2386 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2387 Elem);
2388 }
2389 }
2390
2391 // Call reduce_function(GlobalReduceList, ReduceList)
2392 llvm::Value *GlobalReduceList =
2393 CGF.EmitCastToVoidPtr(ReductionList.getPointer());
2394 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2395 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2396 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2397 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2398 CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
2399 CGF.FinishFunction();
2400 return Fn;
2401}
2402
2403 /// This function emits a helper that copies all the reduction variables
2404 /// from the provided global buffer back into the team's reduce data.
2405 ///
2406 /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
2407/// For all data entries D in reduce_data:
2408/// Copy buffer.D[Idx] to local D;
2409static llvm::Value *emitGlobalToListCopyFunction(
2410 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2411 QualType ReductionArrayTy, SourceLocation Loc,
2412 const RecordDecl *TeamReductionRec,
2413 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2414 &VarFieldMap) {
2415 ASTContext &C = CGM.getContext();
2416
2417 // Buffer: global reduction buffer.
2418 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2419 C.VoidPtrTy, ImplicitParamDecl::Other);
2420 // Idx: index of the buffer.
2421 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2422 ImplicitParamDecl::Other);
2423 // ReduceList: thread local Reduce list.
2424 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2425 C.VoidPtrTy, ImplicitParamDecl::Other);
2426 FunctionArgList Args;
2427 Args.push_back(&BufferArg);
2428 Args.push_back(&IdxArg);
2429 Args.push_back(&ReduceListArg);
2430
2431 const CGFunctionInfo &CGFI =
2432 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2433 auto *Fn = llvm::Function::Create(
2434 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2435 "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
2436 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2437 Fn->setDoesNotRecurse();
2438 CodeGenFunction CGF(CGM);
2439 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2440
2441 CGBuilderTy &Bld = CGF.Builder;
2442
2443 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2444 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2445 llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2446 Address LocalReduceList(
2447 Bld.CreatePointerBitCastOrAddrSpaceCast(
2448 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2449 C.VoidPtrTy, Loc),
2450 ElemTy->getPointerTo()),
2451 ElemTy, CGF.getPointerAlign());
2452 QualType StaticTy = C.getRecordType(TeamReductionRec);
2453 llvm::Type *LLVMReductionsBufferTy =
2454 CGM.getTypes().ConvertTypeForMem(StaticTy);
2455 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2456 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2457 LLVMReductionsBufferTy->getPointerTo());
2458
2459 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2460 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2461 /*Volatile=*/false, C.IntTy,
2462 Loc)};
2463 unsigned Idx = 0;
2464 for (const Expr *Private : Privates) {
2465 // Reduce element = LocalReduceList[i]
2466 Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2467 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2468 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2469 // elemptr = ((CopyType*)(elemptrptr)) + I
2470 ElemTy = CGF.ConvertTypeForMem(Private->getType());
2471 ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2472 ElemPtrPtr, ElemTy->getPointerTo());
2473 Address ElemPtr =
2474 Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2475 const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2476 // Global = Buffer.VD[Idx];
2477 const FieldDecl *FD = VarFieldMap.lookup(VD);
2478 LValue GlobLVal = CGF.EmitLValueForField(
2479 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2480 Address GlobAddr = GlobLVal.getAddress(CGF);
2481 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(),
2482 GlobAddr.getPointer(), Idxs);
2483 GlobLVal.setAddress(Address(BufferPtr,
2484 CGF.ConvertTypeForMem(Private->getType()),
2485 GlobAddr.getAlignment()));
2486 switch (CGF.getEvaluationKind(Private->getType())) {
2487 case TEK_Scalar: {
2488 llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
2489 CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
2490 LValueBaseInfo(AlignmentSource::Type),
2491 TBAAAccessInfo());
2492 break;
2493 }
2494 case TEK_Complex: {
2495 CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
2496 CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2497 /*isInit=*/false);
2498 break;
2499 }
2500 case TEK_Aggregate:
2501 CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2502 GlobLVal, Private->getType(),
2503 AggValueSlot::DoesNotOverlap);
2504 break;
2505 }
2506 ++Idx;
2507 }
2508
2509 CGF.FinishFunction();
2510 return Fn;
2511}
2512
2513 /// This function emits a helper that reduces the reduction variables in the
2514 /// provided global buffer into the team's reduce data.
2515///
2516/// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
2517/// void *GlobPtrs[];
2518/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2519/// ...
2520/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2521/// reduce_function(reduce_data, GlobPtrs);
2522static llvm::Value *emitGlobalToListReduceFunction(
2523 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2524 QualType ReductionArrayTy, SourceLocation Loc,
2525 const RecordDecl *TeamReductionRec,
2526 const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2527 &VarFieldMap,
2528 llvm::Function *ReduceFn) {
2529 ASTContext &C = CGM.getContext();
2530
2531 // Buffer: global reduction buffer.
2532 ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2533 C.VoidPtrTy, ImplicitParamDecl::Other);
2534 // Idx: index of the buffer.
2535 ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2536 ImplicitParamDecl::Other);
2537 // ReduceList: thread local Reduce list.
2538 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2539 C.VoidPtrTy, ImplicitParamDecl::Other);
2540 FunctionArgList Args;
2541 Args.push_back(&BufferArg);
2542 Args.push_back(&IdxArg);
2543 Args.push_back(&ReduceListArg);
2544
2545 const CGFunctionInfo &CGFI =
2546 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2547 auto *Fn = llvm::Function::Create(
2548 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2549 "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
2550 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2551 Fn->setDoesNotRecurse();
2552 CodeGenFunction CGF(CGM);
2553 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2554
2555 CGBuilderTy &Bld = CGF.Builder;
2556
2557 Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2558 QualType StaticTy = C.getRecordType(TeamReductionRec);
2559 llvm::Type *LLVMReductionsBufferTy =
2560 CGM.getTypes().ConvertTypeForMem(StaticTy);
2561 llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2562 CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2563 LLVMReductionsBufferTy->getPointerTo());
2564
2565 // 1. Build a list of reduction variables.
2566 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2567 Address ReductionList =
2568 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2569 auto IPriv = Privates.begin();
2570 llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2571 CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2572 /*Volatile=*/false, C.IntTy,
2573 Loc)};
2574 unsigned Idx = 0;
2575 for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2576 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2577 // Global = Buffer.VD[Idx];
2578 const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2579 const FieldDecl *FD = VarFieldMap.lookup(VD);
2580 LValue GlobLVal = CGF.EmitLValueForField(
2581 CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2582 Address GlobAddr = GlobLVal.getAddress(CGF);
2583 llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2584 GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2585 llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
2586 CGF.EmitStoreOfScalar(Ptr, Elem, /*Volatile=*/false, C.VoidPtrTy);
2587 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2588 // Store array size.
2589 ++Idx;
2590 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2591 llvm::Value *Size = CGF.Builder.CreateIntCast(
2592 CGF.getVLASize(
2593 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2594 .NumElts,
2595 CGF.SizeTy, /*isSigned=*/false);
2596 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2597 Elem);
2598 }
2599 }
2600
2601 // Call reduce_function(ReduceList, GlobalReduceList)
2602 llvm::Value *GlobalReduceList =
2603 CGF.EmitCastToVoidPtr(ReductionList.getPointer());
2604 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2605 llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2606 AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2607 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2608 CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
2609 CGF.FinishFunction();
2610 return Fn;
2611}
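To make the shape of the helper emitted above concrete, the following hand-written C++ sketch mirrors the pseudocode in the comment preceding emitGlobalToListReduceFunction for a hypothetical buffer record with two double fields. The struct, constant, and function names are invented for illustration; they are not the generated code.

  // Illustrative sketch only -- not generated code. TeamRed, kReductionBufNum
  // and the extern reduce_function stand in for the record built by
  // buildRecordForGlobalizedVars, OpenMPCUDAReductionBufNum, and the outlined
  // reduction function, respectively.
  constexpr int kReductionBufNum = 1024;
  struct TeamRed {
    double foo[kReductionBufNum];   // one slot per team buffer index
    double bar[kReductionBufNum];
  };

  extern void reduce_function(void *reduce_data, void *glob_ptrs[]);

  void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data) {
    auto *Buf = static_cast<TeamRed *>(buffer);
    void *GlobPtrs[2];
    GlobPtrs[0] = &Buf->foo[Idx];            // GlobPtrs[0] = (void*)&buffer.D0[Idx]
    GlobPtrs[1] = &Buf->bar[Idx];            // GlobPtrs[N] = (void*)&buffer.DN[Idx]
    reduce_function(reduce_data, GlobPtrs);  // fold the buffer slot into reduce_data
  }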
2612
2613///
2614/// Design of OpenMP reductions on the GPU
2615///
2616/// Consider a typical OpenMP program with one or more reduction
2617/// clauses:
2618///
2619/// float foo;
2620/// double bar;
2621/// #pragma omp target teams distribute parallel for \
2622/// reduction(+:foo) reduction(*:bar)
2623/// for (int i = 0; i < N; i++) {
2624/// foo += A[i]; bar *= B[i];
2625/// }
2626///
2627/// where 'foo' and 'bar' are reduced across all OpenMP threads in
2628/// all teams. In our OpenMP implementation on the NVPTX device an
2629/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
2630/// within a team are mapped to CUDA threads within a threadblock.
2631/// Our goal is to efficiently aggregate values across all OpenMP
2632/// threads such that:
2633///
2634/// - the compiler and runtime are logically concise, and
2635/// - the reduction is performed efficiently in a hierarchical
2636/// manner as follows: within OpenMP threads in the same warp,
2637/// across warps in a threadblock, and finally across teams on
2638/// the NVPTX device.
2639///
2640/// Introduction to Decoupling
2641///
2642/// We would like to decouple the compiler and the runtime so that the
2643/// latter is ignorant of the reduction variables (number, data types)
2644/// and the reduction operators. This allows a simpler interface
2645/// and implementation while still attaining good performance.
2646///
2647/// Pseudocode for the aforementioned OpenMP program generated by the
2648/// compiler is as follows:
2649///
2650/// 1. Create private copies of reduction variables on each OpenMP
2651/// thread: 'foo_private', 'bar_private'
2652/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
2653/// to it and writes the result in 'foo_private' and 'bar_private'
2654/// respectively.
2655/// 3. Call the OpenMP runtime on the GPU to reduce within a team
2656/// and store the result on the team master:
2657///
2658/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
2659/// reduceData, shuffleReduceFn, interWarpCpyFn)
2660///
2661/// where:
2662/// struct ReduceData {
2663/// double *foo;
2664/// double *bar;
2665/// } reduceData
2666/// reduceData.foo = &foo_private
2667/// reduceData.bar = &bar_private
2668///
2669/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
2670/// auxiliary functions generated by the compiler that operate on
2671/// variables of type 'ReduceData'. They aid the runtime in performing
2672/// algorithmic steps in a data-agnostic manner.
2673///
2674/// 'shuffleReduceFn' is a pointer to a function that reduces data
2675/// of type 'ReduceData' across two OpenMP threads (lanes) in the
2676/// same warp. It takes the following arguments as input:
2677///
2678/// a. variable of type 'ReduceData' on the calling lane,
2679/// b. its lane_id,
2680/// c. an offset relative to the current lane_id to generate a
2681/// remote_lane_id. The remote lane contains the second
2682/// variable of type 'ReduceData' that is to be reduced.
2683/// d. an algorithm version parameter determining which reduction
2684/// algorithm to use.
2685///
2686/// 'shuffleReduceFn' retrieves data from the remote lane using
2687/// efficient GPU shuffle intrinsics and reduces, using the
2688/// algorithm specified by the 4th parameter, the two operands
2689/// element-wise. The result is written to the first operand; an
2689/// illustrative sketch of such a function follows this list.
2690///
2691/// Different reduction algorithms are implemented in different
2692/// runtime functions, all calling 'shuffleReduceFn' to perform
2693/// the essential reduction step. Therefore, based on the 4th
2694/// parameter, this function behaves slightly differently to
2695/// cooperate with the runtime to ensure correctness under
2696/// different circumstances.
2697///
2698/// 'InterWarpCpyFn' is a pointer to a function that transfers
2699/// reduced variables across warps. It tunnels, through CUDA
2700/// shared memory, the thread-private data of type 'ReduceData'
2701/// from lane 0 of each warp to a lane in the first warp.
2702/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
2703/// The last team writes the global reduced value to memory.
2704///
2705/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
2706/// reduceData, shuffleReduceFn, interWarpCpyFn,
2707/// scratchpadCopyFn, loadAndReduceFn)
2708///
2709/// 'scratchpadCopyFn' is a helper that stores reduced
2710/// data from the team master to a scratchpad array in
2711/// global memory.
2712///
2713/// 'loadAndReduceFn' is a helper that loads data from
2714/// the scratchpad array and reduces it with the input
2715/// operand.
2716///
2717/// These compiler generated functions hide address
2718/// calculation and alignment information from the runtime.
2719/// 5. if ret == 1:
2720/// The team master of the last team stores the reduced
2721/// result to the globals in memory.
2722/// foo += reduceData.foo; bar *= reduceData.bar
2723///
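As a concrete (non-authoritative) illustration of the 'shuffleReduceFn' interface described in step 3 above, a hand-written version for the foo/bar example might look as follows. The shuffle_down_double helper and the explicit version handling are assumptions made for the sketch; the real function is emitted by emitShuffleAndReduceFunction and operates on an opaque ReduceData.

  // Illustrative sketch only. ReduceData matches the struct shown above; the
  // shuffle helper is an assumed wrapper over the GPU shuffle intrinsics.
  struct ReduceData { double *foo; double *bar; };
  extern double shuffle_down_double(double Val, short Offset);

  void shuffleReduceFn(ReduceData *LHS, short LaneId, short Offset,
                       short AlgoVersion) {
    // Fetch the remote lane's elements via shuffles.
    double RemoteFoo = shuffle_down_double(*LHS->foo, Offset);
    double RemoteBar = shuffle_down_double(*LHS->bar, Offset);

    bool DoReduce;
    if (AlgoVersion == 0)        // full warp reduction
      DoReduce = true;
    else if (AlgoVersion == 1)   // contiguous partial warp reduction
      DoReduce = LaneId < Offset;
    else                         // dispersed partial warp reduction
      DoReduce = LaneId % 2 == 0 && Offset > 0;

    if (DoReduce) {              // combine element-wise with + and *
      *LHS->foo += RemoteFoo;
      *LHS->bar *= RemoteBar;
    } else {                     // otherwise copy the remote element over
      *LHS->foo = RemoteFoo;
      *LHS->bar = RemoteBar;
    }
  }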
2724///
2725/// Warp Reduction Algorithms
2726///
2727/// On the warp level, we have three algorithms implemented in the
2728/// OpenMP runtime depending on the number of active lanes:
2729///
2730/// Full Warp Reduction
2731///
2732/// The reduce algorithm within a warp where all lanes are active
2733/// is implemented in the runtime as follows:
2734///
2735/// full_warp_reduce(void *reduce_data,
2736/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2737/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
2738/// ShuffleReduceFn(reduce_data, 0, offset, 0);
2739/// }
2740///
2741/// The algorithm completes in log(2, WARPSIZE) steps.
2742///
2743/// 'ShuffleReduceFn' is called here with lane_id set to 0 because it is
2744/// not used; this saves instructions by not retrieving lane_id
2745/// from the corresponding special registers. The 4th parameter, which
2746/// represents the version of the algorithm being used, is set to 0 to
2747/// signify full warp reduction.
2748///
2749/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2750///
2751/// #reduce_elem refers to an element in the local lane's data structure
2752/// #remote_elem is retrieved from a remote lane
2753/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2754/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
2755///
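To sanity-check the log(2, WARPSIZE) step count, the short host-side simulation below replays the shuffle-down pattern for a 32-lane warp summing the values 1..32. It is an illustrative stand-in only: an ordinary array plays the role of the warp and of the shuffle intrinsic.

  #include <array>
  #include <cstdio>

  int main() {
    constexpr int WARPSIZE = 32;
    std::array<int, WARPSIZE> Lane;                 // lane-private partial results
    for (int I = 0; I < WARPSIZE; ++I)
      Lane[I] = I + 1;                              // 1 + 2 + ... + 32 == 528

    int Steps = 0;
    for (int Offset = WARPSIZE / 2; Offset > 0; Offset /= 2, ++Steps)
      // shuffle_down: lane I reads lane I + Offset; lanes past the end keep
      // their value, which does not affect the result accumulated in lane 0.
      for (int I = 0; I + Offset < WARPSIZE; ++I)
        Lane[I] += Lane[I + Offset];                // reduce_elem REDUCE_OP remote_elem

    std::printf("steps=%d, lane0=%d\n", Steps, Lane[0]);  // prints steps=5, lane0=528
    return 0;
  }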
2756/// Contiguous Partial Warp Reduction
2757///
2758/// This reduce algorithm is used within a warp where only the first
2759/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
2760/// number of OpenMP threads in a parallel region is not a multiple of
2761/// WARPSIZE. The algorithm is implemented in the runtime as follows:
2762///
2763/// void
2764/// contiguous_partial_reduce(void *reduce_data,
2765/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
2766/// int size, int lane_id) {
2767/// int curr_size;
2768/// int offset;
2769/// curr_size = size;
2770/// offset = curr_size/2;
2771/// while (offset>0) {
2772/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
2773/// curr_size = (curr_size+1)/2;
2774/// offset = curr_size/2;
2775/// }
2776/// }
2777///
2778/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2779///
2780/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2781/// if (lane_id < offset)
2782/// reduce_elem = reduce_elem REDUCE_OP remote_elem
2783/// else
2784/// reduce_elem = remote_elem
2785///
2786/// This algorithm assumes that the data to be reduced are located in a
2787/// contiguous subset of lanes starting from the first. When there is
2788/// an odd number of active lanes, the data in the last lane is not
2789/// aggregated with any other lane's data but is instead copied over.
2790///
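Worked example (assuming the corrected initialization 'offset = curr_size/2' above): with five active lanes holding d0..d4, the first step uses offset 2, so lane 0 becomes d0+d2, lane 1 becomes d1+d3, and lane 2 copies d4; curr_size then becomes 3 and offset 1. The second step folds lane 1 into lane 0 (now d0+d1+d2+d3) while lane 1 copies d4; curr_size becomes 2 and offset stays 1. The third step adds lane 1's d4 into lane 0, curr_size drops to 1, offset to 0, and the loop exits with the full result in lane 0.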
2791/// Dispersed Partial Warp Reduction
2792///
2793/// This algorithm is used within a warp when any discontiguous subset of
2794/// lanes is active. It is used to implement the reduction operation
2795/// across lanes in an OpenMP simd region or in a nested parallel region.
2796///
2797/// void
2798/// dispersed_partial_reduce(void *reduce_data,
2799/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2800/// int size, remote_id;
2801/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
2802/// do {
2803/// remote_id = next_active_lane_id_right_after_me();
2804/// # the above function returns 0 if no active lane
2805/// # is present right after the current lane.
2806/// size = number_of_active_lanes_in_this_warp();
2807/// logical_lane_id /= 2;
2808/// ShuffleReduceFn(reduce_data, logical_lane_id,
2809/// remote_id-1-threadIdx.x, 2);
2810/// } while (logical_lane_id % 2 == 0 && size > 1);
2811/// }
2812///
2813/// There is no assumption made about the initial state of the reduction.
2814/// Any number of lanes (>=1) could be active at any position. The reduction
2815/// result is returned in the first active lane.
2816///
2817/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2818///
2819/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2820/// if (lane_id % 2 == 0 && offset > 0)
2821/// reduce_elem = reduce_elem REDUCE_OP remote_elem
2822/// else
2823/// reduce_elem = remote_elem
2824///
2825///
2826/// Intra-Team Reduction
2827///
2828/// This function, as implemented in the runtime call
2829/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
2830/// threads in a team. It first reduces within a warp using the
2831/// aforementioned algorithms. We then proceed to gather all such
2832/// reduced values at the first warp.
2833///
2834/// The runtime makes use of the function 'InterWarpCpyFn', which copies
2835/// data from each of the "warp masters" (the zeroth lane of each warp, where
2836/// warp-reduced data is held) to the zeroth warp. This step reduces (in
2837/// a mathematical sense) the problem of reduction across warp masters in
2838/// a block to the problem of warp reduction.
2839///
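As a rough picture of what 'InterWarpCpyFn' does for a single 32-bit element, consider the sketch below. The helper names (lane_id, warp_id, team_barrier) and the transfer buffer are assumptions standing in for the hardware-thread-id runtime calls and the shared-memory transfer medium used by emitInterWarpCopyFunction; this is not the emitted code.

  // Illustrative sketch only. transfer_medium models the CUDA __shared__
  // buffer; the extern helpers model runtime/intrinsic calls.
  extern int lane_id();          // 0..WARPSIZE-1 within the warp (assumed)
  extern int warp_id();          // warp index within the team (assumed)
  extern void team_barrier();    // barrier across the whole team (assumed)
  static int transfer_medium[32];

  void inter_warp_copy(int *Elem, int NumActiveWarps) {
    if (lane_id() == 0)                      // each warp master publishes its value
      transfer_medium[warp_id()] = *Elem;
    team_barrier();
    if (warp_id() == 0 && lane_id() < NumActiveWarps)
      *Elem = transfer_medium[lane_id()];    // first warp gathers one value per warp
    team_barrier();
  }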
2840///
2841/// Inter-Team Reduction
2842///
2843/// Once a team has reduced its data to a single value, it is stored in
2844/// a global scratchpad array. Since each team has a distinct slot, this
2845/// can be done without locking.
2846///
2847/// The last team to write to the scratchpad array proceeds to reduce the
2848/// scratchpad array. One or more workers in the last team use the helper
2849/// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
2850/// the k'th worker reduces every k'th element.
2851///
2852/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
2853/// reduce across workers and compute a globally reduced value.
2854///
2855void CGOpenMPRuntimeGPU::emitReduction(
2856 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
2857 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
2858 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
2859 if (!CGF.HaveInsertPoint())
1
Taking false branch
2860 return;
2861
2862 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
2863#ifndef NDEBUG
2864 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
2865#endif
2866
2867 if (Options.SimpleReduction) {
2
Assuming field 'SimpleReduction' is false
2868    assert(!TeamsReduction && !ParallelReduction &&
2869           "Invalid reduction selection in emitReduction.");
2870 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
2871 ReductionOps, Options);
2872 return;
2873 }
2874
2875  assert((TeamsReduction || ParallelReduction) &&
3
Taking false branch
4
Assuming 'TeamsReduction' is false
5
Assuming 'ParallelReduction' is true
6
'?' condition is true
2876         "Invalid reduction selection in emitReduction.");
2877
2878 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
2879 // RedList, shuffle_reduce_func, interwarp_copy_func);
2880 // or
2881 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
2882 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2883 llvm::Value *ThreadId = getThreadID(CGF, Loc);
2884
2885 llvm::Value *Res;
2886 ASTContext &C = CGM.getContext();
2887 // 1. Build a list of reduction variables.
2888 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2889 auto Size = RHSExprs.size();
2890 for (const Expr *E : Privates) {
7
Assuming '__begin1' is equal to '__end1'
2891 if (E->getType()->isVariablyModifiedType())
2892 // Reserve place for array size.
2893 ++Size;
2894 }
2895 llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
2896 QualType ReductionArrayTy =
2897 C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
2898 /*IndexTypeQuals=*/0);
2899 Address ReductionList =
2900 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2901 auto IPriv = Privates.begin();
2902 unsigned Idx = 0;
2903 for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
8
Assuming 'I' is >= 'E'
9
Loop condition is false. Execution continues on line 2923
2904 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2905 CGF.Builder.CreateStore(
2906 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2907 CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
2908 Elem);
2909 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2910 // Store array size.
2911 ++Idx;
2912 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2913 llvm::Value *Size = CGF.Builder.CreateIntCast(
2914 CGF.getVLASize(
2915 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2916 .NumElts,
2917 CGF.SizeTy, /*isSigned=*/false);
2918 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2919 Elem);
2920 }
2921 }
2922
2923 llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2924 ReductionList.getPointer(), CGF.VoidPtrTy);
2925 llvm::Function *ReductionFn =
2926 emitReductionFunction(Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
2927 Privates, LHSExprs, RHSExprs, ReductionOps);
2928 llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
2929 llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
2930 CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
2931 llvm::Value *InterWarpCopyFn =
2932 emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
10
Calling 'emitInterWarpCopyFunction'
2933
2934 if (ParallelReduction) {
2935 llvm::Value *Args[] = {RTLoc,
2936 ThreadId,
2937 CGF.Builder.getInt32(RHSExprs.size()),
2938 ReductionArrayTySize,
2939 RL,
2940 ShuffleAndReduceFn,
2941 InterWarpCopyFn};
2942
2943 Res = CGF.EmitRuntimeCall(
2944 OMPBuilder.getOrCreateRuntimeFunction(
2945 CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
2946 Args);
2947 } else {
2948    assert(TeamsReduction && "expected teams reduction.");
2949 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
2950 llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
2951 int Cnt = 0;
2952 for (const Expr *DRE : Privates) {
2953 PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
2954 ++Cnt;
2955 }
2956 const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
2957 CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap,
2958 C.getLangOpts().OpenMPCUDAReductionBufNum);
2959 TeamsReductions.push_back(TeamReductionRec);
2960 if (!KernelTeamsReductionPtr) {
2961 KernelTeamsReductionPtr = new llvm::GlobalVariable(
2962 CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
2963 llvm::GlobalValue::InternalLinkage, nullptr,
2964 "_openmp_teams_reductions_buffer_$_$ptr");
2965 }
2966 llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
2967 Address(KernelTeamsReductionPtr, CGF.VoidPtrTy, CGM.getPointerAlign()),
2968 /*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
2969 llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
2970 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
2971 llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
2972 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
2973 ReductionFn);
2974 llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
2975 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
2976 llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
2977 CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
2978 ReductionFn);
2979
2980 llvm::Value *Args[] = {
2981 RTLoc,
2982 ThreadId,
2983 GlobalBufferPtr,
2984 CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
2985 RL,
2986 ShuffleAndReduceFn,
2987 InterWarpCopyFn,
2988 GlobalToBufferCpyFn,
2989 GlobalToBufferRedFn,
2990 BufferToGlobalCpyFn,
2991 BufferToGlobalRedFn};
2992
2993 Res = CGF.EmitRuntimeCall(
2994 OMPBuilder.getOrCreateRuntimeFunction(
2995 CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
2996 Args);
2997 }
2998
2999 // 5. Build if (res == 1)
3000 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
3001 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
3002 llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
3003 Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
3004 CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3005
3006 // 6. Build then branch: where we have reduced values in the master
3007 // thread in each team.
3008 // __kmpc_end_reduce{_nowait}(<gtid>);
3009 // break;
3010 CGF.EmitBlock(ThenBB);
3011
3012 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3013 auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
3014 this](CodeGenFunction &CGF, PrePostActionTy &Action) {
3015 auto IPriv = Privates.begin();
3016 auto ILHS = LHSExprs.begin();
3017 auto IRHS = RHSExprs.begin();
3018 for (const Expr *E : ReductionOps) {
3019 emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
3020 cast<DeclRefExpr>(*IRHS));
3021 ++IPriv;
3022 ++ILHS;
3023 ++IRHS;
3024 }
3025 };
3026 llvm::Value *EndArgs[] = {ThreadId};
3027 RegionCodeGenTy RCG(CodeGen);
3028 NVPTXActionTy Action(
3029 nullptr, std::nullopt,
3030 OMPBuilder.getOrCreateRuntimeFunction(
3031 CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
3032 EndArgs);
3033 RCG.setAction(Action);
3034 RCG(CGF);
3035 // There is no need to emit line number for unconditional branch.
3036 (void)ApplyDebugLocation::CreateEmpty(CGF);
3037 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
3038}
3039
3040const VarDecl *
3041CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
3042 const VarDecl *NativeParam) const {
3043 if (!NativeParam->getType()->isReferenceType())
3044 return NativeParam;
3045 QualType ArgType = NativeParam->getType();
3046 QualifierCollector QC;
3047 const Type *NonQualTy = QC.strip(ArgType);
3048 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3049 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
3050 if (Attr->getCaptureKind() == OMPC_map) {
3051 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3052 LangAS::opencl_global);
3053 }
3054 }
3055 ArgType = CGM.getContext().getPointerType(PointeeTy);
3056 QC.addRestrict();
3057 enum { NVPTX_local_addr = 5 };
3058 QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
3059 ArgType = QC.apply(CGM.getContext(), ArgType);
3060 if (isa<ImplicitParamDecl>(NativeParam))
3061 return ImplicitParamDecl::Create(
3062 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3063 NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
3064 return ParmVarDecl::Create(
3065 CGM.getContext(),
3066 const_cast<DeclContext *>(NativeParam->getDeclContext()),
3067 NativeParam->getBeginLoc(), NativeParam->getLocation(),
3068 NativeParam->getIdentifier(), ArgType,
3069 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3070}
3071
3072Address
3073CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
3074 const VarDecl *NativeParam,
3075 const VarDecl *TargetParam) const {
3076  assert(NativeParam != TargetParam &&
3077         NativeParam->getType()->isReferenceType() &&
3078         "Native arg must not be the same as target arg.");
3079 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
3080 QualType NativeParamType = NativeParam->getType();
3081 QualifierCollector QC;
3082 const Type *NonQualTy = QC.strip(NativeParamType);
3083 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3084 unsigned NativePointeeAddrSpace =
3085 CGF.getTypes().getTargetAddressSpace(NativePointeeTy);
3086 QualType TargetTy = TargetParam->getType();
3087 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
3088 LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
3089 // First cast to generic.
3090 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3091 TargetAddr, llvm::PointerType::getWithSamePointeeType(
3092 cast<llvm::PointerType>(TargetAddr->getType()), /*AddrSpace=*/0));
3093 // Cast from generic to native address space.
3094 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3095 TargetAddr, llvm::PointerType::getWithSamePointeeType(
3096 cast<llvm::PointerType>(TargetAddr->getType()),
3097 NativePointeeAddrSpace));
3098 Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3099 CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
3100 NativeParamType);
3101 return NativeParamAddr;
3102}
3103
3104void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3105 CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3106 ArrayRef<llvm::Value *> Args) const {
3107 SmallVector<llvm::Value *, 4> TargetArgs;
3108 TargetArgs.reserve(Args.size());
3109 auto *FnType = OutlinedFn.getFunctionType();
3110 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3111 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3112 TargetArgs.append(std::next(Args.begin(), I), Args.end());
3113 break;
3114 }
3115 llvm::Type *TargetType = FnType->getParamType(I);
3116 llvm::Value *NativeArg = Args[I];
3117 if (!TargetType->isPointerTy()) {
3118 TargetArgs.emplace_back(NativeArg);
3119 continue;
3120 }
3121 llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3122 NativeArg, llvm::PointerType::getWithSamePointeeType(
3123 cast<llvm::PointerType>(NativeArg->getType()), /*AddrSpace*/ 0));
3124 TargetArgs.emplace_back(
3125 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
3126 }
3127 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
3128}
3129
3130/// Emit a function which wraps the outlined parallel region
3131/// and controls the arguments which are passed to this function.
3132/// The wrapper ensures that the outlined function is called
3133/// with the correct arguments when data is shared.
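Before the emission code below, it may help to see roughly what the generated wrapper looks like at the source level. This is a hand-written approximation assuming two shared captures; the wrapper and outlined-function names and the capture count are invented for illustration, while __kmpc_get_shared_variables is the runtime entry actually called below.

  // Illustrative sketch only -- the real wrapper is emitted below from the
  // captured record; everything except __kmpc_get_shared_variables is an
  // assumed stand-in.
  extern "C" void __kmpc_get_shared_variables(void ***GlobalArgs);
  extern void outlined_parallel_fn(unsigned *GlobalTid, int *BoundTid,
                                   void *Cap0, void *Cap1);

  void outlined_parallel_fn_wrapper(unsigned short /*ParallelLevel*/,
                                    unsigned ThreadId) {
    int Zero = 0;
    void **SharedArgs = nullptr;
    __kmpc_get_shared_variables(&SharedArgs);   // list published by the caller
    // Forward the thread id, the zeroed bound tid, and each shared capture.
    outlined_parallel_fn(&ThreadId, &Zero, SharedArgs[0], SharedArgs[1]);
  }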
3134llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3135 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
3136 ASTContext &Ctx = CGM.getContext();
3137 const auto &CS = *D.getCapturedStmt(OMPD_parallel);
3138
3139 // Create a function that takes as argument the source thread.
3140 FunctionArgList WrapperArgs;
3141 QualType Int16QTy =
3142 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
3143 QualType Int32QTy =
3144 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
3145 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3146 /*Id=*/nullptr, Int16QTy,
3147 ImplicitParamDecl::Other);
3148 ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3149 /*Id=*/nullptr, Int32QTy,
3150 ImplicitParamDecl::Other);
3151 WrapperArgs.emplace_back(&ParallelLevelArg);
3152 WrapperArgs.emplace_back(&WrapperArg);
3153
3154 const CGFunctionInfo &CGFI =
3155 CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
3156
3157 auto *Fn = llvm::Function::Create(
3158 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3159 Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
3160
3161 // Ensure we do not inline the function. This is trivially true for the ones
3162  // passed to __kmpc_fork_call but the ones called in serialized regions
3163  // could be inlined. This is not perfect, but it is closer to the invariant
3164 // we want, namely, every data environment starts with a new function.
3165 // TODO: We should pass the if condition to the runtime function and do the
3166 // handling there. Much cleaner code.
3167 Fn->addFnAttr(llvm::Attribute::NoInline);
3168
3169 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3170 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
3171 Fn->setDoesNotRecurse();
3172
3173 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
3174 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
3175 D.getBeginLoc(), D.getBeginLoc());
3176
3177 const auto *RD = CS.getCapturedRecordDecl();
3178 auto CurField = RD->field_begin();
3179
3180 Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
3181 /*Name=*/".zero.addr");
3182 CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
3183 // Get the array of arguments.
3184 SmallVector<llvm::Value *, 8> Args;
3185
3186 Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
3187 Args.emplace_back(ZeroAddr.getPointer());
3188
3189 CGBuilderTy &Bld = CGF.Builder;
3190 auto CI = CS.capture_begin();
3191
3192 // Use global memory for data sharing.
3193 // Handle passing of global args to workers.
3194 Address GlobalArgs =
3195 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
3196 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
3197 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3198 CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3199 CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
3200 DataSharingArgs);
3201
3202 // Retrieve the shared variables from the list of references returned
3203 // by the runtime. Pass the variables to the outlined function.
3204 Address SharedArgListAddress = Address::invalid();
3205 if (CS.capture_size() > 0 ||
3206 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3207 SharedArgListAddress = CGF.EmitLoadOfPointer(
3208 GlobalArgs, CGF.getContext()
3209 .getPointerType(CGF.getContext().VoidPtrTy)
3210 .castAs<PointerType>());
3211 }
3212 unsigned Idx = 0;
3213 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3214 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3215 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3216 Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3217 llvm::Value *LB = CGF.EmitLoadOfScalar(
3218 TypedAddress,
3219 /*Volatile=*/false,
3220 CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
3221 cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
3222 Args.emplace_back(LB);
3223 ++Idx;
3224 Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3225 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3226 Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3227 llvm::Value *UB = CGF.EmitLoadOfScalar(
3228 TypedAddress,
3229 /*Volatile=*/false,
3230 CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
3231 cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
3232 Args.emplace_back(UB);
3233 ++Idx;
3234 }
3235 if (CS.capture_size() > 0) {
3236 ASTContext &CGFContext = CGF.getContext();
3237 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3238 QualType ElemTy = CurField->getType();
3239 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
3240 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3241 Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
3242 CGF.ConvertTypeForMem(ElemTy));
3243 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3244 /*Volatile=*/false,
3245 CGFContext.getPointerType(ElemTy),
3246 CI->getLocation());
3247 if (CI->capturesVariableByCopy() &&
3248 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
3249 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3250 CI->getLocation());
3251 }
3252 Args.emplace_back(Arg);
3253 }
3254 }
3255
3256 emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
3257 CGF.FinishFunction();
3258 return Fn;
3259}
3260
3261void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
3262 const Decl *D) {
3263 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3264 return;
3265
3266  assert(D && "Expected function or captured|block decl.");
3267  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
3268         "Function is registered already.");
3269  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
3270         "Team is set but not processed.");
3271 const Stmt *Body = nullptr;
3272 bool NeedToDelayGlobalization = false;
3273 if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
3274 Body = FD->getBody();
3275 } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
3276 Body = BD->getBody();
3277 } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
3278 Body = CD->getBody();
3279 NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
3280 if (NeedToDelayGlobalization &&
3281 getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
3282 return;
3283 }
3284 if (!Body)
3285 return;
3286 CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
3287 VarChecker.Visit(Body);
3288 const RecordDecl *GlobalizedVarsRecord =
3289 VarChecker.getGlobalizedRecord(IsInTTDRegion);
3290 TeamAndReductions.first = nullptr;
3291 TeamAndReductions.second.clear();
3292 ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
3293 VarChecker.getEscapedVariableLengthDecls();
3294 if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
3295 return;
3296 auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
3297 I->getSecond().MappedParams =
3298 std::make_unique<CodeGenFunction::OMPMapVars>();
3299 I->getSecond().EscapedParameters.insert(
3300 VarChecker.getEscapedParameters().begin(),
3301 VarChecker.getEscapedParameters().end());
3302 I->getSecond().EscapedVariableLengthDecls.append(
3303 EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
3304 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3305 for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3306    assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3307 Data.insert(std::make_pair(VD, MappedVarData()));
3308 }
3309 if (!NeedToDelayGlobalization) {
3310 emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
3311 struct GlobalizationScope final : EHScopeStack::Cleanup {
3312 GlobalizationScope() = default;
3313
3314 void Emit(CodeGenFunction &CGF, Flags flags) override {
3315 static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
3316 .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
3317 }
3318 };
3319 CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
3320 }
3321}
3322
3323Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
3324 const VarDecl *VD) {
3325 if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
3326 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3327 auto AS = LangAS::Default;
3328 switch (A->getAllocatorType()) {
3329 // Use the default allocator here as by default local vars are
3330 // threadlocal.
3331 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3332 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3333 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3334 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3335 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3336 // Follow the user decision - use default allocation.
3337 return Address::invalid();
3338 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3339    // TODO: implement support for user-defined allocators.
3340 return Address::invalid();
3341 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3342 AS = LangAS::cuda_constant;
3343 break;
3344 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3345 AS = LangAS::cuda_shared;
3346 break;
3347 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3348 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3349 break;
3350 }
3351 llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
3352 auto *GV = new llvm::GlobalVariable(
3353 CGM.getModule(), VarTy, /*isConstant=*/false,
3354 llvm::GlobalValue::InternalLinkage, llvm::Constant::getNullValue(VarTy),
3355 VD->getName(),
3356 /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
3357 CGM.getContext().getTargetAddressSpace(AS));
3358 CharUnits Align = CGM.getContext().getDeclAlign(VD);
3359 GV->setAlignment(Align.getAsAlign());
3360 return Address(
3361 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3362 GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
3363 VD->getType().getAddressSpace()))),
3364 VarTy, Align);
3365 }
3366
3367 if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3368 return Address::invalid();
3369
3370 VD = VD->getCanonicalDecl();
3371 auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
3372 if (I == FunctionGlobalizedDecls.end())
3373 return Address::invalid();
3374 auto VDI = I->getSecond().LocalVarData.find(VD);
3375 if (VDI != I->getSecond().LocalVarData.end())
3376 return VDI->second.PrivateAddr;
3377 if (VD->hasAttrs()) {
3378 for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
3379 E(VD->attr_end());
3380 IT != E; ++IT) {
3381 auto VDI = I->getSecond().LocalVarData.find(
3382 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3383 ->getCanonicalDecl());
3384 if (VDI != I->getSecond().LocalVarData.end())
3385 return VDI->second.PrivateAddr;
3386 }
3387 }
3388
3389 return Address::invalid();
3390}
3391
3392void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
3393 FunctionGlobalizedDecls.erase(CGF.CurFn);
3394 CGOpenMPRuntime::functionFinished(CGF);
3395}
3396
3397void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
3398 CodeGenFunction &CGF, const OMPLoopDirective &S,
3399 OpenMPDistScheduleClauseKind &ScheduleKind,
3400 llvm::Value *&Chunk) const {
3401 auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
3402 if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
3403 ScheduleKind = OMPC_DIST_SCHEDULE_static;
3404 Chunk = CGF.EmitScalarConversion(
3405 RT.getGPUNumThreads(CGF),
3406 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3407 S.getIterationVariable()->getType(), S.getBeginLoc());
3408 return;
3409 }
3410 CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
3411 CGF, S, ScheduleKind, Chunk);
3412}
3413
3414void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
3415 CodeGenFunction &CGF, const OMPLoopDirective &S,
3416 OpenMPScheduleClauseKind &ScheduleKind,
3417 const Expr *&ChunkExpr) const {
3418 ScheduleKind = OMPC_SCHEDULE_static;
3419 // Chunk size is 1 in this case.
3420 llvm::APInt ChunkSize(32, 1);
3421 ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
3422 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3423 SourceLocation());
3424}
3425
3426void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
3427 CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
3428  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
3429         " Expected target-based directive.");
3430 const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
3431 for (const CapturedStmt::Capture &C : CS->captures()) {
3432 // Capture variables captured by reference in lambdas for target-based
3433 // directives.
3434 if (!C.capturesVariable())
3435 continue;
3436 const VarDecl *VD = C.getCapturedVar();
3437 const auto *RD = VD->getType()
3438 .getCanonicalType()
3439 .getNonReferenceType()
3440 ->getAsCXXRecordDecl();
3441 if (!RD || !RD->isLambda())
3442 continue;
3443 Address VDAddr = CGF.GetAddrOfLocalVar(VD);
3444 LValue VDLVal;
3445 if (VD->getType().getCanonicalType()->isReferenceType())
3446 VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
3447 else
3448 VDLVal = CGF.MakeAddrLValue(
3449 VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
3450 llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
3451 FieldDecl *ThisCapture = nullptr;
3452 RD->getCaptureFields(Captures, ThisCapture);
3453 if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
3454 LValue ThisLVal =
3455 CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
3456 llvm::Value *CXXThis = CGF.LoadCXXThis();
3457 CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
3458 }
3459 for (const LambdaCapture &LC : RD->captures()) {
3460 if (LC.getCaptureKind() != LCK_ByRef)
3461 continue;
3462 const ValueDecl *VD = LC.getCapturedVar();
3463 // FIXME: For now VD is always a VarDecl because OpenMP does not support
3464 // capturing structured bindings in lambdas yet.
3465 if (!CS->capturesVariable(cast<VarDecl>(VD)))
3466 continue;
3467 auto It = Captures.find(VD);
3468      assert(It != Captures.end() && "Found lambda capture without field.");
3469 LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
3470 Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
3471 if (VD->getType().getCanonicalType()->isReferenceType())
3472 VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
3473 VD->getType().getCanonicalType())
3474 .getAddress(CGF);
3475 CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
3476 }
3477 }
3478}
3479
3480bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
3481 LangAS &AS) {
3482 if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
3483 return false;
3484 const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3485 switch(A->getAllocatorType()) {
3486 case OMPAllocateDeclAttr::OMPNullMemAlloc:
3487 case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3488 // Not supported, fallback to the default mem space.
3489 case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3490 case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3491 case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3492 case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3493 case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3494 AS = LangAS::Default;
3495 return true;
3496 case OMPAllocateDeclAttr::OMPConstMemAlloc:
3497 AS = LangAS::cuda_constant;
3498 return true;
3499 case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3500 AS = LangAS::cuda_shared;
3501 return true;
3502 case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3503    llvm_unreachable("Expected predefined allocator for the variables with the "
3504                     "static storage.");
3505 }
3506 return false;
3507}
3508
3509// Get current CudaArch and ignore any unknown values
3510static CudaArch getCudaArch(CodeGenModule &CGM) {
3511 if (!CGM.getTarget().hasFeature("ptx"))
3512 return CudaArch::UNKNOWN;
3513 for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
3514 if (Feature.getValue()) {
3515 CudaArch Arch = StringToCudaArch(Feature.getKey());
3516 if (Arch != CudaArch::UNKNOWN)
3517 return Arch;
3518 }
3519 }
3520 return CudaArch::UNKNOWN;
3521}
3522
3523/// Check to see if the target architecture supports unified addressing, which
3524/// is a restriction for the OpenMP requires clause "unified_shared_memory".
3525void CGOpenMPRuntimeGPU::processRequiresDirective(
3526 const OMPRequiresDecl *D) {
3527 for (const OMPClause *Clause : D->clauselists()) {
3528 if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
3529 CudaArch Arch = getCudaArch(CGM);
3530 switch (Arch) {
3531 case CudaArch::SM_20:
3532 case CudaArch::SM_21:
3533 case CudaArch::SM_30:
3534 case CudaArch::SM_32:
3535 case CudaArch::SM_35:
3536 case CudaArch::SM_37:
3537 case CudaArch::SM_50:
3538 case CudaArch::SM_52:
3539 case CudaArch::SM_53: {
3540 SmallString<256> Buffer;
3541 llvm::raw_svector_ostream Out(Buffer);
3542 Out << "Target architecture " << CudaArchToString(Arch)
3543 << " does not support unified addressing";
3544 CGM.Error(Clause->getBeginLoc(), Out.str());
3545 return;
3546 }
3547 case CudaArch::SM_60:
3548 case CudaArch::SM_61:
3549 case CudaArch::SM_62:
3550 case CudaArch::SM_70:
3551 case CudaArch::SM_72:
3552 case CudaArch::SM_75:
3553 case CudaArch::SM_80:
3554 case CudaArch::SM_86:
3555 case CudaArch::SM_87:
3556 case CudaArch::SM_89:
3557 case CudaArch::SM_90:
3558 case CudaArch::GFX600:
3559 case CudaArch::GFX601:
3560 case CudaArch::GFX602:
3561 case CudaArch::GFX700:
3562 case CudaArch::GFX701:
3563 case CudaArch::GFX702:
3564 case CudaArch::GFX703:
3565 case CudaArch::GFX704:
3566 case CudaArch::GFX705:
3567 case CudaArch::GFX801:
3568 case CudaArch::GFX802:
3569 case CudaArch::GFX803:
3570 case CudaArch::GFX805:
3571 case CudaArch::GFX810:
3572 case CudaArch::GFX900:
3573 case CudaArch::GFX902:
3574 case CudaArch::GFX904:
3575 case CudaArch::GFX906:
3576 case CudaArch::GFX908:
3577 case CudaArch::GFX909:
3578 case CudaArch::GFX90a:
3579 case CudaArch::GFX90c:
3580 case CudaArch::GFX940:
3581 case CudaArch::GFX1010:
3582 case CudaArch::GFX1011:
3583 case CudaArch::GFX1012:
3584 case CudaArch::GFX1013:
3585 case CudaArch::GFX1030:
3586 case CudaArch::GFX1031:
3587 case CudaArch::GFX1032:
3588 case CudaArch::GFX1033:
3589 case CudaArch::GFX1034:
3590 case CudaArch::GFX1035:
3591 case CudaArch::GFX1036:
3592 case CudaArch::GFX1100:
3593 case CudaArch::GFX1101:
3594 case CudaArch::GFX1102:
3595 case CudaArch::GFX1103:
3596 case CudaArch::Generic:
3597 case CudaArch::UNUSED:
3598 case CudaArch::UNKNOWN:
3599 break;
3600 case CudaArch::LAST:
3601        llvm_unreachable("Unexpected Cuda arch.");
3602 }
3603 }
3604 }
3605 CGOpenMPRuntime::processRequiresDirective(D);
3606}
3607
3608void CGOpenMPRuntimeGPU::clear() {
3609
3610 if (!TeamsReductions.empty()) {
3611 ASTContext &C = CGM.getContext();
3612 RecordDecl *StaticRD = C.buildImplicitRecord(
3613 "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
3614 StaticRD->startDefinition();
3615 for (const RecordDecl *TeamReductionRec : TeamsReductions) {
3616 QualType RecTy = C.getRecordType(TeamReductionRec);
3617 auto *Field = FieldDecl::Create(
3618 C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
3619 C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
3620 /*BW=*/nullptr, /*Mutable=*/false,
3621 /*InitStyle=*/ICIS_NoInit);
3622 Field->setAccess(AS_public);
3623 StaticRD->addDecl(Field);
3624 }
3625 StaticRD->completeDefinition();
3626 QualType StaticTy = C.getRecordType(StaticRD);
3627 llvm::Type *LLVMReductionsBufferTy =
3628 CGM.getTypes().ConvertTypeForMem(StaticTy);
3629    // FIXME: nvlink does not handle weak linkage correctly (objects with
3630    // different sizes are reported as erroneous).
3631 // Restore CommonLinkage as soon as nvlink is fixed.
3632 auto *GV = new llvm::GlobalVariable(
3633 CGM.getModule(), LLVMReductionsBufferTy,
3634 /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
3635 llvm::Constant::getNullValue(LLVMReductionsBufferTy),
3636 "_openmp_teams_reductions_buffer_$_");
3637 KernelTeamsReductionPtr->setInitializer(
3638 llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
3639 CGM.VoidPtrTy));
3640 }
3641 CGOpenMPRuntime::clear();
3642}
3643
3644llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
3645 CGBuilderTy &Bld = CGF.Builder;
3646 llvm::Module *M = &CGF.CGM.getModule();
3647 const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
3648 llvm::Function *F = M->getFunction(LocSize);
3649 if (!F) {
3650 F = llvm::Function::Create(
3651 llvm::FunctionType::get(CGF.Int32Ty, std::nullopt, false),
3652 llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
3653 }
3654 return Bld.CreateCall(F, std::nullopt, "nvptx_num_threads");
3655}
3656
3657llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
3658 ArrayRef<llvm::Value *> Args{};
3659 return CGF.EmitRuntimeCall(
3660 OMPBuilder.getOrCreateRuntimeFunction(
3661 CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
3662 Args);
3663}
3664
3665llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
3666 ArrayRef<llvm::Value *> Args{};
3667 return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3668 CGM.getModule(), OMPRTL___kmpc_get_warp_size),
3669 Args);
3670}