/build/source/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Bug Summary

File:	build/source/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
Warning:	line 1098, column 5 Value stored to 'Size' is never read

Annotated Source Code

Press '?' to see keyboard shortcuts

Show analyzer invocation

clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name CGOpenMPRuntimeGPU.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -relaxed-aliasing -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm -resource-dir /usr/lib/llvm-17/lib/clang/17 -I tools/clang/lib/CodeGen -I /build/source/clang/lib/CodeGen -I /build/source/clang/include -I tools/clang/include -I include -I /build/source/llvm/include -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm=build-llvm -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm=build-llvm -fcoverage-prefix-map=/build/source/= -O3 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -fdebug-prefix-map=/build/source/build-llvm=build-llvm -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-05-10-133810-16478-1 -x c++ /build/source/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

1	//===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This provides a generalized class for OpenMP runtime code generation
10	// specialized by GPU targets NVPTX and AMDGCN.
11	//
12	//===----------------------------------------------------------------------===//
13
14	#include "CGOpenMPRuntimeGPU.h"
15	#include "CodeGenFunction.h"
16	#include "clang/AST/Attr.h"
17	#include "clang/AST/DeclOpenMP.h"
18	#include "clang/AST/OpenMPClause.h"
19	#include "clang/AST/StmtOpenMP.h"
20	#include "clang/AST/StmtVisitor.h"
21	#include "clang/Basic/Cuda.h"
22	#include "llvm/ADT/SmallPtrSet.h"
23	#include "llvm/Frontend/OpenMP/OMPGridValues.h"
24	#include "llvm/Support/MathExtras.h"
25
26	using namespace clang;
27	using namespace CodeGen;
28	using namespace llvm::omp;
29
30	namespace {
31	/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
32	class NVPTXActionTy final : public PrePostActionTy {
33	llvm::FunctionCallee EnterCallee = nullptr;
34	ArrayRef<llvm::Value *> EnterArgs;
35	llvm::FunctionCallee ExitCallee = nullptr;
36	ArrayRef<llvm::Value *> ExitArgs;
37	bool Conditional = false;
38	llvm::BasicBlock *ContBlock = nullptr;
39
40	public:
41	NVPTXActionTy(llvm::FunctionCallee EnterCallee,
42	ArrayRef<llvm::Value *> EnterArgs,
43	llvm::FunctionCallee ExitCallee,
44	ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
45	: EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
46	ExitArgs(ExitArgs), Conditional(Conditional) {}
47	void Enter(CodeGenFunction &CGF) override {
48	llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
49	if (Conditional) {
50	llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
51	auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
52	ContBlock = CGF.createBasicBlock("omp_if.end");
53	// Generate the branch (If-stmt)
54	CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
55	CGF.EmitBlock(ThenBlock);
56	}
57	}
58	void Done(CodeGenFunction &CGF) {
59	// Emit the rest of blocks/branches
60	CGF.EmitBranch(ContBlock);
61	CGF.EmitBlock(ContBlock, true);
62	}
63	void Exit(CodeGenFunction &CGF) override {
64	CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
65	}
66	};
67
68	/// A class to track the execution mode when codegening directives within
69	/// a target region. The appropriate mode (SPMD\|NON-SPMD) is set on entry
70	/// to the target region and used by containing directives such as 'parallel'
71	/// to emit optimized code.
72	class ExecutionRuntimeModesRAII {
73	private:
74	CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
75	CGOpenMPRuntimeGPU::EM_Unknown;
76	CGOpenMPRuntimeGPU::ExecutionMode &ExecMode;
77
78	public:
79	ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
80	CGOpenMPRuntimeGPU::ExecutionMode EntryMode)
81	: ExecMode(ExecMode) {
82	SavedExecMode = ExecMode;
83	ExecMode = EntryMode;
84	}
85	~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
86	};
87
88	/// GPU Configuration: This information can be derived from cuda registers,
89	/// however, providing compile time constants helps generate more efficient
90	/// code. For all practical purposes this is fine because the configuration
91	/// is the same for all known NVPTX architectures.
92	enum MachineConfiguration : unsigned {
93	/// See "llvm/Frontend/OpenMP/OMPGridValues.h" for various related target
94	/// specific Grid Values like GV_Warp_Size, GV_Slot_Size
95
96	/// Global memory alignment for performance.
97	GlobalMemoryAlignment = 128,
98	};
99
100	static const ValueDecl getPrivateItem(const Expr RefExpr) {
101	RefExpr = RefExpr->IgnoreParens();
102	if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
103	const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
104	while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
105	Base = TempASE->getBase()->IgnoreParenImpCasts();
106	RefExpr = Base;
107	} else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(RefExpr)) {
108	const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
109	while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
110	Base = TempOASE->getBase()->IgnoreParenImpCasts();
111	while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
112	Base = TempASE->getBase()->IgnoreParenImpCasts();
113	RefExpr = Base;
114	}
115	RefExpr = RefExpr->IgnoreParenImpCasts();
116	if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
117	return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
118	const auto *ME = cast<MemberExpr>(RefExpr);
119	return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
120	}
121
122
123	static RecordDecl *buildRecordForGlobalizedVars(
124	ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
125	ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
126	llvm::SmallDenseMap<const ValueDecl , const FieldDecl >
127	&MappedDeclsFields, int BufSize) {
128	using VarsDataTy = std::pair<CharUnits /Align/, const ValueDecl *>;
129	if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
130	return nullptr;
131	SmallVector<VarsDataTy, 4> GlobalizedVars;
132	for (const ValueDecl *D : EscapedDecls)
133	GlobalizedVars.emplace_back(
134	CharUnits::fromQuantity(std::max(
135	C.getDeclAlign(D).getQuantity(),
136	static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
137	D);
138	for (const ValueDecl *D : EscapedDeclsForTeams)
139	GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
140	llvm::stable_sort(GlobalizedVars, [](VarsDataTy L, VarsDataTy R) {
141	return L.first > R.first;
142	});
143
144	// Build struct _globalized_locals_ty {
145	// /* globalized vars */[WarSize] align (max(decl_align,
146	// GlobalMemoryAlignment))
147	// /* globalized vars */ for EscapedDeclsForTeams
148	// };
149	RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
150	GlobalizedRD->startDefinition();
151	llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
152	EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
153	for (const auto &Pair : GlobalizedVars) {
154	const ValueDecl *VD = Pair.second;
155	QualType Type = VD->getType();
156	if (Type->isLValueReferenceType())
157	Type = C.getPointerType(Type.getNonReferenceType());
158	else
159	Type = Type.getNonReferenceType();
160	SourceLocation Loc = VD->getLocation();
161	FieldDecl *Field;
162	if (SingleEscaped.count(VD)) {
163	Field = FieldDecl::Create(
164	C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
165	C.getTrivialTypeSourceInfo(Type, SourceLocation()),
166	/BW=/nullptr, /Mutable=/false,
167	/InitStyle=/ICIS_NoInit);
168	Field->setAccess(AS_public);
169	if (VD->hasAttrs()) {
170	for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
171	E(VD->getAttrs().end());
172	I != E; ++I)
173	Field->addAttr(*I);
174	}
175	} else {
176	llvm::APInt ArraySize(32, BufSize);
177	Type = C.getConstantArrayType(Type, ArraySize, nullptr, ArrayType::Normal,
178	0);
179	Field = FieldDecl::Create(
180	C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
181	C.getTrivialTypeSourceInfo(Type, SourceLocation()),
182	/BW=/nullptr, /Mutable=/false,
183	/InitStyle=/ICIS_NoInit);
184	Field->setAccess(AS_public);
185	llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
186	static_cast<CharUnits::QuantityType>(
187	GlobalMemoryAlignment)));
188	Field->addAttr(AlignedAttr::CreateImplicit(
189	C, /IsAlignmentExpr=/true,
190	IntegerLiteral::Create(C, Align,
191	C.getIntTypeForBitwidth(32, /Signed=/0),
192	SourceLocation()),
193	{}, AlignedAttr::GNU_aligned));
194	}
195	GlobalizedRD->addDecl(Field);
196	MappedDeclsFields.try_emplace(VD, Field);
197	}
198	GlobalizedRD->completeDefinition();
199	return GlobalizedRD;
200	}
201
202	/// Get the list of variables that can escape their declaration context.
203	class CheckVarsEscapingDeclContext final
204	: public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
205	CodeGenFunction &CGF;
206	llvm::SetVector<const ValueDecl *> EscapedDecls;
207	llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
208	llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
209	RecordDecl *GlobalizedRD = nullptr;
210	llvm::SmallDenseMap<const ValueDecl , const FieldDecl > MappedDeclsFields;
211	bool AllEscaped = false;
212	bool IsForCombinedParallelRegion = false;
213
214	void markAsEscaped(const ValueDecl *VD) {
215	// Do not globalize declare target variables.
216	if (!isa<VarDecl>(VD) \|\|
217	OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
218	return;
219	VD = cast<ValueDecl>(VD->getCanonicalDecl());
220	// Use user-specified allocation.
221	if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
222	return;
223	// Variables captured by value must be globalized.
224	if (auto *CSI = CGF.CapturedStmtInfo) {
225	if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
226	// Check if need to capture the variable that was already captured by
227	// value in the outer region.
228	if (!IsForCombinedParallelRegion) {
229	if (!FD->hasAttrs())
230	return;
231	const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
232	if (!Attr)
233	return;
234	if (((Attr->getCaptureKind() != OMPC_map) &&
235	!isOpenMPPrivate(Attr->getCaptureKind())) \|\|
236	((Attr->getCaptureKind() == OMPC_map) &&
237	!FD->getType()->isAnyPointerType()))
238	return;
239	}
240	if (!FD->getType()->isReferenceType()) {
241	assert(!VD->getType()->isVariablyModifiedType() &&(static_cast <bool> (!VD->getType()->isVariablyModifiedType () && "Parameter captured by value with variably modified type" ) ? void (0) : __assert_fail ("!VD->getType()->isVariablyModifiedType() && \"Parameter captured by value with variably modified type\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 242, __extension__ __PRETTY_FUNCTION__))
242	"Parameter captured by value with variably modified type")(static_cast <bool> (!VD->getType()->isVariablyModifiedType () && "Parameter captured by value with variably modified type" ) ? void (0) : __assert_fail ("!VD->getType()->isVariablyModifiedType() && \"Parameter captured by value with variably modified type\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 242, __extension__ __PRETTY_FUNCTION__));
243	EscapedParameters.insert(VD);
244	} else if (!IsForCombinedParallelRegion) {
245	return;
246	}
247	}
248	}
249	if ((!CGF.CapturedStmtInfo \|\|
250	(IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
251	VD->getType()->isReferenceType())
252	// Do not globalize variables with reference type.
253	return;
254	if (VD->getType()->isVariablyModifiedType())
255	EscapedVariableLengthDecls.insert(VD);
256	else
257	EscapedDecls.insert(VD);
258	}
259
260	void VisitValueDecl(const ValueDecl *VD) {
261	if (VD->getType()->isLValueReferenceType())
262	markAsEscaped(VD);
263	if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
264	if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
265	const bool SavedAllEscaped = AllEscaped;
266	AllEscaped = VD->getType()->isLValueReferenceType();
267	Visit(VarD->getInit());
268	AllEscaped = SavedAllEscaped;
269	}
270	}
271	}
272	void VisitOpenMPCapturedStmt(const CapturedStmt *S,
273	ArrayRef<OMPClause *> Clauses,
274	bool IsCombinedParallelRegion) {
275	if (!S)
276	return;
277	for (const CapturedStmt::Capture &C : S->captures()) {
278	if (C.capturesVariable() && !C.capturesVariableByCopy()) {
279	const ValueDecl *VD = C.getCapturedVar();
280	bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
281	if (IsCombinedParallelRegion) {
282	// Check if the variable is privatized in the combined construct and
283	// those private copies must be shared in the inner parallel
284	// directive.
285	IsForCombinedParallelRegion = false;
286	for (const OMPClause *C : Clauses) {
287	if (!isOpenMPPrivate(C->getClauseKind()) \|\|
288	C->getClauseKind() == OMPC_reduction \|\|
289	C->getClauseKind() == OMPC_linear \|\|
290	C->getClauseKind() == OMPC_private)
291	continue;
292	ArrayRef<const Expr *> Vars;
293	if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
294	Vars = PC->getVarRefs();
295	else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
296	Vars = PC->getVarRefs();
297	else
298	llvm_unreachable("Unexpected clause.")::llvm::llvm_unreachable_internal("Unexpected clause.", "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp" , 298);
299	for (const auto *E : Vars) {
300	const Decl *D =
301	cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
302	if (D == VD->getCanonicalDecl()) {
303	IsForCombinedParallelRegion = true;
304	break;
305	}
306	}
307	if (IsForCombinedParallelRegion)
308	break;
309	}
310	}
311	markAsEscaped(VD);
312	if (isa<OMPCapturedExprDecl>(VD))
313	VisitValueDecl(VD);
314	IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
315	}
316	}
317	}
318
319	void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
320	assert(!GlobalizedRD &&(static_cast <bool> (!GlobalizedRD && "Record for globalized variables is built already." ) ? void (0) : __assert_fail ("!GlobalizedRD && \"Record for globalized variables is built already.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 321, __extension__ __PRETTY_FUNCTION__))
321	"Record for globalized variables is built already.")(static_cast <bool> (!GlobalizedRD && "Record for globalized variables is built already." ) ? void (0) : __assert_fail ("!GlobalizedRD && \"Record for globalized variables is built already.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 321, __extension__ __PRETTY_FUNCTION__));
322	ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
323	unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
324	if (IsInTTDRegion)
325	EscapedDeclsForTeams = EscapedDecls.getArrayRef();
326	else
327	EscapedDeclsForParallel = EscapedDecls.getArrayRef();
328	GlobalizedRD = ::buildRecordForGlobalizedVars(
329	CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
330	MappedDeclsFields, WarpSize);
331	}
332
333	public:
334	CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
335	ArrayRef<const ValueDecl *> TeamsReductions)
336	: CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
337	}
338	virtual ~CheckVarsEscapingDeclContext() = default;
339	void VisitDeclStmt(const DeclStmt *S) {
340	if (!S)
341	return;
342	for (const Decl *D : S->decls())
343	if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
344	VisitValueDecl(VD);
345	}
346	void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
347	if (!D)
348	return;
349	if (!D->hasAssociatedStmt())
350	return;
351	if (const auto *S =
352	dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
353	// Do not analyze directives that do not actually require capturing,
354	// like `omp for` or `omp simd` directives.
355	llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
356	getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
357	if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
358	VisitStmt(S->getCapturedStmt());
359	return;
360	}
361	VisitOpenMPCapturedStmt(
362	S, D->clauses(),
363	CaptureRegions.back() == OMPD_parallel &&
364	isOpenMPDistributeDirective(D->getDirectiveKind()));
365	}
366	}
367	void VisitCapturedStmt(const CapturedStmt *S) {
368	if (!S)
369	return;
370	for (const CapturedStmt::Capture &C : S->captures()) {
371	if (C.capturesVariable() && !C.capturesVariableByCopy()) {
372	const ValueDecl *VD = C.getCapturedVar();
373	markAsEscaped(VD);
374	if (isa<OMPCapturedExprDecl>(VD))
375	VisitValueDecl(VD);
376	}
377	}
378	}
379	void VisitLambdaExpr(const LambdaExpr *E) {
380	if (!E)
381	return;
382	for (const LambdaCapture &C : E->captures()) {
383	if (C.capturesVariable()) {
384	if (C.getCaptureKind() == LCK_ByRef) {
385	const ValueDecl *VD = C.getCapturedVar();
386	markAsEscaped(VD);
387	if (E->isInitCapture(&C) \|\| isa<OMPCapturedExprDecl>(VD))
388	VisitValueDecl(VD);
389	}
390	}
391	}
392	}
393	void VisitBlockExpr(const BlockExpr *E) {
394	if (!E)
395	return;
396	for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
397	if (C.isByRef()) {
398	const VarDecl *VD = C.getVariable();
399	markAsEscaped(VD);
400	if (isa<OMPCapturedExprDecl>(VD) \|\| VD->isInitCapture())
401	VisitValueDecl(VD);
402	}
403	}
404	}
405	void VisitCallExpr(const CallExpr *E) {
406	if (!E)
407	return;
408	for (const Expr *Arg : E->arguments()) {
409	if (!Arg)
410	continue;
411	if (Arg->isLValue()) {
412	const bool SavedAllEscaped = AllEscaped;
413	AllEscaped = true;
414	Visit(Arg);
415	AllEscaped = SavedAllEscaped;
416	} else {
417	Visit(Arg);
418	}
419	}
420	Visit(E->getCallee());
421	}
422	void VisitDeclRefExpr(const DeclRefExpr *E) {
423	if (!E)
424	return;
425	const ValueDecl *VD = E->getDecl();
426	if (AllEscaped)
427	markAsEscaped(VD);
428	if (isa<OMPCapturedExprDecl>(VD))
429	VisitValueDecl(VD);
430	else if (VD->isInitCapture())
431	VisitValueDecl(VD);
432	}
433	void VisitUnaryOperator(const UnaryOperator *E) {
434	if (!E)
435	return;
436	if (E->getOpcode() == UO_AddrOf) {
437	const bool SavedAllEscaped = AllEscaped;
438	AllEscaped = true;
439	Visit(E->getSubExpr());
440	AllEscaped = SavedAllEscaped;
441	} else {
442	Visit(E->getSubExpr());
443	}
444	}
445	void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
446	if (!E)
447	return;
448	if (E->getCastKind() == CK_ArrayToPointerDecay) {
449	const bool SavedAllEscaped = AllEscaped;
450	AllEscaped = true;
451	Visit(E->getSubExpr());
452	AllEscaped = SavedAllEscaped;
453	} else {
454	Visit(E->getSubExpr());
455	}
456	}
457	void VisitExpr(const Expr *E) {
458	if (!E)
459	return;
460	bool SavedAllEscaped = AllEscaped;
461	if (!E->isLValue())
462	AllEscaped = false;
463	for (const Stmt *Child : E->children())
464	if (Child)
465	Visit(Child);
466	AllEscaped = SavedAllEscaped;
467	}
468	void VisitStmt(const Stmt *S) {
469	if (!S)
470	return;
471	for (const Stmt *Child : S->children())
472	if (Child)
473	Visit(Child);
474	}
475
476	/// Returns the record that handles all the escaped local variables and used
477	/// instead of their original storage.
478	const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
479	if (!GlobalizedRD)
480	buildRecordForGlobalizedVars(IsInTTDRegion);
481	return GlobalizedRD;
482	}
483
484	/// Returns the field in the globalized record for the escaped variable.
485	const FieldDecl getFieldForGlobalizedVar(const ValueDecl VD) const {
486	assert(GlobalizedRD &&(static_cast <bool> (GlobalizedRD && "Record for globalized variables must be generated already." ) ? void (0) : __assert_fail ("GlobalizedRD && \"Record for globalized variables must be generated already.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 487, __extension__ __PRETTY_FUNCTION__))
487	"Record for globalized variables must be generated already.")(static_cast <bool> (GlobalizedRD && "Record for globalized variables must be generated already." ) ? void (0) : __assert_fail ("GlobalizedRD && \"Record for globalized variables must be generated already.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 487, __extension__ __PRETTY_FUNCTION__));
488	auto I = MappedDeclsFields.find(VD);
489	if (I == MappedDeclsFields.end())
490	return nullptr;
491	return I->getSecond();
492	}
493
494	/// Returns the list of the escaped local variables/parameters.
495	ArrayRef<const ValueDecl *> getEscapedDecls() const {
496	return EscapedDecls.getArrayRef();
497	}
498
499	/// Checks if the escaped local variable is actually a parameter passed by
500	/// value.
501	const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
502	return EscapedParameters;
503	}
504
505	/// Returns the list of the escaped variables with the variably modified
506	/// types.
507	ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
508	return EscapedVariableLengthDecls.getArrayRef();
509	}
510	};
511	} // anonymous namespace
512
513	/// Get the id of the warp in the block.
514	/// We assume that the warp size is 32, which is always the case
515	/// on the NVPTX device, to generate more efficient code.
516	static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
517	CGBuilderTy &Bld = CGF.Builder;
518	unsigned LaneIDBits =
519	llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
520	auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
521	return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
522	}
523
524	/// Get the id of the current lane in the Warp.
525	/// We assume that the warp size is 32, which is always the case
526	/// on the NVPTX device, to generate more efficient code.
527	static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
528	CGBuilderTy &Bld = CGF.Builder;
529	unsigned LaneIDBits =
530	llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
531	unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
532	auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
533	return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
534	"nvptx_lane_id");
535	}
536
537	CGOpenMPRuntimeGPU::ExecutionMode
538	CGOpenMPRuntimeGPU::getExecutionMode() const {
539	return CurrentExecutionMode;
540	}
541
542	static CGOpenMPRuntimeGPU::DataSharingMode
543	getDataSharingMode(CodeGenModule &CGM) {
544	return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeGPU::CUDA
545	: CGOpenMPRuntimeGPU::Generic;
546	}
547
548	/// Check for inner (nested) SPMD construct, if any
549	static bool hasNestedSPMDDirective(ASTContext &Ctx,
550	const OMPExecutableDirective &D) {
551	const auto *CS = D.getInnermostCapturedStmt();
552	const auto *Body =
553	CS->getCapturedStmt()->IgnoreContainers(/IgnoreCaptured=/true);
554	const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
555
556	if (const auto *NestedDir =
557	dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
558	OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
559	switch (D.getDirectiveKind()) {
560	case OMPD_target:
561	if (isOpenMPParallelDirective(DKind))
562	return true;
563	if (DKind == OMPD_teams) {
564	Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
565	/IgnoreCaptured=/true);
566	if (!Body)
567	return false;
568	ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
569	if (const auto *NND =
570	dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
571	DKind = NND->getDirectiveKind();
572	if (isOpenMPParallelDirective(DKind))
573	return true;
574	}
575	}
576	return false;
577	case OMPD_target_teams:
578	return isOpenMPParallelDirective(DKind);
579	case OMPD_target_simd:
580	case OMPD_target_parallel:
581	case OMPD_target_parallel_for:
582	case OMPD_target_parallel_for_simd:
583	case OMPD_target_teams_distribute:
584	case OMPD_target_teams_distribute_simd:
585	case OMPD_target_teams_distribute_parallel_for:
586	case OMPD_target_teams_distribute_parallel_for_simd:
587	case OMPD_parallel:
588	case OMPD_for:
589	case OMPD_parallel_for:
590	case OMPD_parallel_master:
591	case OMPD_parallel_sections:
592	case OMPD_for_simd:
593	case OMPD_parallel_for_simd:
594	case OMPD_cancel:
595	case OMPD_cancellation_point:
596	case OMPD_ordered:
597	case OMPD_threadprivate:
598	case OMPD_allocate:
599	case OMPD_task:
600	case OMPD_simd:
601	case OMPD_sections:
602	case OMPD_section:
603	case OMPD_single:
604	case OMPD_master:
605	case OMPD_critical:
606	case OMPD_taskyield:
607	case OMPD_barrier:
608	case OMPD_taskwait:
609	case OMPD_taskgroup:
610	case OMPD_atomic:
611	case OMPD_flush:
612	case OMPD_depobj:
613	case OMPD_scan:
614	case OMPD_teams:
615	case OMPD_target_data:
616	case OMPD_target_exit_data:
617	case OMPD_target_enter_data:
618	case OMPD_distribute:
619	case OMPD_distribute_simd:
620	case OMPD_distribute_parallel_for:
621	case OMPD_distribute_parallel_for_simd:
622	case OMPD_teams_distribute:
623	case OMPD_teams_distribute_simd:
624	case OMPD_teams_distribute_parallel_for:
625	case OMPD_teams_distribute_parallel_for_simd:
626	case OMPD_target_update:
627	case OMPD_declare_simd:
628	case OMPD_declare_variant:
629	case OMPD_begin_declare_variant:
630	case OMPD_end_declare_variant:
631	case OMPD_declare_target:
632	case OMPD_end_declare_target:
633	case OMPD_declare_reduction:
634	case OMPD_declare_mapper:
635	case OMPD_taskloop:
636	case OMPD_taskloop_simd:
637	case OMPD_master_taskloop:
638	case OMPD_master_taskloop_simd:
639	case OMPD_parallel_master_taskloop:
640	case OMPD_parallel_master_taskloop_simd:
641	case OMPD_requires:
642	case OMPD_unknown:
643	default:
644	llvm_unreachable("Unexpected directive.")::llvm::llvm_unreachable_internal("Unexpected directive.", "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp" , 644);
645	}
646	}
647
648	return false;
649	}
650
651	static bool supportsSPMDExecutionMode(ASTContext &Ctx,
652	const OMPExecutableDirective &D) {
653	OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
654	switch (DirectiveKind) {
655	case OMPD_target:
656	case OMPD_target_teams:
657	return hasNestedSPMDDirective(Ctx, D);
658	case OMPD_target_parallel:
659	case OMPD_target_parallel_for:
660	case OMPD_target_parallel_for_simd:
661	case OMPD_target_teams_distribute_parallel_for:
662	case OMPD_target_teams_distribute_parallel_for_simd:
663	case OMPD_target_simd:
664	case OMPD_target_teams_distribute_simd:
665	return true;
666	case OMPD_target_teams_distribute:
667	return false;
668	case OMPD_parallel:
669	case OMPD_for:
670	case OMPD_parallel_for:
671	case OMPD_parallel_master:
672	case OMPD_parallel_sections:
673	case OMPD_for_simd:
674	case OMPD_parallel_for_simd:
675	case OMPD_cancel:
676	case OMPD_cancellation_point:
677	case OMPD_ordered:
678	case OMPD_threadprivate:
679	case OMPD_allocate:
680	case OMPD_task:
681	case OMPD_simd:
682	case OMPD_sections:
683	case OMPD_section:
684	case OMPD_single:
685	case OMPD_master:
686	case OMPD_critical:
687	case OMPD_taskyield:
688	case OMPD_barrier:
689	case OMPD_taskwait:
690	case OMPD_taskgroup:
691	case OMPD_atomic:
692	case OMPD_flush:
693	case OMPD_depobj:
694	case OMPD_scan:
695	case OMPD_teams:
696	case OMPD_target_data:
697	case OMPD_target_exit_data:
698	case OMPD_target_enter_data:
699	case OMPD_distribute:
700	case OMPD_distribute_simd:
701	case OMPD_distribute_parallel_for:
702	case OMPD_distribute_parallel_for_simd:
703	case OMPD_teams_distribute:
704	case OMPD_teams_distribute_simd:
705	case OMPD_teams_distribute_parallel_for:
706	case OMPD_teams_distribute_parallel_for_simd:
707	case OMPD_target_update:
708	case OMPD_declare_simd:
709	case OMPD_declare_variant:
710	case OMPD_begin_declare_variant:
711	case OMPD_end_declare_variant:
712	case OMPD_declare_target:
713	case OMPD_end_declare_target:
714	case OMPD_declare_reduction:
715	case OMPD_declare_mapper:
716	case OMPD_taskloop:
717	case OMPD_taskloop_simd:
718	case OMPD_master_taskloop:
719	case OMPD_master_taskloop_simd:
720	case OMPD_parallel_master_taskloop:
721	case OMPD_parallel_master_taskloop_simd:
722	case OMPD_requires:
723	case OMPD_unknown:
724	default:
725	break;
726	}
727	llvm_unreachable(::llvm::llvm_unreachable_internal("Unknown programming model for OpenMP directive on NVPTX target." , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 728)
728	"Unknown programming model for OpenMP directive on NVPTX target.")::llvm::llvm_unreachable_internal("Unknown programming model for OpenMP directive on NVPTX target." , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 728);
729	}
730
731	void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
732	StringRef ParentName,
733	llvm::Function *&OutlinedFn,
734	llvm::Constant *&OutlinedFnID,
735	bool IsOffloadEntry,
736	const RegionCodeGenTy &CodeGen) {
737	ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
738	EntryFunctionState EST;
739	WrapperFunctionsMap.clear();
740
741	// Emit target region as a standalone region.
742	class NVPTXPrePostActionTy : public PrePostActionTy {
743	CGOpenMPRuntimeGPU::EntryFunctionState &EST;
744
745	public:
746	NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST)
747	: EST(EST) {}
748	void Enter(CodeGenFunction &CGF) override {
749	auto &RT =
750	static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
751	RT.emitKernelInit(CGF, EST, /* IsSPMD */ false);
752	// Skip target region initialization.
753	RT.setLocThreadIdInsertPt(CGF, /AtCurrentPoint=/true);
754	}
755	void Exit(CodeGenFunction &CGF) override {
756	auto &RT =
757	static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
758	RT.clearLocThreadIdInsertPt(CGF);
759	RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
760	}
761	} Action(EST);
762	CodeGen.setAction(Action);
763	IsInTTDRegion = true;
764	emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
765	IsOffloadEntry, CodeGen);
766	IsInTTDRegion = false;
767	}
768
769	void CGOpenMPRuntimeGPU::emitKernelInit(CodeGenFunction &CGF,
770	EntryFunctionState &EST, bool IsSPMD) {
771	CGBuilderTy &Bld = CGF.Builder;
772	Bld.restoreIP(OMPBuilder.createTargetInit(Bld, IsSPMD));
773	if (!IsSPMD)
774	emitGenericVarsProlog(CGF, EST.Loc);
775	}
776
777	void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
778	EntryFunctionState &EST,
779	bool IsSPMD) {
780	if (!IsSPMD)
781	emitGenericVarsEpilog(CGF);
782
783	CGBuilderTy &Bld = CGF.Builder;
784	OMPBuilder.createTargetDeinit(Bld, IsSPMD);
785	}
786
787	void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
788	StringRef ParentName,
789	llvm::Function *&OutlinedFn,
790	llvm::Constant *&OutlinedFnID,
791	bool IsOffloadEntry,
792	const RegionCodeGenTy &CodeGen) {
793	ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
794	EntryFunctionState EST;
795
796	// Emit target region as a standalone region.
797	class NVPTXPrePostActionTy : public PrePostActionTy {
798	CGOpenMPRuntimeGPU &RT;
799	CGOpenMPRuntimeGPU::EntryFunctionState &EST;
800
801	public:
802	NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
803	CGOpenMPRuntimeGPU::EntryFunctionState &EST)
804	: RT(RT), EST(EST) {}
805	void Enter(CodeGenFunction &CGF) override {
806	RT.emitKernelInit(CGF, EST, /* IsSPMD */ true);
807	// Skip target region initialization.
808	RT.setLocThreadIdInsertPt(CGF, /AtCurrentPoint=/true);
809	}
810	void Exit(CodeGenFunction &CGF) override {
811	RT.clearLocThreadIdInsertPt(CGF);
812	RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
813	}
814	} Action(*this, EST);
815	CodeGen.setAction(Action);
816	IsInTTDRegion = true;
817	emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
818	IsOffloadEntry, CodeGen);
819	IsInTTDRegion = false;
820	}
821
822	// Create a unique global variable to indicate the execution mode of this target
823	// region. The execution mode is either 'generic', or 'spmd' depending on the
824	// target directive. This variable is picked up by the offload library to setup
825	// the device appropriately before kernel launch. If the execution mode is
826	// 'generic', the runtime reserves one warp for the master, otherwise, all
827	// warps participate in parallel work.
828	static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
829	bool Mode) {
830	auto *GVMode = new llvm::GlobalVariable(
831	CGM.getModule(), CGM.Int8Ty, /isConstant=/true,
832	llvm::GlobalValue::WeakAnyLinkage,
833	llvm::ConstantInt::get(CGM.Int8Ty, Mode ? OMP_TGT_EXEC_MODE_SPMD
834	: OMP_TGT_EXEC_MODE_GENERIC),
835	Twine(Name, "_exec_mode"));
836	GVMode->setVisibility(llvm::GlobalVariable::ProtectedVisibility);
837	CGM.addCompilerUsedGlobal(GVMode);
838	}
839
840	void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
841	const OMPExecutableDirective &D, StringRef ParentName,
842	llvm::Function &OutlinedFn, llvm::Constant &OutlinedFnID,
843	bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
844	if (!IsOffloadEntry) // Nothing to do.
845	return;
846
847	assert(!ParentName.empty() && "Invalid target region parent name!")(static_cast <bool> (!ParentName.empty() && "Invalid target region parent name!" ) ? void (0) : __assert_fail ("!ParentName.empty() && \"Invalid target region parent name!\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 847, __extension__ __PRETTY_FUNCTION__));
848
849	bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
850	if (Mode)
851	emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
852	CodeGen);
853	else
854	emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
855	CodeGen);
856
857	setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
858	}
859
860	CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
861	: CGOpenMPRuntime(CGM) {
862	llvm::OpenMPIRBuilderConfig Config(CGM.getLangOpts().OpenMPIsDevice, true,
863	hasRequiresUnifiedSharedMemory(),
864	CGM.getLangOpts().OpenMPOffloadMandatory);
865	OMPBuilder.setConfig(Config);
866
867	if (!CGM.getLangOpts().OpenMPIsDevice)
868	llvm_unreachable("OpenMP can only handle device code.")::llvm::llvm_unreachable_internal("OpenMP can only handle device code." , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 868);
869
870	llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
871	if (CGM.getLangOpts().NoGPULib \|\| CGM.getLangOpts().OMPHostIRFile.empty())
872	return;
873
874	OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
875	"__omp_rtl_debug_kind");
876	OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
877	"__omp_rtl_assume_teams_oversubscription");
878	OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
879	"__omp_rtl_assume_threads_oversubscription");
880	OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
881	"__omp_rtl_assume_no_thread_state");
882	OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
883	"__omp_rtl_assume_no_nested_parallelism");
884	}
885
886	void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF,
887	ProcBindKind ProcBind,
888	SourceLocation Loc) {
889	// Do nothing in case of SPMD mode and L0 parallel.
890	if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
891	return;
892
893	CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
894	}
895
896	void CGOpenMPRuntimeGPU::emitNumThreadsClause(CodeGenFunction &CGF,
897	llvm::Value *NumThreads,
898	SourceLocation Loc) {
899	// Nothing to do.
900	}
901
902	void CGOpenMPRuntimeGPU::emitNumTeamsClause(CodeGenFunction &CGF,
903	const Expr *NumTeams,
904	const Expr *ThreadLimit,
905	SourceLocation Loc) {}
906
907	llvm::Function *CGOpenMPRuntimeGPU::emitParallelOutlinedFunction(
908	CodeGenFunction &CGF, const OMPExecutableDirective &D,
909	const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
910	const RegionCodeGenTy &CodeGen) {
911	// Emit target region as a standalone region.
912	bool PrevIsInTTDRegion = IsInTTDRegion;
913	IsInTTDRegion = false;
914	auto *OutlinedFun =
915	cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
916	CGF, D, ThreadIDVar, InnermostKind, CodeGen));
917	IsInTTDRegion = PrevIsInTTDRegion;
918	if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
919	llvm::Function *WrapperFun =
920	createParallelDataSharingWrapper(OutlinedFun, D);
921	WrapperFunctionsMap[OutlinedFun] = WrapperFun;
922	}
923
924	return OutlinedFun;
925	}
926
927	/// Get list of lastprivate variables from the teams distribute ... or
928	/// teams {distribute ...} directives.
929	static void
930	getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
931	llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
932	assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&(static_cast <bool> (isOpenMPTeamsDirective(D.getDirectiveKind ()) && "expected teams directive.") ? void (0) : __assert_fail ("isOpenMPTeamsDirective(D.getDirectiveKind()) && \"expected teams directive.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 933, __extension__ __PRETTY_FUNCTION__))
933	"expected teams directive.")(static_cast <bool> (isOpenMPTeamsDirective(D.getDirectiveKind ()) && "expected teams directive.") ? void (0) : __assert_fail ("isOpenMPTeamsDirective(D.getDirectiveKind()) && \"expected teams directive.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 933, __extension__ __PRETTY_FUNCTION__));
934	const OMPExecutableDirective *Dir = &D;
935	if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
936	if (const Stmt *S = CGOpenMPRuntime::getSingleCompoundChild(
937	Ctx,
938	D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
939	/IgnoreCaptured=/true))) {
940	Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
941	if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
942	Dir = nullptr;
943	}
944	}
945	if (!Dir)
946	return;
947	for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
948	for (const Expr *E : C->getVarRefs())
949	Vars.push_back(getPrivateItem(E));
950	}
951	}
952
953	/// Get list of reduction variables from the teams ... directives.
954	static void
955	getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D,
956	llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
957	assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&(static_cast <bool> (isOpenMPTeamsDirective(D.getDirectiveKind ()) && "expected teams directive.") ? void (0) : __assert_fail ("isOpenMPTeamsDirective(D.getDirectiveKind()) && \"expected teams directive.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 958, __extension__ __PRETTY_FUNCTION__))
958	"expected teams directive.")(static_cast <bool> (isOpenMPTeamsDirective(D.getDirectiveKind ()) && "expected teams directive.") ? void (0) : __assert_fail ("isOpenMPTeamsDirective(D.getDirectiveKind()) && \"expected teams directive.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 958, __extension__ __PRETTY_FUNCTION__));
959	for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
960	for (const Expr *E : C->privates())
961	Vars.push_back(getPrivateItem(E));
962	}
963	}
964
965	llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
966	CodeGenFunction &CGF, const OMPExecutableDirective &D,
967	const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
968	const RegionCodeGenTy &CodeGen) {
969	SourceLocation Loc = D.getBeginLoc();
970
971	const RecordDecl *GlobalizedRD = nullptr;
972	llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
973	llvm::SmallDenseMap<const ValueDecl , const FieldDecl > MappedDeclsFields;
974	unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
975	// Globalize team reductions variable unconditionally in all modes.
976	if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
977	getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
978	if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
979	getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
980	if (!LastPrivatesReductions.empty()) {
981	GlobalizedRD = ::buildRecordForGlobalizedVars(
982	CGM.getContext(), std::nullopt, LastPrivatesReductions,
983	MappedDeclsFields, WarpSize);
984	}
985	} else if (!LastPrivatesReductions.empty()) {
986	assert(!TeamAndReductions.first &&(static_cast <bool> (!TeamAndReductions.first && "Previous team declaration is not expected.") ? void (0) : __assert_fail ("!TeamAndReductions.first && \"Previous team declaration is not expected.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 987, __extension__ __PRETTY_FUNCTION__))
987	"Previous team declaration is not expected.")(static_cast <bool> (!TeamAndReductions.first && "Previous team declaration is not expected.") ? void (0) : __assert_fail ("!TeamAndReductions.first && \"Previous team declaration is not expected.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 987, __extension__ __PRETTY_FUNCTION__));
988	TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
989	std::swap(TeamAndReductions.second, LastPrivatesReductions);
990	}
991
992	// Emit target region as a standalone region.
993	class NVPTXPrePostActionTy : public PrePostActionTy {
994	SourceLocation &Loc;
995	const RecordDecl *GlobalizedRD;
996	llvm::SmallDenseMap<const ValueDecl , const FieldDecl >
997	&MappedDeclsFields;
998
999	public:
1000	NVPTXPrePostActionTy(
1001	SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1002	llvm::SmallDenseMap<const ValueDecl , const FieldDecl >
1003	&MappedDeclsFields)
1004	: Loc(Loc), GlobalizedRD(GlobalizedRD),
1005	MappedDeclsFields(MappedDeclsFields) {}
1006	void Enter(CodeGenFunction &CGF) override {
1007	auto &Rt =
1008	static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1009	if (GlobalizedRD) {
1010	auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1011	I->getSecond().MappedParams =
1012	std::make_unique<CodeGenFunction::OMPMapVars>();
1013	DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1014	for (const auto &Pair : MappedDeclsFields) {
1015	assert(Pair.getFirst()->isCanonicalDecl() &&(static_cast <bool> (Pair.getFirst()->isCanonicalDecl () && "Expected canonical declaration") ? void (0) : __assert_fail ("Pair.getFirst()->isCanonicalDecl() && \"Expected canonical declaration\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 1016, __extension__ __PRETTY_FUNCTION__))
1016	"Expected canonical declaration")(static_cast <bool> (Pair.getFirst()->isCanonicalDecl () && "Expected canonical declaration") ? void (0) : __assert_fail ("Pair.getFirst()->isCanonicalDecl() && \"Expected canonical declaration\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 1016, __extension__ __PRETTY_FUNCTION__));
1017	Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
1018	}
1019	}
1020	Rt.emitGenericVarsProlog(CGF, Loc);
1021	}
1022	void Exit(CodeGenFunction &CGF) override {
1023	static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1024	.emitGenericVarsEpilog(CGF);
1025	}
1026	} Action(Loc, GlobalizedRD, MappedDeclsFields);
1027	CodeGen.setAction(Action);
1028	llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1029	CGF, D, ThreadIDVar, InnermostKind, CodeGen);
1030
1031	return OutlinedFun;
1032	}
1033
1034	void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1035	SourceLocation Loc,
1036	bool WithSPMDCheck) {
1037	if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
1038	getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1039	return;
1040
1041	CGBuilderTy &Bld = CGF.Builder;
1042
1043	const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1044	if (I == FunctionGlobalizedDecls.end())
1045	return;
1046
1047	for (auto &Rec : I->getSecond().LocalVarData) {
1048	const auto *VD = cast<VarDecl>(Rec.first);
1049	bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1050	QualType VarTy = VD->getType();
1051
1052	// Get the local allocation of a firstprivate variable before sharing
1053	llvm::Value *ParValue;
1054	if (EscapedParam) {
1055	LValue ParLVal =
1056	CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1057	ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1058	}
1059
1060	// Allocate space for the variable to be globalized
1061	llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1062	llvm::CallBase *VoidPtr =
1063	CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1064	CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1065	AllocArgs, VD->getName());
1066	// FIXME: We should use the variables actual alignment as an argument.
1067	VoidPtr->addRetAttr(llvm::Attribute::get(
1068	CGM.getLLVMContext(), llvm::Attribute::Alignment,
1069	CGM.getContext().getTargetInfo().getNewAlign() / 8));
1070
1071	// Cast the void pointer and get the address of the globalized variable.
1072	llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
1073	llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1074	VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
1075	LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
1076	Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
1077	Rec.second.GlobalizedVal = VoidPtr;
1078
1079	// Assign the local allocation to the newly globalized location.
1080	if (EscapedParam) {
1081	CGF.EmitStoreOfScalar(ParValue, VarAddr);
1082	I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
1083	}
1084	if (auto *DI = CGF.getDebugInfo())
1085	VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
1086	}
1087	for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
1088	// Use actual memory size of the VLA object including the padding
1089	// for alignment purposes.
1090	llvm::Value *Size = CGF.getTypeSize(VD->getType());
1091	CharUnits Align = CGM.getContext().getDeclAlign(VD);
1092	Size = Bld.CreateNUWAdd(
1093	Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1094	llvm::Value *AlignVal =
1095	llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1096
1097	Size = Bld.CreateUDiv(Size, AlignVal);
1098	Size = Bld.CreateNUWMul(Size, AlignVal);
	Value stored to 'Size' is never read
1099
1100	// Allocate space for this VLA object to be globalized.
1101	llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1102	llvm::CallBase *VoidPtr =
1103	CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1104	CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1105	AllocArgs, VD->getName());
1106	VoidPtr->addRetAttr(
1107	llvm::Attribute::get(CGM.getLLVMContext(), llvm::Attribute::Alignment,
1108	CGM.getContext().getTargetInfo().getNewAlign()));
1109
1110	I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(
1111	std::pair<llvm::Value , llvm::Value >(
1112	{VoidPtr, CGF.getTypeSize(VD->getType())}));
1113	LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(),
1114	CGM.getContext().getDeclAlign(VD),
1115	AlignmentSource::Decl);
1116	I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
1117	Base.getAddress(CGF));
1118	}
1119	I->getSecond().MappedParams->apply(CGF);
1120	}
1121
1122	void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
1123	bool WithSPMDCheck) {
1124	if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic &&
1125	getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1126	return;
1127
1128	const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1129	if (I != FunctionGlobalizedDecls.end()) {
1130	// Deallocate the memory for each globalized VLA object
1131	for (const auto &AddrSizePair :
1132	llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1133	CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1134	CGM.getModule(), OMPRTL___kmpc_free_shared),
1135	{AddrSizePair.first, AddrSizePair.second});
1136	}
1137	// Deallocate the memory for each globalized value
1138	for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
1139	const auto *VD = cast<VarDecl>(Rec.first);
1140	I->getSecond().MappedParams->restore(CGF);
1141
1142	llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1143	CGF.getTypeSize(VD->getType())};
1144	CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1145	CGM.getModule(), OMPRTL___kmpc_free_shared),
1146	FreeArgs);
1147	}
1148	}
1149	}
1150
1151	void CGOpenMPRuntimeGPU::emitTeamsCall(CodeGenFunction &CGF,
1152	const OMPExecutableDirective &D,
1153	SourceLocation Loc,
1154	llvm::Function *OutlinedFn,
1155	ArrayRef<llvm::Value *> CapturedVars) {
1156	if (!CGF.HaveInsertPoint())
1157	return;
1158
1159	Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1160	/Name=/".zero.addr");
1161	CGF.Builder.CreateStore(CGF.Builder.getInt32(/C/ 0), ZeroAddr);
1162	llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1163	OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
1164	OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1165	OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1166	emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1167	}
1168
1169	void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
1170	SourceLocation Loc,
1171	llvm::Function *OutlinedFn,
1172	ArrayRef<llvm::Value *> CapturedVars,
1173	const Expr *IfCond,
1174	llvm::Value *NumThreads) {
1175	if (!CGF.HaveInsertPoint())
1176	return;
1177
1178	auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1179	NumThreads](CodeGenFunction &CGF,
1180	PrePostActionTy &Action) {
1181	CGBuilderTy &Bld = CGF.Builder;
1182	llvm::Value *NumThreadsVal = NumThreads;
1183	llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1184	llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
1185	if (WFn)
1186	ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1187	llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
1188
1189	// Create a private scope that will globalize the arguments
1190	// passed from the outside of the target region.
1191	// TODO: Is that needed?
1192	CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1193
1194	Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1195	llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
1196	"captured_vars_addrs");
1197	// There's something to share.
1198	if (!CapturedVars.empty()) {
1199	// Prepare for parallel region. Indicate the outlined function.
1200	ASTContext &Ctx = CGF.getContext();
1201	unsigned Idx = 0;
1202	for (llvm::Value *V : CapturedVars) {
1203	Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
1204	llvm::Value *PtrV;
1205	if (V->getType()->isIntegerTy())
1206	PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1207	else
1208	PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
1209	CGF.EmitStoreOfScalar(PtrV, Dst, /Volatile=/false,
1210	Ctx.getPointerType(Ctx.VoidPtrTy));
1211	++Idx;
1212	}
1213	}
1214
1215	llvm::Value *IfCondVal = nullptr;
1216	if (IfCond)
1217	IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
1218	/* isSigned */ false);
1219	else
1220	IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1221
1222	if (!NumThreadsVal)
1223	NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
1224	else
1225	NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty),
1226
1227	assert(IfCondVal && "Expected a value")(static_cast <bool> (IfCondVal && "Expected a value" ) ? void (0) : __assert_fail ("IfCondVal && \"Expected a value\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 1227, __extension__ __PRETTY_FUNCTION__));
1228	llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1229	llvm::Value *Args[] = {
1230	RTLoc,
1231	getThreadID(CGF, Loc),
1232	IfCondVal,
1233	NumThreadsVal,
1234	llvm::ConstantInt::get(CGF.Int32Ty, -1),
1235	FnPtr,
1236	ID,
1237	Bld.CreateBitOrPointerCast(CapturedVarsAddrs.getPointer(),
1238	CGF.VoidPtrPtrTy),
1239	llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1240	CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1241	CGM.getModule(), OMPRTL___kmpc_parallel_51),
1242	Args);
1243	};
1244
1245	RegionCodeGenTy RCG(ParallelGen);
1246	RCG(CGF);
1247	}
1248
1249	void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1250	// Always emit simple barriers!
1251	if (!CGF.HaveInsertPoint())
1252	return;
1253	// Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1254	// This function does not use parameters, so we can emit just default values.
1255	llvm::Value *Args[] = {
1256	llvm::ConstantPointerNull::get(
1257	cast<llvm::PointerType>(getIdentTyPointerTy())),
1258	llvm::ConstantInt::get(CGF.Int32Ty, /V=/0, /isSigned=/true)};
1259	CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1260	CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
1261	Args);
1262	}
1263
1264	void CGOpenMPRuntimeGPU::emitBarrierCall(CodeGenFunction &CGF,
1265	SourceLocation Loc,
1266	OpenMPDirectiveKind Kind, bool,
1267	bool) {
1268	// Always emit simple barriers!
1269	if (!CGF.HaveInsertPoint())
1270	return;
1271	// Build call __kmpc_cancel_barrier(loc, thread_id);
1272	unsigned Flags = getDefaultFlagsForBarriers(Kind);
1273	llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1274	getThreadID(CGF, Loc)};
1275
1276	CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1277	CGM.getModule(), OMPRTL___kmpc_barrier),
1278	Args);
1279	}
1280
1281	void CGOpenMPRuntimeGPU::emitCriticalRegion(
1282	CodeGenFunction &CGF, StringRef CriticalName,
1283	const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1284	const Expr *Hint) {
1285	llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
1286	llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
1287	llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
1288	llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
1289	llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
1290
1291	auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1292
1293	// Get the mask of active threads in the warp.
1294	llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1295	CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
1296	// Fetch team-local id of the thread.
1297	llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1298
1299	// Get the width of the team.
1300	llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1301
1302	// Initialize the counter variable for the loop.
1303	QualType Int32Ty =
1304	CGF.getContext().getIntTypeForBitwidth(/DestWidth=/32, /Signed=/0);
1305	Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
1306	LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
1307	CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
1308	/isInit=/true);
1309
1310	// Block checks if loop counter exceeds upper bound.
1311	CGF.EmitBlock(LoopBB);
1312	llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1313	llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
1314	CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
1315
1316	// Block tests which single thread should execute region, and which threads
1317	// should go straight to synchronisation point.
1318	CGF.EmitBlock(TestBB);
1319	CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1320	llvm::Value *CmpThreadToCounter =
1321	CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
1322	CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
1323
1324	// Block emits the body of the critical region.
1325	CGF.EmitBlock(BodyBB);
1326
1327	// Output the critical statement.
1328	CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1329	Hint);
1330
1331	// After the body surrounded by the critical region, the single executing
1332	// thread will jump to the synchronisation point.
1333	// Block waits for all threads in current team to finish then increments the
1334	// counter variable and returns to the loop.
1335	CGF.EmitBlock(SyncBB);
1336	// Reconverge active threads in the warp.
1337	(void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1338	CGM.getModule(), OMPRTL___kmpc_syncwarp),
1339	Mask);
1340
1341	llvm::Value *IncCounterVal =
1342	CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
1343	CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
1344	CGF.EmitBranch(LoopBB);
1345
1346	// Block that is reached when all threads in the team complete the region.
1347	CGF.EmitBlock(ExitBB, /IsFinished=/true);
1348	}
1349
1350	/// Cast value to the specified type.
1351	static llvm::Value castValueToType(CodeGenFunction &CGF, llvm::Value Val,
1352	QualType ValTy, QualType CastTy,
1353	SourceLocation Loc) {
1354	assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&(static_cast <bool> (!CGF.getContext().getTypeSizeInChars (CastTy).isZero() && "Cast type must sized.") ? void ( 0) : __assert_fail ("!CGF.getContext().getTypeSizeInChars(CastTy).isZero() && \"Cast type must sized.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 1355, __extension__ __PRETTY_FUNCTION__))
1355	"Cast type must sized.")(static_cast <bool> (!CGF.getContext().getTypeSizeInChars (CastTy).isZero() && "Cast type must sized.") ? void ( 0) : __assert_fail ("!CGF.getContext().getTypeSizeInChars(CastTy).isZero() && \"Cast type must sized.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 1355, __extension__ __PRETTY_FUNCTION__));
1356	assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&(static_cast <bool> (!CGF.getContext().getTypeSizeInChars (ValTy).isZero() && "Val type must sized.") ? void (0 ) : __assert_fail ("!CGF.getContext().getTypeSizeInChars(ValTy).isZero() && \"Val type must sized.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 1357, __extension__ __PRETTY_FUNCTION__))
1357	"Val type must sized.")(static_cast <bool> (!CGF.getContext().getTypeSizeInChars (ValTy).isZero() && "Val type must sized.") ? void (0 ) : __assert_fail ("!CGF.getContext().getTypeSizeInChars(ValTy).isZero() && \"Val type must sized.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 1357, __extension__ __PRETTY_FUNCTION__));
1358	llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
1359	if (ValTy == CastTy)
1360	return Val;
1361	if (CGF.getContext().getTypeSizeInChars(ValTy) ==
1362	CGF.getContext().getTypeSizeInChars(CastTy))
1363	return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
1364	if (CastTy->isIntegerType() && ValTy->isIntegerType())
1365	return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
1366	CastTy->hasSignedIntegerRepresentation());
1367	Address CastItem = CGF.CreateMemTemp(CastTy);
1368	Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1369	CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()),
1370	Val->getType());
1371	CGF.EmitStoreOfScalar(Val, ValCastItem, /Volatile=/false, ValTy,
1372	LValueBaseInfo(AlignmentSource::Type),
1373	TBAAAccessInfo());
1374	return CGF.EmitLoadOfScalar(CastItem, /Volatile=/false, CastTy, Loc,
1375	LValueBaseInfo(AlignmentSource::Type),
1376	TBAAAccessInfo());
1377	}
1378
1379	/// This function creates calls to one of two shuffle functions to copy
1380	/// variables between lanes in a warp.
1381	static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
1382	llvm::Value *Elem,
1383	QualType ElemType,
1384	llvm::Value *Offset,
1385	SourceLocation Loc) {
1386	CodeGenModule &CGM = CGF.CGM;
1387	CGBuilderTy &Bld = CGF.Builder;
1388	CGOpenMPRuntimeGPU &RT =
1389	(static_cast<CGOpenMPRuntimeGPU >(&CGM.getOpenMPRuntime()));
1390	llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
1391
1392	CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1393	assert(Size.getQuantity() <= 8 &&(static_cast <bool> (Size.getQuantity() <= 8 && "Unsupported bitwidth in shuffle instruction.") ? void (0) : __assert_fail ("Size.getQuantity() <= 8 && \"Unsupported bitwidth in shuffle instruction.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 1394, __extension__ __PRETTY_FUNCTION__))
1394	"Unsupported bitwidth in shuffle instruction.")(static_cast <bool> (Size.getQuantity() <= 8 && "Unsupported bitwidth in shuffle instruction.") ? void (0) : __assert_fail ("Size.getQuantity() <= 8 && \"Unsupported bitwidth in shuffle instruction.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 1394, __extension__ __PRETTY_FUNCTION__));
1395
1396	RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
1397	? OMPRTL___kmpc_shuffle_int32
1398	: OMPRTL___kmpc_shuffle_int64;
1399
1400	// Cast all types to 32- or 64-bit values before calling shuffle routines.
1401	QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
1402	Size.getQuantity() <= 4 ? 32 : 64, /Signed=/1);
1403	llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
1404	llvm::Value *WarpSize =
1405	Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /isSigned=/true);
1406
1407	llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
1408	OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
1409	{ElemCast, Offset, WarpSize});
1410
1411	return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
1412	}
1413
1414	static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
1415	Address DestAddr, QualType ElemType,
1416	llvm::Value *Offset, SourceLocation Loc) {
1417	CGBuilderTy &Bld = CGF.Builder;
1418
1419	CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1420	// Create the loop over the big sized data.
1421	// ptr = (void*)Elem;
1422	// ptrEnd = (void*) Elem + 1;
1423	// Step = 8;
1424	// while (ptr + Step < ptrEnd)
1425	// shuffle((int64_t)*ptr);
1426	// Step = 4;
1427	// while (ptr + Step < ptrEnd)
1428	// shuffle((int32_t)*ptr);
1429	// ...
1430	Address ElemPtr = DestAddr;
1431	Address Ptr = SrcAddr;
1432	Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
1433	Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty);
1434	for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
1435	if (Size < CharUnits::fromQuantity(IntSize))
1436	continue;
1437	QualType IntType = CGF.getContext().getIntTypeForBitwidth(
1438	CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
1439	/Signed=/1);
1440	llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
1441	Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(),
1442	IntTy);
1443	ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1444	ElemPtr, IntTy->getPointerTo(), IntTy);
1445	if (Size.getQuantity() / IntSize > 1) {
1446	llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
1447	llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
1448	llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
1449	llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
1450	CGF.EmitBlock(PreCondBB);
1451	llvm::PHINode *PhiSrc =
1452	Bld.CreatePHI(Ptr.getType(), /NumReservedValues=/2);
1453	PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
1454	llvm::PHINode *PhiDest =
1455	Bld.CreatePHI(ElemPtr.getType(), /NumReservedValues=/2);
1456	PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
1457	Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
1458	ElemPtr =
1459	Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
1460	llvm::Value *PtrDiff = Bld.CreatePtrDiff(
1461	CGF.Int8Ty, PtrEnd.getPointer(),
1462	Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr.getPointer(),
1463	CGF.VoidPtrTy));
1464	Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
1465	ThenBB, ExitBB);
1466	CGF.EmitBlock(ThenBB);
1467	llvm::Value *Res = createRuntimeShuffleFunction(
1468	CGF,
1469	CGF.EmitLoadOfScalar(Ptr, /Volatile=/false, IntType, Loc,
1470	LValueBaseInfo(AlignmentSource::Type),
1471	TBAAAccessInfo()),
1472	IntType, Offset, Loc);
1473	CGF.EmitStoreOfScalar(Res, ElemPtr, /Volatile=/false, IntType,
1474	LValueBaseInfo(AlignmentSource::Type),
1475	TBAAAccessInfo());
1476	Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
1477	Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1478	PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
1479	PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
1480	CGF.EmitBranch(PreCondBB);
1481	CGF.EmitBlock(ExitBB);
1482	} else {
1483	llvm::Value *Res = createRuntimeShuffleFunction(
1484	CGF,
1485	CGF.EmitLoadOfScalar(Ptr, /Volatile=/false, IntType, Loc,
1486	LValueBaseInfo(AlignmentSource::Type),
1487	TBAAAccessInfo()),
1488	IntType, Offset, Loc);
1489	CGF.EmitStoreOfScalar(Res, ElemPtr, /Volatile=/false, IntType,
1490	LValueBaseInfo(AlignmentSource::Type),
1491	TBAAAccessInfo());
1492	Ptr = Bld.CreateConstGEP(Ptr, 1);
1493	ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1494	}
1495	Size = Size % IntSize;
1496	}
1497	}
1498
1499	namespace {
1500	enum CopyAction : unsigned {
1501	// RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1502	// the warp using shuffle instructions.
1503	RemoteLaneToThread,
1504	// ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1505	ThreadCopy,
1506	// ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
1507	ThreadToScratchpad,
1508	// ScratchpadToThread: Copy from a scratchpad array in global memory
1509	// containing team-reduced data to a thread's stack.
1510	ScratchpadToThread,
1511	};
1512	} // namespace
1513
1514	struct CopyOptionsTy {
1515	llvm::Value *RemoteLaneOffset;
1516	llvm::Value *ScratchpadIndex;
1517	llvm::Value *ScratchpadWidth;
1518	};
1519
1520	/// Emit instructions to copy a Reduce list, which contains partially
1521	/// aggregated values, in the specified direction.
1522	static void emitReductionListCopy(
1523	CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
1524	ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
1525	CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
1526
1527	CodeGenModule &CGM = CGF.CGM;
1528	ASTContext &C = CGM.getContext();
1529	CGBuilderTy &Bld = CGF.Builder;
1530
1531	llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1532	llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
1533	llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
1534
1535	// Iterates, element-by-element, through the source Reduce list and
1536	// make a copy.
1537	unsigned Idx = 0;
1538	unsigned Size = Privates.size();
1539	for (const Expr *Private : Privates) {
1540	Address SrcElementAddr = Address::invalid();
1541	Address DestElementAddr = Address::invalid();
1542	Address DestElementPtrAddr = Address::invalid();
1543	// Should we shuffle in an element from a remote lane?
1544	bool ShuffleInElement = false;
1545	// Set to true to update the pointer in the dest Reduce list to a
1546	// newly created element.
1547	bool UpdateDestListPtr = false;
1548	// Increment the src or dest pointer to the scratchpad, for each
1549	// new element.
1550	bool IncrScratchpadSrc = false;
1551	bool IncrScratchpadDest = false;
1552	QualType PrivatePtrType = C.getPointerType(Private->getType());
1553	llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType);
1554
1555	switch (Action) {
1556	case RemoteLaneToThread: {
1557	// Step 1.1: Get the address for the src element in the Reduce list.
1558	Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1559	SrcElementAddr =
1560	CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1561	SrcElementPtrAddr, PrivateLlvmPtrType),
1562	PrivatePtrType->castAs<PointerType>());
1563
1564	// Step 1.2: Create a temporary to store the element in the destination
1565	// Reduce list.
1566	DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1567	DestElementAddr =
1568	CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1569	ShuffleInElement = true;
1570	UpdateDestListPtr = true;
1571	break;
1572	}
1573	case ThreadCopy: {
1574	// Step 1.1: Get the address for the src element in the Reduce list.
1575	Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1576	SrcElementAddr =
1577	CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1578	SrcElementPtrAddr, PrivateLlvmPtrType),
1579	PrivatePtrType->castAs<PointerType>());
1580
1581	// Step 1.2: Get the address for dest element. The destination
1582	// element has already been created on the thread's stack.
1583	DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1584	DestElementAddr =
1585	CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1586	DestElementPtrAddr, PrivateLlvmPtrType),
1587	PrivatePtrType->castAs<PointerType>());
1588	break;
1589	}
1590	case ThreadToScratchpad: {
1591	// Step 1.1: Get the address for the src element in the Reduce list.
1592	Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1593	SrcElementAddr =
1594	CGF.EmitLoadOfPointer(CGF.Builder.CreateElementBitCast(
1595	SrcElementPtrAddr, PrivateLlvmPtrType),
1596	PrivatePtrType->castAs<PointerType>());
1597
1598	// Step 1.2: Get the address for dest element:
1599	// address = base + index * ElementSizeInChars.
1600	llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1601	llvm::Value *CurrentOffset =
1602	Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1603	llvm::Value *ScratchPadElemAbsolutePtrVal =
1604	Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
1605	ScratchPadElemAbsolutePtrVal =
1606	Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1607	DestElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty,
1608	C.getTypeAlignInChars(Private->getType()));
1609	IncrScratchpadDest = true;
1610	break;
1611	}
1612	case ScratchpadToThread: {
1613	// Step 1.1: Get the address for the src element in the scratchpad.
1614	// address = base + index * ElementSizeInChars.
1615	llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1616	llvm::Value *CurrentOffset =
1617	Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
1618	llvm::Value *ScratchPadElemAbsolutePtrVal =
1619	Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
1620	ScratchPadElemAbsolutePtrVal =
1621	Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1622	SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal, CGF.Int8Ty,
1623	C.getTypeAlignInChars(Private->getType()));
1624	IncrScratchpadSrc = true;
1625
1626	// Step 1.2: Create a temporary to store the element in the destination
1627	// Reduce list.
1628	DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1629	DestElementAddr =
1630	CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1631	UpdateDestListPtr = true;
1632	break;
1633	}
1634	}
1635
1636	// Regardless of src and dest of copy, we emit the load of src
1637	// element as this is required in all directions
1638	SrcElementAddr = Bld.CreateElementBitCast(
1639	SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
1640	DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
1641	SrcElementAddr.getElementType());
1642
1643	// Now that all active lanes have read the element in the
1644	// Reduce list, shuffle over the value from the remote lane.
1645	if (ShuffleInElement) {
1646	shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
1647	RemoteLaneOffset, Private->getExprLoc());
1648	} else {
1649	switch (CGF.getEvaluationKind(Private->getType())) {
1650	case TEK_Scalar: {
1651	llvm::Value *Elem = CGF.EmitLoadOfScalar(
1652	SrcElementAddr, /Volatile=/false, Private->getType(),
1653	Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
1654	TBAAAccessInfo());
1655	// Store the source element value to the dest element address.
1656	CGF.EmitStoreOfScalar(
1657	Elem, DestElementAddr, /Volatile=/false, Private->getType(),
1658	LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
1659	break;
1660	}
1661	case TEK_Complex: {
1662	CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
1663	CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1664	Private->getExprLoc());
1665	CGF.EmitStoreOfComplex(
1666	Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1667	/isInit=/false);
1668	break;
1669	}
1670	case TEK_Aggregate:
1671	CGF.EmitAggregateCopy(
1672	CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1673	CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1674	Private->getType(), AggValueSlot::DoesNotOverlap);
1675	break;
1676	}
1677	}
1678
1679	// Step 3.1: Modify reference in dest Reduce list as needed.
1680	// Modifying the reference in Reduce list to point to the newly
1681	// created element. The element is live in the current function
1682	// scope and that of functions it invokes (i.e., reduce_function).
1683	// RemoteReduceData[i] = (void*)&RemoteElem
1684	if (UpdateDestListPtr) {
1685	CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
1686	DestElementAddr.getPointer(), CGF.VoidPtrTy),
1687	DestElementPtrAddr, /Volatile=/false,
1688	C.VoidPtrTy);
1689	}
1690
1691	// Step 4.1: Increment SrcBase/DestBase so that it points to the starting
1692	// address of the next element in scratchpad memory, unless we're currently
1693	// processing the last one. Memory alignment is also taken care of here.
1694	if ((IncrScratchpadDest \|\| IncrScratchpadSrc) && (Idx + 1 < Size)) {
1695	// FIXME: This code doesn't make any sense, it's trying to perform
1696	// integer arithmetic on pointers.
1697	llvm::Value *ScratchpadBasePtr =
1698	IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
1699	llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
1700	ScratchpadBasePtr = Bld.CreateNUWAdd(
1701	ScratchpadBasePtr,
1702	Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
1703
1704	// Take care of global memory alignment for performance
1705	ScratchpadBasePtr = Bld.CreateNUWSub(
1706	ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
1707	ScratchpadBasePtr = Bld.CreateUDiv(
1708	ScratchpadBasePtr,
1709	llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
1710	ScratchpadBasePtr = Bld.CreateNUWAdd(
1711	ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
1712	ScratchpadBasePtr = Bld.CreateNUWMul(
1713	ScratchpadBasePtr,
1714	llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
1715
1716	if (IncrScratchpadDest)
1717	DestBase =
1718	Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign());
1719	else /* IncrScratchpadSrc = true */
1720	SrcBase =
1721	Address(ScratchpadBasePtr, CGF.VoidPtrTy, CGF.getPointerAlign());
1722	}
1723
1724	++Idx;
1725	}
1726	}
1727
1728	/// This function emits a helper that gathers Reduce lists from the first
1729	/// lane of every active warp to lanes in the first warp.
1730	///
1731	/// void inter_warp_copy_func(void* reduce_data, num_warps)
1732	/// shared smem[warp_size];
1733	/// For all data entries D in reduce_data:
1734	/// sync
1735	/// If (I am the first lane in each warp)
1736	/// Copy my local D to smem[warp_id]
1737	/// sync
1738	/// if (I am the first warp)
1739	/// Copy smem[thread_id] to my local D
1740	static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
1741	ArrayRef<const Expr *> Privates,
1742	QualType ReductionArrayTy,
1743	SourceLocation Loc) {
1744	ASTContext &C = CGM.getContext();
1745	llvm::Module &M = CGM.getModule();
1746
1747	// ReduceList: thread local Reduce list.
1748	// At the stage of the computation when this function is called, partially
1749	// aggregated values reside in the first lane of every active warp.
1750	ImplicitParamDecl ReduceListArg(C, /DC=/nullptr, Loc, /Id=/nullptr,
1751	C.VoidPtrTy, ImplicitParamDecl::Other);
1752	// NumWarps: number of warps active in the parallel region. This could
1753	// be smaller than 32 (max warps in a CTA) for partial block reduction.
1754	ImplicitParamDecl NumWarpsArg(C, /DC=/nullptr, Loc, /Id=/nullptr,
1755	C.getIntTypeForBitwidth(32, /* Signed */ true),
1756	ImplicitParamDecl::Other);
1757	FunctionArgList Args;
1758	Args.push_back(&ReduceListArg);
1759	Args.push_back(&NumWarpsArg);
1760
1761	const CGFunctionInfo &CGFI =
1762	CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1763	auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
1764	llvm::GlobalValue::InternalLinkage,
1765	"_omp_reduction_inter_warp_copy_func", &M);
1766	CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
1767	Fn->setDoesNotRecurse();
1768	CodeGenFunction CGF(CGM);
1769	CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
1770
1771	CGBuilderTy &Bld = CGF.Builder;
1772
1773	// This array is used as a medium to transfer, one reduce element at a time,
1774	// the data from the first lane of every warp to lanes in the first warp
1775	// in order to perform the final step of a reduction in a parallel region
1776	// (reduction across warps). The array is placed in NVPTX __shared__ memory
1777	// for reduced latency, as well as to have a distinct copy for concurrently
1778	// executing target regions. The array is declared with common linkage so
1779	// as to be shared across compilation units.
1780	StringRef TransferMediumName =
1781	"__openmp_nvptx_data_transfer_temporary_storage";
1782	llvm::GlobalVariable *TransferMedium =
1783	M.getGlobalVariable(TransferMediumName);
1784	unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
1785	if (!TransferMedium) {
1786	auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
1787	unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
1788	TransferMedium = new llvm::GlobalVariable(
1789	M, Ty, /isConstant=/false, llvm::GlobalVariable::WeakAnyLinkage,
1790	llvm::UndefValue::get(Ty), TransferMediumName,
1791	/InsertBefore=/nullptr, llvm::GlobalVariable::NotThreadLocal,
1792	SharedAddressSpace);
1793	CGM.addCompilerUsedGlobal(TransferMedium);
1794	}
1795
1796	auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1797	// Get the CUDA thread id of the current OpenMP thread on the GPU.
1798	llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1799	// nvptx_lane_id = nvptx_id % warpsize
1800	llvm::Value *LaneID = getNVPTXLaneID(CGF);
1801	// nvptx_warp_id = nvptx_id / warpsize
1802	llvm::Value *WarpID = getNVPTXWarpID(CGF);
1803
1804	Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1805	llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
1806	Address LocalReduceList(
1807	Bld.CreatePointerBitCastOrAddrSpaceCast(
1808	CGF.EmitLoadOfScalar(
1809	AddrReduceListArg, /Volatile=/false, C.VoidPtrTy, Loc,
1810	LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
1811	ElemTy->getPointerTo()),
1812	ElemTy, CGF.getPointerAlign());
1813
1814	unsigned Idx = 0;
1815	for (const Expr *Private : Privates) {
1816	//
1817	// Warp master copies reduce element to transfer medium in __shared__
1818	// memory.
1819	//
1820	unsigned RealTySize =
1821	C.getTypeSizeInChars(Private->getType())
1822	.alignTo(C.getTypeAlignInChars(Private->getType()))
1823	.getQuantity();
1824	for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
1825	unsigned NumIters = RealTySize / TySize;
1826	if (NumIters == 0)
1827	continue;
1828	QualType CType = C.getIntTypeForBitwidth(
1829	C.toBits(CharUnits::fromQuantity(TySize)), /Signed=/1);
1830	llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
1831	CharUnits Align = CharUnits::fromQuantity(TySize);
1832	llvm::Value *Cnt = nullptr;
1833	Address CntAddr = Address::invalid();
1834	llvm::BasicBlock *PrecondBB = nullptr;
1835	llvm::BasicBlock *ExitBB = nullptr;
1836	if (NumIters > 1) {
1837	CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
1838	CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
1839	/Volatile=/false, C.IntTy);
1840	PrecondBB = CGF.createBasicBlock("precond");
1841	ExitBB = CGF.createBasicBlock("exit");
1842	llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
1843	// There is no need to emit line number for unconditional branch.
1844	(void)ApplyDebugLocation::CreateEmpty(CGF);
1845	CGF.EmitBlock(PrecondBB);
1846	Cnt = CGF.EmitLoadOfScalar(CntAddr, /Volatile=/false, C.IntTy, Loc);
1847	llvm::Value *Cmp =
1848	Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
1849	Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
1850	CGF.EmitBlock(BodyBB);
1851	}
1852	// kmpc_barrier.
1853	CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1854	/EmitChecks=/false,
1855	/ForceSimpleCall=/true);
1856	llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
1857	llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
1858	llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
1859
1860	// if (lane_id == 0)
1861	llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
1862	Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
1863	CGF.EmitBlock(ThenBB);
1864
1865	// Reduce element = LocalReduceList[i]
1866	Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
1867	llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
1868	ElemPtrPtrAddr, /Volatile=/false, C.VoidPtrTy, SourceLocation());
1869	// elemptr = ((CopyType*)(elemptrptr)) + I
1870	Address ElemPtr(ElemPtrPtr, CGF.Int8Ty, Align);
1871	ElemPtr = Bld.CreateElementBitCast(ElemPtr, CopyType);
1872	if (NumIters > 1)
1873	ElemPtr = Bld.CreateGEP(ElemPtr, Cnt);
1874
1875	// Get pointer to location in transfer medium.
1876	// MediumPtr = &medium[warp_id]
1877	llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
1878	TransferMedium->getValueType(), TransferMedium,
1879	{llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
1880	// Casting to actual data type.
1881	// MediumPtr = (CopyType*)MediumPtrAddr;
1882	Address MediumPtr(
1883	Bld.CreateBitCast(
1884	MediumPtrVal,
1885	CopyType->getPointerTo(
1886	MediumPtrVal->getType()->getPointerAddressSpace())),
1887	CopyType, Align);
1888
1889	// elem = *elemptr
1890	//*MediumPtr = elem
1891	llvm::Value *Elem = CGF.EmitLoadOfScalar(
1892	ElemPtr, /Volatile=/false, CType, Loc,
1893	LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
1894	// Store the source element value to the dest element address.
1895	CGF.EmitStoreOfScalar(Elem, MediumPtr, /Volatile=/true, CType,
1896	LValueBaseInfo(AlignmentSource::Type),
1897	TBAAAccessInfo());
1898
1899	Bld.CreateBr(MergeBB);
1900
1901	CGF.EmitBlock(ElseBB);
1902	Bld.CreateBr(MergeBB);
1903
1904	CGF.EmitBlock(MergeBB);
1905
1906	// kmpc_barrier.
1907	CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1908	/EmitChecks=/false,
1909	/ForceSimpleCall=/true);
1910
1911	//
1912	// Warp 0 copies reduce element from transfer medium.
1913	//
1914	llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
1915	llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
1916	llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
1917
1918	Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
1919	llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
1920	AddrNumWarpsArg, /Volatile=/false, C.IntTy, Loc);
1921
1922	// Up to 32 threads in warp 0 are active.
1923	llvm::Value *IsActiveThread =
1924	Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
1925	Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
1926
1927	CGF.EmitBlock(W0ThenBB);
1928
1929	// SrcMediumPtr = &medium[tid]
1930	llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
1931	TransferMedium->getValueType(), TransferMedium,
1932	{llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
1933	// SrcMediumVal = *SrcMediumPtr;
1934	Address SrcMediumPtr(
1935	Bld.CreateBitCast(
1936	SrcMediumPtrVal,
1937	CopyType->getPointerTo(
1938	SrcMediumPtrVal->getType()->getPointerAddressSpace())),
1939	CopyType, Align);
1940
1941	// TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
1942	Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
1943	llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
1944	TargetElemPtrPtr, /Volatile=/false, C.VoidPtrTy, Loc);
1945	Address TargetElemPtr(TargetElemPtrVal, CGF.Int8Ty, Align);
1946	TargetElemPtr = Bld.CreateElementBitCast(TargetElemPtr, CopyType);
1947	if (NumIters > 1)
1948	TargetElemPtr = Bld.CreateGEP(TargetElemPtr, Cnt);
1949
1950	// *TargetElemPtr = SrcMediumVal;
1951	llvm::Value *SrcMediumValue =
1952	CGF.EmitLoadOfScalar(SrcMediumPtr, /Volatile=/true, CType, Loc);
1953	CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /Volatile=/false,
1954	CType);
1955	Bld.CreateBr(W0MergeBB);
1956
1957	CGF.EmitBlock(W0ElseBB);
1958	Bld.CreateBr(W0MergeBB);
1959
1960	CGF.EmitBlock(W0MergeBB);
1961
1962	if (NumIters > 1) {
1963	Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /V=/1));
1964	CGF.EmitStoreOfScalar(Cnt, CntAddr, /Volatile=/false, C.IntTy);
1965	CGF.EmitBranch(PrecondBB);
1966	(void)ApplyDebugLocation::CreateEmpty(CGF);
1967	CGF.EmitBlock(ExitBB);
1968	}
1969	RealTySize %= TySize;
1970	}
1971	++Idx;
1972	}
1973
1974	CGF.FinishFunction();
1975	return Fn;
1976	}
1977
1978	/// Emit a helper that reduces data across two OpenMP threads (lanes)
1979	/// in the same warp. It uses shuffle instructions to copy over data from
1980	/// a remote lane's stack. The reduction algorithm performed is specified
1981	/// by the fourth parameter.
1982	///
1983	/// Algorithm Versions.
1984	/// Full Warp Reduce (argument value 0):
1985	/// This algorithm assumes that all 32 lanes are active and gathers
1986	/// data from these 32 lanes, producing a single resultant value.
1987	/// Contiguous Partial Warp Reduce (argument value 1):
1988	/// This algorithm assumes that only a contiguous subset of lanes
1989	/// are active. This happens for the last warp in a parallel region
1990	/// when the user specified num_threads is not an integer multiple of
1991	/// 32. This contiguous subset always starts with the zeroth lane.
1992	/// Partial Warp Reduce (argument value 2):
1993	/// This algorithm gathers data from any number of lanes at any position.
1994	/// All reduced values are stored in the lowest possible lane. The set
1995	/// of problems every algorithm addresses is a super set of those
1996	/// addressable by algorithms with a lower version number. Overhead
1997	/// increases as algorithm version increases.
1998	///
1999	/// Terminology
2000	/// Reduce element:
2001	/// Reduce element refers to the individual data field with primitive
2002	/// data types to be combined and reduced across threads.
2003	/// Reduce list:
2004	/// Reduce list refers to a collection of local, thread-private
2005	/// reduce elements.
2006	/// Remote Reduce list:
2007	/// Remote Reduce list refers to a collection of remote (relative to
2008	/// the current thread) reduce elements.
2009	///
2010	/// We distinguish between three states of threads that are important to
2011	/// the implementation of this function.
2012	/// Alive threads:
2013	/// Threads in a warp executing the SIMT instruction, as distinguished from
2014	/// threads that are inactive due to divergent control flow.
2015	/// Active threads:
2016	/// The minimal set of threads that has to be alive upon entry to this
2017	/// function. The computation is correct iff active threads are alive.
2018	/// Some threads are alive but they are not active because they do not
2019	/// contribute to the computation in any useful manner. Turning them off
2020	/// may introduce control flow overheads without any tangible benefits.
2021	/// Effective threads:
2022	/// In order to comply with the argument requirements of the shuffle
2023	/// function, we must keep all lanes holding data alive. But at most
2024	/// half of them perform value aggregation; we refer to this half of
2025	/// threads as effective. The other half is simply handing off their
2026	/// data.
2027	///
2028	/// Procedure
2029	/// Value shuffle:
2030	/// In this step active threads transfer data from higher lane positions
2031	/// in the warp to lower lane positions, creating Remote Reduce list.
2032	/// Value aggregation:
2033	/// In this step, effective threads combine their thread local Reduce list
2034	/// with Remote Reduce list and store the result in the thread local
2035	/// Reduce list.
2036	/// Value copy:
2037	/// In this step, we deal with the assumption made by algorithm 2
2038	/// (i.e. contiguity assumption). When we have an odd number of lanes
2039	/// active, say 2k+1, only k threads will be effective and therefore k
2040	/// new values will be produced. However, the Reduce list owned by the
2041	/// (2k+1)th thread is ignored in the value aggregation. Therefore
2042	/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
2043	/// that the contiguity assumption still holds.
2044	static llvm::Function *emitShuffleAndReduceFunction(
2045	CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2046	QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
2047	ASTContext &C = CGM.getContext();
2048
2049	// Thread local Reduce list used to host the values of data to be reduced.
2050	ImplicitParamDecl ReduceListArg(C, /DC=/nullptr, Loc, /Id=/nullptr,
2051	C.VoidPtrTy, ImplicitParamDecl::Other);
2052	// Current lane id; could be logical.
2053	ImplicitParamDecl LaneIDArg(C, /DC=/nullptr, Loc, /Id=/nullptr, C.ShortTy,
2054	ImplicitParamDecl::Other);
2055	// Offset of the remote source lane relative to the current lane.
2056	ImplicitParamDecl RemoteLaneOffsetArg(C, /DC=/nullptr, Loc, /Id=/nullptr,
2057	C.ShortTy, ImplicitParamDecl::Other);
2058	// Algorithm version. This is expected to be known at compile time.
2059	ImplicitParamDecl AlgoVerArg(C, /DC=/nullptr, Loc, /Id=/nullptr,
2060	C.ShortTy, ImplicitParamDecl::Other);
2061	FunctionArgList Args;
2062	Args.push_back(&ReduceListArg);
2063	Args.push_back(&LaneIDArg);
2064	Args.push_back(&RemoteLaneOffsetArg);
2065	Args.push_back(&AlgoVerArg);
2066
2067	const CGFunctionInfo &CGFI =
2068	CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2069	auto *Fn = llvm::Function::Create(
2070	CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2071	"_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2072	CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2073	Fn->setDoesNotRecurse();
2074
2075	CodeGenFunction CGF(CGM);
2076	CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2077
2078	CGBuilderTy &Bld = CGF.Builder;
2079
2080	Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2081	llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2082	Address LocalReduceList(
2083	Bld.CreatePointerBitCastOrAddrSpaceCast(
2084	CGF.EmitLoadOfScalar(AddrReduceListArg, /Volatile=/false,
2085	C.VoidPtrTy, SourceLocation()),
2086	ElemTy->getPointerTo()),
2087	ElemTy, CGF.getPointerAlign());
2088
2089	Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2090	llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2091	AddrLaneIDArg, /Volatile=/false, C.ShortTy, SourceLocation());
2092
2093	Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2094	llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2095	AddrRemoteLaneOffsetArg, /Volatile=/false, C.ShortTy, SourceLocation());
2096
2097	Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2098	llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2099	AddrAlgoVerArg, /Volatile=/false, C.ShortTy, SourceLocation());
2100
2101	// Create a local thread-private variable to host the Reduce list
2102	// from a remote lane.
2103	Address RemoteReduceList =
2104	CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2105
2106	// This loop iterates through the list of reduce elements and copies,
2107	// element by element, from a remote lane in the warp to RemoteReduceList,
2108	// hosted on the thread's stack.
2109	emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
2110	LocalReduceList, RemoteReduceList,
2111	{/RemoteLaneOffset=/RemoteLaneOffsetArgVal,
2112	/ScratchpadIndex=/nullptr,
2113	/ScratchpadWidth=/nullptr});
2114
2115	// The actions to be performed on the Remote Reduce list is dependent
2116	// on the algorithm version.
2117	//
2118	// if (AlgoVer==0) \|\| (AlgoVer==1 && (LaneId < Offset)) \|\| (AlgoVer==2 &&
2119	// LaneId % 2 == 0 && Offset > 0):
2120	// do the reduction value aggregation
2121	//
2122	// The thread local variable Reduce list is mutated in place to host the
2123	// reduced data, which is the aggregated value produced from local and
2124	// remote lanes.
2125	//
2126	// Note that AlgoVer is expected to be a constant integer known at compile
2127	// time.
2128	// When AlgoVer==0, the first conjunction evaluates to true, making
2129	// the entire predicate true during compile time.
2130	// When AlgoVer==1, the second conjunction has only the second part to be
2131	// evaluated during runtime. Other conjunctions evaluates to false
2132	// during compile time.
2133	// When AlgoVer==2, the third conjunction has only the second part to be
2134	// evaluated during runtime. Other conjunctions evaluates to false
2135	// during compile time.
2136	llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
2137
2138	llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2139	llvm::Value *CondAlgo1 = Bld.CreateAnd(
2140	Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
2141
2142	llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
2143	llvm::Value *CondAlgo2 = Bld.CreateAnd(
2144	Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
2145	CondAlgo2 = Bld.CreateAnd(
2146	CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
2147
2148	llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
2149	CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
2150
2151	llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2152	llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2153	llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2154	Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
2155
2156	CGF.EmitBlock(ThenBB);
2157	// reduce_function(LocalReduceList, RemoteReduceList)
2158	llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2159	LocalReduceList.getPointer(), CGF.VoidPtrTy);
2160	llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2161	RemoteReduceList.getPointer(), CGF.VoidPtrTy);
2162	CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2163	CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
2164	Bld.CreateBr(MergeBB);
2165
2166	CGF.EmitBlock(ElseBB);
2167	Bld.CreateBr(MergeBB);
2168
2169	CGF.EmitBlock(MergeBB);
2170
2171	// if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2172	// Reduce list.
2173	Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2174	llvm::Value *CondCopy = Bld.CreateAnd(
2175	Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
2176
2177	llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
2178	llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
2179	llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
2180	Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2181
2182	CGF.EmitBlock(CpyThenBB);
2183	emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
2184	RemoteReduceList, LocalReduceList);
2185	Bld.CreateBr(CpyMergeBB);
2186
2187	CGF.EmitBlock(CpyElseBB);
2188	Bld.CreateBr(CpyMergeBB);
2189
2190	CGF.EmitBlock(CpyMergeBB);
2191
2192	CGF.FinishFunction();
2193	return Fn;
2194	}
2195
2196	/// This function emits a helper that copies all the reduction variables from
2197	/// the team into the provided global buffer for the reduction variables.
2198	///
2199	/// void list_to_global_copy_func(void buffer, int Idx, void reduce_data)
2200	/// For all data entries D in reduce_data:
2201	/// Copy local D to buffer.D[Idx]
2202	static llvm::Value *emitListToGlobalCopyFunction(
2203	CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2204	QualType ReductionArrayTy, SourceLocation Loc,
2205	const RecordDecl *TeamReductionRec,
2206	const llvm::SmallDenseMap<const ValueDecl , const FieldDecl >
2207	&VarFieldMap) {
2208	ASTContext &C = CGM.getContext();
2209
2210	// Buffer: global reduction buffer.
2211	ImplicitParamDecl BufferArg(C, /DC=/nullptr, Loc, /Id=/nullptr,
2212	C.VoidPtrTy, ImplicitParamDecl::Other);
2213	// Idx: index of the buffer.
2214	ImplicitParamDecl IdxArg(C, /DC=/nullptr, Loc, /Id=/nullptr, C.IntTy,
2215	ImplicitParamDecl::Other);
2216	// ReduceList: thread local Reduce list.
2217	ImplicitParamDecl ReduceListArg(C, /DC=/nullptr, Loc, /Id=/nullptr,
2218	C.VoidPtrTy, ImplicitParamDecl::Other);
2219	FunctionArgList Args;
2220	Args.push_back(&BufferArg);
2221	Args.push_back(&IdxArg);
2222	Args.push_back(&ReduceListArg);
2223
2224	const CGFunctionInfo &CGFI =
2225	CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2226	auto *Fn = llvm::Function::Create(
2227	CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2228	"_omp_reduction_list_to_global_copy_func", &CGM.getModule());
2229	CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2230	Fn->setDoesNotRecurse();
2231	CodeGenFunction CGF(CGM);
2232	CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2233
2234	CGBuilderTy &Bld = CGF.Builder;
2235
2236	Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2237	Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2238	llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2239	Address LocalReduceList(
2240	Bld.CreatePointerBitCastOrAddrSpaceCast(
2241	CGF.EmitLoadOfScalar(AddrReduceListArg, /Volatile=/false,
2242	C.VoidPtrTy, Loc),
2243	ElemTy->getPointerTo()),
2244	ElemTy, CGF.getPointerAlign());
2245	QualType StaticTy = C.getRecordType(TeamReductionRec);
2246	llvm::Type *LLVMReductionsBufferTy =
2247	CGM.getTypes().ConvertTypeForMem(StaticTy);
2248	llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2249	CGF.EmitLoadOfScalar(AddrBufferArg, /Volatile=/false, C.VoidPtrTy, Loc),
2250	LLVMReductionsBufferTy->getPointerTo());
2251	llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2252	CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2253	/Volatile=/false, C.IntTy,
2254	Loc)};
2255	unsigned Idx = 0;
2256	for (const Expr *Private : Privates) {
2257	// Reduce element = LocalReduceList[i]
2258	Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2259	llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2260	ElemPtrPtrAddr, /Volatile=/false, C.VoidPtrTy, SourceLocation());
2261	// elemptr = ((CopyType*)(elemptrptr)) + I
2262	ElemTy = CGF.ConvertTypeForMem(Private->getType());
2263	ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2264	ElemPtrPtr, ElemTy->getPointerTo());
2265	Address ElemPtr =
2266	Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2267	const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2268	// Global = Buffer.VD[Idx];
2269	const FieldDecl *FD = VarFieldMap.lookup(VD);
2270	LValue GlobLVal = CGF.EmitLValueForField(
2271	CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2272	Address GlobAddr = GlobLVal.getAddress(CGF);
2273	llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(),
2274	GlobAddr.getPointer(), Idxs);
2275	GlobLVal.setAddress(Address(BufferPtr,
2276	CGF.ConvertTypeForMem(Private->getType()),
2277	GlobAddr.getAlignment()));
2278	switch (CGF.getEvaluationKind(Private->getType())) {
2279	case TEK_Scalar: {
2280	llvm::Value *V = CGF.EmitLoadOfScalar(
2281	ElemPtr, /Volatile=/false, Private->getType(), Loc,
2282	LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2283	CGF.EmitStoreOfScalar(V, GlobLVal);
2284	break;
2285	}
2286	case TEK_Complex: {
2287	CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
2288	CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
2289	CGF.EmitStoreOfComplex(V, GlobLVal, /isInit=/false);
2290	break;
2291	}
2292	case TEK_Aggregate:
2293	CGF.EmitAggregateCopy(GlobLVal,
2294	CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2295	Private->getType(), AggValueSlot::DoesNotOverlap);
2296	break;
2297	}
2298	++Idx;
2299	}
2300
2301	CGF.FinishFunction();
2302	return Fn;
2303	}
2304
2305	/// This function emits a helper that reduces all the reduction variables from
2306	/// the team into the provided global buffer for the reduction variables.
2307	///
2308	/// void list_to_global_reduce_func(void buffer, int Idx, void reduce_data)
2309	/// void *GlobPtrs[];
2310	/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2311	/// ...
2312	/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2313	/// reduce_function(GlobPtrs, reduce_data);
2314	static llvm::Value *emitListToGlobalReduceFunction(
2315	CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2316	QualType ReductionArrayTy, SourceLocation Loc,
2317	const RecordDecl *TeamReductionRec,
2318	const llvm::SmallDenseMap<const ValueDecl , const FieldDecl >
2319	&VarFieldMap,
2320	llvm::Function *ReduceFn) {
2321	ASTContext &C = CGM.getContext();
2322
2323	// Buffer: global reduction buffer.
2324	ImplicitParamDecl BufferArg(C, /DC=/nullptr, Loc, /Id=/nullptr,
2325	C.VoidPtrTy, ImplicitParamDecl::Other);
2326	// Idx: index of the buffer.
2327	ImplicitParamDecl IdxArg(C, /DC=/nullptr, Loc, /Id=/nullptr, C.IntTy,
2328	ImplicitParamDecl::Other);
2329	// ReduceList: thread local Reduce list.
2330	ImplicitParamDecl ReduceListArg(C, /DC=/nullptr, Loc, /Id=/nullptr,
2331	C.VoidPtrTy, ImplicitParamDecl::Other);
2332	FunctionArgList Args;
2333	Args.push_back(&BufferArg);
2334	Args.push_back(&IdxArg);
2335	Args.push_back(&ReduceListArg);
2336
2337	const CGFunctionInfo &CGFI =
2338	CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2339	auto *Fn = llvm::Function::Create(
2340	CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2341	"_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
2342	CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2343	Fn->setDoesNotRecurse();
2344	CodeGenFunction CGF(CGM);
2345	CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2346
2347	CGBuilderTy &Bld = CGF.Builder;
2348
2349	Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2350	QualType StaticTy = C.getRecordType(TeamReductionRec);
2351	llvm::Type *LLVMReductionsBufferTy =
2352	CGM.getTypes().ConvertTypeForMem(StaticTy);
2353	llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2354	CGF.EmitLoadOfScalar(AddrBufferArg, /Volatile=/false, C.VoidPtrTy, Loc),
2355	LLVMReductionsBufferTy->getPointerTo());
2356
2357	// 1. Build a list of reduction variables.
2358	// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2359	Address ReductionList =
2360	CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2361	auto IPriv = Privates.begin();
2362	llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2363	CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2364	/Volatile=/false, C.IntTy,
2365	Loc)};
2366	unsigned Idx = 0;
2367	for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2368	Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2369	// Global = Buffer.VD[Idx];
2370	const ValueDecl VD = cast<DeclRefExpr>(IPriv)->getDecl();
2371	const FieldDecl *FD = VarFieldMap.lookup(VD);
2372	LValue GlobLVal = CGF.EmitLValueForField(
2373	CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2374	Address GlobAddr = GlobLVal.getAddress(CGF);
2375	llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2376	GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2377	llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
2378	CGF.EmitStoreOfScalar(Ptr, Elem, /Volatile=/false, C.VoidPtrTy);
2379	if ((*IPriv)->getType()->isVariablyModifiedType()) {
2380	// Store array size.
2381	++Idx;
2382	Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2383	llvm::Value *Size = CGF.Builder.CreateIntCast(
2384	CGF.getVLASize(
2385	CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2386	.NumElts,
2387	CGF.SizeTy, /isSigned=/false);
2388	CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2389	Elem);
2390	}
2391	}
2392
2393	// Call reduce_function(GlobalReduceList, ReduceList)
2394	llvm::Value *GlobalReduceList =
2395	CGF.EmitCastToVoidPtr(ReductionList.getPointer());
2396	Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2397	llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2398	AddrReduceListArg, /Volatile=/false, C.VoidPtrTy, Loc);
2399	CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2400	CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
2401	CGF.FinishFunction();
2402	return Fn;
2403	}
2404
2405	/// This function emits a helper that copies all the reduction variables from
2406	/// the team into the provided global buffer for the reduction variables.
2407	///
2408	/// void list_to_global_copy_func(void buffer, int Idx, void reduce_data)
2409	/// For all data entries D in reduce_data:
2410	/// Copy buffer.D[Idx] to local D;
2411	static llvm::Value *emitGlobalToListCopyFunction(
2412	CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2413	QualType ReductionArrayTy, SourceLocation Loc,
2414	const RecordDecl *TeamReductionRec,
2415	const llvm::SmallDenseMap<const ValueDecl , const FieldDecl >
2416	&VarFieldMap) {
2417	ASTContext &C = CGM.getContext();
2418
2419	// Buffer: global reduction buffer.
2420	ImplicitParamDecl BufferArg(C, /DC=/nullptr, Loc, /Id=/nullptr,
2421	C.VoidPtrTy, ImplicitParamDecl::Other);
2422	// Idx: index of the buffer.
2423	ImplicitParamDecl IdxArg(C, /DC=/nullptr, Loc, /Id=/nullptr, C.IntTy,
2424	ImplicitParamDecl::Other);
2425	// ReduceList: thread local Reduce list.
2426	ImplicitParamDecl ReduceListArg(C, /DC=/nullptr, Loc, /Id=/nullptr,
2427	C.VoidPtrTy, ImplicitParamDecl::Other);
2428	FunctionArgList Args;
2429	Args.push_back(&BufferArg);
2430	Args.push_back(&IdxArg);
2431	Args.push_back(&ReduceListArg);
2432
2433	const CGFunctionInfo &CGFI =
2434	CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2435	auto *Fn = llvm::Function::Create(
2436	CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2437	"_omp_reduction_global_to_list_copy_func", &CGM.getModule());
2438	CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2439	Fn->setDoesNotRecurse();
2440	CodeGenFunction CGF(CGM);
2441	CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2442
2443	CGBuilderTy &Bld = CGF.Builder;
2444
2445	Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2446	Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2447	llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2448	Address LocalReduceList(
2449	Bld.CreatePointerBitCastOrAddrSpaceCast(
2450	CGF.EmitLoadOfScalar(AddrReduceListArg, /Volatile=/false,
2451	C.VoidPtrTy, Loc),
2452	ElemTy->getPointerTo()),
2453	ElemTy, CGF.getPointerAlign());
2454	QualType StaticTy = C.getRecordType(TeamReductionRec);
2455	llvm::Type *LLVMReductionsBufferTy =
2456	CGM.getTypes().ConvertTypeForMem(StaticTy);
2457	llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2458	CGF.EmitLoadOfScalar(AddrBufferArg, /Volatile=/false, C.VoidPtrTy, Loc),
2459	LLVMReductionsBufferTy->getPointerTo());
2460
2461	llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2462	CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2463	/Volatile=/false, C.IntTy,
2464	Loc)};
2465	unsigned Idx = 0;
2466	for (const Expr *Private : Privates) {
2467	// Reduce element = LocalReduceList[i]
2468	Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2469	llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2470	ElemPtrPtrAddr, /Volatile=/false, C.VoidPtrTy, SourceLocation());
2471	// elemptr = ((CopyType*)(elemptrptr)) + I
2472	ElemTy = CGF.ConvertTypeForMem(Private->getType());
2473	ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2474	ElemPtrPtr, ElemTy->getPointerTo());
2475	Address ElemPtr =
2476	Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2477	const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2478	// Global = Buffer.VD[Idx];
2479	const FieldDecl *FD = VarFieldMap.lookup(VD);
2480	LValue GlobLVal = CGF.EmitLValueForField(
2481	CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2482	Address GlobAddr = GlobLVal.getAddress(CGF);
2483	llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(GlobAddr.getElementType(),
2484	GlobAddr.getPointer(), Idxs);
2485	GlobLVal.setAddress(Address(BufferPtr,
2486	CGF.ConvertTypeForMem(Private->getType()),
2487	GlobAddr.getAlignment()));
2488	switch (CGF.getEvaluationKind(Private->getType())) {
2489	case TEK_Scalar: {
2490	llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
2491	CGF.EmitStoreOfScalar(V, ElemPtr, /Volatile=/false, Private->getType(),
2492	LValueBaseInfo(AlignmentSource::Type),
2493	TBAAAccessInfo());
2494	break;
2495	}
2496	case TEK_Complex: {
2497	CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
2498	CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2499	/isInit=/false);
2500	break;
2501	}
2502	case TEK_Aggregate:
2503	CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2504	GlobLVal, Private->getType(),
2505	AggValueSlot::DoesNotOverlap);
2506	break;
2507	}
2508	++Idx;
2509	}
2510
2511	CGF.FinishFunction();
2512	return Fn;
2513	}
2514
2515	/// This function emits a helper that reduces all the reduction variables from
2516	/// the team into the provided global buffer for the reduction variables.
2517	///
2518	/// void global_to_list_reduce_func(void buffer, int Idx, void reduce_data)
2519	/// void *GlobPtrs[];
2520	/// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2521	/// ...
2522	/// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2523	/// reduce_function(reduce_data, GlobPtrs);
2524	static llvm::Value *emitGlobalToListReduceFunction(
2525	CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2526	QualType ReductionArrayTy, SourceLocation Loc,
2527	const RecordDecl *TeamReductionRec,
2528	const llvm::SmallDenseMap<const ValueDecl , const FieldDecl >
2529	&VarFieldMap,
2530	llvm::Function *ReduceFn) {
2531	ASTContext &C = CGM.getContext();
2532
2533	// Buffer: global reduction buffer.
2534	ImplicitParamDecl BufferArg(C, /DC=/nullptr, Loc, /Id=/nullptr,
2535	C.VoidPtrTy, ImplicitParamDecl::Other);
2536	// Idx: index of the buffer.
2537	ImplicitParamDecl IdxArg(C, /DC=/nullptr, Loc, /Id=/nullptr, C.IntTy,
2538	ImplicitParamDecl::Other);
2539	// ReduceList: thread local Reduce list.
2540	ImplicitParamDecl ReduceListArg(C, /DC=/nullptr, Loc, /Id=/nullptr,
2541	C.VoidPtrTy, ImplicitParamDecl::Other);
2542	FunctionArgList Args;
2543	Args.push_back(&BufferArg);
2544	Args.push_back(&IdxArg);
2545	Args.push_back(&ReduceListArg);
2546
2547	const CGFunctionInfo &CGFI =
2548	CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2549	auto *Fn = llvm::Function::Create(
2550	CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2551	"_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
2552	CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2553	Fn->setDoesNotRecurse();
2554	CodeGenFunction CGF(CGM);
2555	CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2556
2557	CGBuilderTy &Bld = CGF.Builder;
2558
2559	Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2560	QualType StaticTy = C.getRecordType(TeamReductionRec);
2561	llvm::Type *LLVMReductionsBufferTy =
2562	CGM.getTypes().ConvertTypeForMem(StaticTy);
2563	llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2564	CGF.EmitLoadOfScalar(AddrBufferArg, /Volatile=/false, C.VoidPtrTy, Loc),
2565	LLVMReductionsBufferTy->getPointerTo());
2566
2567	// 1. Build a list of reduction variables.
2568	// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2569	Address ReductionList =
2570	CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2571	auto IPriv = Privates.begin();
2572	llvm::Value *Idxs[] = {llvm::ConstantInt::getNullValue(CGF.Int32Ty),
2573	CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2574	/Volatile=/false, C.IntTy,
2575	Loc)};
2576	unsigned Idx = 0;
2577	for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2578	Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2579	// Global = Buffer.VD[Idx];
2580	const ValueDecl VD = cast<DeclRefExpr>(IPriv)->getDecl();
2581	const FieldDecl *FD = VarFieldMap.lookup(VD);
2582	LValue GlobLVal = CGF.EmitLValueForField(
2583	CGF.MakeNaturalAlignAddrLValue(BufferArrPtr, StaticTy), FD);
2584	Address GlobAddr = GlobLVal.getAddress(CGF);
2585	llvm::Value *BufferPtr = Bld.CreateInBoundsGEP(
2586	GlobAddr.getElementType(), GlobAddr.getPointer(), Idxs);
2587	llvm::Value *Ptr = CGF.EmitCastToVoidPtr(BufferPtr);
2588	CGF.EmitStoreOfScalar(Ptr, Elem, /Volatile=/false, C.VoidPtrTy);
2589	if ((*IPriv)->getType()->isVariablyModifiedType()) {
2590	// Store array size.
2591	++Idx;
2592	Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2593	llvm::Value *Size = CGF.Builder.CreateIntCast(
2594	CGF.getVLASize(
2595	CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2596	.NumElts,
2597	CGF.SizeTy, /isSigned=/false);
2598	CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2599	Elem);
2600	}
2601	}
2602
2603	// Call reduce_function(ReduceList, GlobalReduceList)
2604	llvm::Value *GlobalReduceList =
2605	CGF.EmitCastToVoidPtr(ReductionList.getPointer());
2606	Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2607	llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2608	AddrReduceListArg, /Volatile=/false, C.VoidPtrTy, Loc);
2609	CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2610	CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
2611	CGF.FinishFunction();
2612	return Fn;
2613	}
2614
2615	///
2616	/// Design of OpenMP reductions on the GPU
2617	///
2618	/// Consider a typical OpenMP program with one or more reduction
2619	/// clauses:
2620	///
2621	/// float foo;
2622	/// double bar;
2623	/// #pragma omp target teams distribute parallel for \
2624	/// reduction(+:foo) reduction(*:bar)
2625	/// for (int i = 0; i < N; i++) {
2626	/// foo += A[i]; bar *= B[i];
2627	/// }
2628	///
2629	/// where 'foo' and 'bar' are reduced across all OpenMP threads in
2630	/// all teams. In our OpenMP implementation on the NVPTX device an
2631	/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
2632	/// within a team are mapped to CUDA threads within a threadblock.
2633	/// Our goal is to efficiently aggregate values across all OpenMP
2634	/// threads such that:
2635	///
2636	/// - the compiler and runtime are logically concise, and
2637	/// - the reduction is performed efficiently in a hierarchical
2638	/// manner as follows: within OpenMP threads in the same warp,
2639	/// across warps in a threadblock, and finally across teams on
2640	/// the NVPTX device.
2641	///
2642	/// Introduction to Decoupling
2643	///
2644	/// We would like to decouple the compiler and the runtime so that the
2645	/// latter is ignorant of the reduction variables (number, data types)
2646	/// and the reduction operators. This allows a simpler interface
2647	/// and implementation while still attaining good performance.
2648	///
2649	/// Pseudocode for the aforementioned OpenMP program generated by the
2650	/// compiler is as follows:
2651	///
2652	/// 1. Create private copies of reduction variables on each OpenMP
2653	/// thread: 'foo_private', 'bar_private'
2654	/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
2655	/// to it and writes the result in 'foo_private' and 'bar_private'
2656	/// respectively.
2657	/// 3. Call the OpenMP runtime on the GPU to reduce within a team
2658	/// and store the result on the team master:
2659	///
2660	/// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
2661	/// reduceData, shuffleReduceFn, interWarpCpyFn)
2662	///
2663	/// where:
2664	/// struct ReduceData {
2665	/// double *foo;
2666	/// double *bar;
2667	/// } reduceData
2668	/// reduceData.foo = &foo_private
2669	/// reduceData.bar = &bar_private
2670	///
2671	/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
2672	/// auxiliary functions generated by the compiler that operate on
2673	/// variables of type 'ReduceData'. They aid the runtime perform
2674	/// algorithmic steps in a data agnostic manner.
2675	///
2676	/// 'shuffleReduceFn' is a pointer to a function that reduces data
2677	/// of type 'ReduceData' across two OpenMP threads (lanes) in the
2678	/// same warp. It takes the following arguments as input:
2679	///
2680	/// a. variable of type 'ReduceData' on the calling lane,
2681	/// b. its lane_id,
2682	/// c. an offset relative to the current lane_id to generate a
2683	/// remote_lane_id. The remote lane contains the second
2684	/// variable of type 'ReduceData' that is to be reduced.
2685	/// d. an algorithm version parameter determining which reduction
2686	/// algorithm to use.
2687	///
2688	/// 'shuffleReduceFn' retrieves data from the remote lane using
2689	/// efficient GPU shuffle intrinsics and reduces, using the
2690	/// algorithm specified by the 4th parameter, the two operands
2691	/// element-wise. The result is written to the first operand.
2692	///
2693	/// Different reduction algorithms are implemented in different
2694	/// runtime functions, all calling 'shuffleReduceFn' to perform
2695	/// the essential reduction step. Therefore, based on the 4th
2696	/// parameter, this function behaves slightly differently to
2697	/// cooperate with the runtime to ensure correctness under
2698	/// different circumstances.
2699	///
2700	/// 'InterWarpCpyFn' is a pointer to a function that transfers
2701	/// reduced variables across warps. It tunnels, through CUDA
2702	/// shared memory, the thread-private data of type 'ReduceData'
2703	/// from lane 0 of each warp to a lane in the first warp.
2704	/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
2705	/// The last team writes the global reduced value to memory.
2706	///
2707	/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
2708	/// reduceData, shuffleReduceFn, interWarpCpyFn,
2709	/// scratchpadCopyFn, loadAndReduceFn)
2710	///
2711	/// 'scratchpadCopyFn' is a helper that stores reduced
2712	/// data from the team master to a scratchpad array in
2713	/// global memory.
2714	///
2715	/// 'loadAndReduceFn' is a helper that loads data from
2716	/// the scratchpad array and reduces it with the input
2717	/// operand.
2718	///
2719	/// These compiler generated functions hide address
2720	/// calculation and alignment information from the runtime.
2721	/// 5. if ret == 1:
2722	/// The team master of the last team stores the reduced
2723	/// result to the globals in memory.
2724	/// foo += reduceData.foo; bar *= reduceData.bar
2725	///
2726	///
2727	/// Warp Reduction Algorithms
2728	///
2729	/// On the warp level, we have three algorithms implemented in the
2730	/// OpenMP runtime depending on the number of active lanes:
2731	///
2732	/// Full Warp Reduction
2733	///
2734	/// The reduce algorithm within a warp where all lanes are active
2735	/// is implemented in the runtime as follows:
2736	///
2737	/// full_warp_reduce(void *reduce_data,
2738	/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2739	/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
2740	/// ShuffleReduceFn(reduce_data, 0, offset, 0);
2741	/// }
2742	///
2743	/// The algorithm completes in log(2, WARPSIZE) steps.
2744	///
2745	/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
2746	/// not used therefore we save instructions by not retrieving lane_id
2747	/// from the corresponding special registers. The 4th parameter, which
2748	/// represents the version of the algorithm being used, is set to 0 to
2749	/// signify full warp reduction.
2750	///
2751	/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2752	///
2753	/// #reduce_elem refers to an element in the local lane's data structure
2754	/// #remote_elem is retrieved from a remote lane
2755	/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2756	/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
2757	///
2758	/// Contiguous Partial Warp Reduction
2759	///
2760	/// This reduce algorithm is used within a warp where only the first
2761	/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
2762	/// number of OpenMP threads in a parallel region is not a multiple of
2763	/// WARPSIZE. The algorithm is implemented in the runtime as follows:
2764	///
2765	/// void
2766	/// contiguous_partial_reduce(void *reduce_data,
2767	/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
2768	/// int size, int lane_id) {
2769	/// int curr_size;
2770	/// int offset;
2771	/// curr_size = size;
2772	/// mask = curr_size/2;
2773	/// while (offset>0) {
2774	/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
2775	/// curr_size = (curr_size+1)/2;
2776	/// offset = curr_size/2;
2777	/// }
2778	/// }
2779	///
2780	/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2781	///
2782	/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2783	/// if (lane_id < offset)
2784	/// reduce_elem = reduce_elem REDUCE_OP remote_elem
2785	/// else
2786	/// reduce_elem = remote_elem
2787	///
2788	/// This algorithm assumes that the data to be reduced are located in a
2789	/// contiguous subset of lanes starting from the first. When there is
2790	/// an odd number of active lanes, the data in the last lane is not
2791	/// aggregated with any other lane's dat but is instead copied over.
2792	///
2793	/// Dispersed Partial Warp Reduction
2794	///
2795	/// This algorithm is used within a warp when any discontiguous subset of
2796	/// lanes are active. It is used to implement the reduction operation
2797	/// across lanes in an OpenMP simd region or in a nested parallel region.
2798	///
2799	/// void
2800	/// dispersed_partial_reduce(void *reduce_data,
2801	/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2802	/// int size, remote_id;
2803	/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
2804	/// do {
2805	/// remote_id = next_active_lane_id_right_after_me();
2806	/// # the above function returns 0 of no active lane
2807	/// # is present right after the current lane.
2808	/// size = number_of_active_lanes_in_this_warp();
2809	/// logical_lane_id /= 2;
2810	/// ShuffleReduceFn(reduce_data, logical_lane_id,
2811	/// remote_id-1-threadIdx.x, 2);
2812	/// } while (logical_lane_id % 2 == 0 && size > 1);
2813	/// }
2814	///
2815	/// There is no assumption made about the initial state of the reduction.
2816	/// Any number of lanes (>=1) could be active at any position. The reduction
2817	/// result is returned in the first active lane.
2818	///
2819	/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2820	///
2821	/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2822	/// if (lane_id % 2 == 0 && offset > 0)
2823	/// reduce_elem = reduce_elem REDUCE_OP remote_elem
2824	/// else
2825	/// reduce_elem = remote_elem
2826	///
2827	///
2828	/// Intra-Team Reduction
2829	///
2830	/// This function, as implemented in the runtime call
2831	/// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
2832	/// threads in a team. It first reduces within a warp using the
2833	/// aforementioned algorithms. We then proceed to gather all such
2834	/// reduced values at the first warp.
2835	///
2836	/// The runtime makes use of the function 'InterWarpCpyFn', which copies
2837	/// data from each of the "warp master" (zeroth lane of each warp, where
2838	/// warp-reduced data is held) to the zeroth warp. This step reduces (in
2839	/// a mathematical sense) the problem of reduction across warp masters in
2840	/// a block to the problem of warp reduction.
2841	///
2842	///
2843	/// Inter-Team Reduction
2844	///
2845	/// Once a team has reduced its data to a single value, it is stored in
2846	/// a global scratchpad array. Since each team has a distinct slot, this
2847	/// can be done without locking.
2848	///
2849	/// The last team to write to the scratchpad array proceeds to reduce the
2850	/// scratchpad array. One or more workers in the last team use the helper
2851	/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
2852	/// the k'th worker reduces every k'th element.
2853	///
2854	/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
2855	/// reduce across workers and compute a globally reduced value.
2856	///
2857	void CGOpenMPRuntimeGPU::emitReduction(
2858	CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
2859	ArrayRef<const Expr > LHSExprs, ArrayRef<const Expr > RHSExprs,
2860	ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
2861	if (!CGF.HaveInsertPoint())
2862	return;
2863
2864	bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
2865	#ifndef NDEBUG
2866	bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
2867	#endif
2868
2869	if (Options.SimpleReduction) {
2870	assert(!TeamsReduction && !ParallelReduction &&(static_cast <bool> (!TeamsReduction && !ParallelReduction && "Invalid reduction selection in emitReduction.") ? void (0) : __assert_fail ("!TeamsReduction && !ParallelReduction && \"Invalid reduction selection in emitReduction.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 2871, __extension__ __PRETTY_FUNCTION__))
2871	"Invalid reduction selection in emitReduction.")(static_cast <bool> (!TeamsReduction && !ParallelReduction && "Invalid reduction selection in emitReduction.") ? void (0) : __assert_fail ("!TeamsReduction && !ParallelReduction && \"Invalid reduction selection in emitReduction.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 2871, __extension__ __PRETTY_FUNCTION__));
2872	CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
2873	ReductionOps, Options);
2874	return;
2875	}
2876
2877	assert((TeamsReduction \|\| ParallelReduction) &&(static_cast <bool> ((TeamsReduction \|\| ParallelReduction ) && "Invalid reduction selection in emitReduction.") ? void (0) : __assert_fail ("(TeamsReduction \|\| ParallelReduction) && \"Invalid reduction selection in emitReduction.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 2878, __extension__ __PRETTY_FUNCTION__))
2878	"Invalid reduction selection in emitReduction.")(static_cast <bool> ((TeamsReduction \|\| ParallelReduction ) && "Invalid reduction selection in emitReduction.") ? void (0) : __assert_fail ("(TeamsReduction \|\| ParallelReduction) && \"Invalid reduction selection in emitReduction.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 2878, __extension__ __PRETTY_FUNCTION__));
2879
2880	// Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
2881	// RedList, shuffle_reduce_func, interwarp_copy_func);
2882	// or
2883	// Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
2884	llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2885	llvm::Value *ThreadId = getThreadID(CGF, Loc);
2886
2887	llvm::Value *Res;
2888	ASTContext &C = CGM.getContext();
2889	// 1. Build a list of reduction variables.
2890	// void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2891	auto Size = RHSExprs.size();
2892	for (const Expr *E : Privates) {
2893	if (E->getType()->isVariablyModifiedType())
2894	// Reserve place for array size.
2895	++Size;
2896	}
2897	llvm::APInt ArraySize(/unsigned int numBits=/32, Size);
2898	QualType ReductionArrayTy =
2899	C.getConstantArrayType(C.VoidPtrTy, ArraySize, nullptr, ArrayType::Normal,
2900	/IndexTypeQuals=/0);
2901	Address ReductionList =
2902	CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2903	auto IPriv = Privates.begin();
2904	unsigned Idx = 0;
2905	for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
2906	Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2907	CGF.Builder.CreateStore(
2908	CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2909	CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
2910	Elem);
2911	if ((*IPriv)->getType()->isVariablyModifiedType()) {
2912	// Store array size.
2913	++Idx;
2914	Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2915	llvm::Value *Size = CGF.Builder.CreateIntCast(
2916	CGF.getVLASize(
2917	CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2918	.NumElts,
2919	CGF.SizeTy, /isSigned=/false);
2920	CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2921	Elem);
2922	}
2923	}
2924
2925	llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2926	ReductionList.getPointer(), CGF.VoidPtrTy);
2927	llvm::Function *ReductionFn = emitReductionFunction(
2928	CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
2929	Privates, LHSExprs, RHSExprs, ReductionOps);
2930	llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
2931	llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
2932	CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
2933	llvm::Value *InterWarpCopyFn =
2934	emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
2935
2936	if (ParallelReduction) {
2937	llvm::Value *Args[] = {RTLoc,
2938	ThreadId,
2939	CGF.Builder.getInt32(RHSExprs.size()),
2940	ReductionArrayTySize,
2941	RL,
2942	ShuffleAndReduceFn,
2943	InterWarpCopyFn};
2944
2945	Res = CGF.EmitRuntimeCall(
2946	OMPBuilder.getOrCreateRuntimeFunction(
2947	CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
2948	Args);
2949	} else {
2950	assert(TeamsReduction && "expected teams reduction.")(static_cast <bool> (TeamsReduction && "expected teams reduction." ) ? void (0) : __assert_fail ("TeamsReduction && \"expected teams reduction.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 2950, __extension__ __PRETTY_FUNCTION__));
2951	llvm::SmallDenseMap<const ValueDecl , const FieldDecl > VarFieldMap;
2952	llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
2953	int Cnt = 0;
2954	for (const Expr *DRE : Privates) {
2955	PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
2956	++Cnt;
2957	}
2958	const RecordDecl *TeamReductionRec = ::buildRecordForGlobalizedVars(
2959	CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap,
2960	C.getLangOpts().OpenMPCUDAReductionBufNum);
2961	TeamsReductions.push_back(TeamReductionRec);
2962	if (!KernelTeamsReductionPtr) {
2963	KernelTeamsReductionPtr = new llvm::GlobalVariable(
2964	CGM.getModule(), CGM.VoidPtrTy, /isConstant=/true,
2965	llvm::GlobalValue::InternalLinkage, nullptr,
2966	"_openmp_teams_reductions_buffer_$_$ptr");
2967	}
2968	llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
2969	Address(KernelTeamsReductionPtr, CGF.VoidPtrTy, CGM.getPointerAlign()),
2970	/Volatile=/false, C.getPointerType(C.VoidPtrTy), Loc);
2971	llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
2972	CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
2973	llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
2974	CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
2975	ReductionFn);
2976	llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
2977	CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
2978	llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
2979	CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap,
2980	ReductionFn);
2981
2982	llvm::Value *Args[] = {
2983	RTLoc,
2984	ThreadId,
2985	GlobalBufferPtr,
2986	CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
2987	RL,
2988	ShuffleAndReduceFn,
2989	InterWarpCopyFn,
2990	GlobalToBufferCpyFn,
2991	GlobalToBufferRedFn,
2992	BufferToGlobalCpyFn,
2993	BufferToGlobalRedFn};
2994
2995	Res = CGF.EmitRuntimeCall(
2996	OMPBuilder.getOrCreateRuntimeFunction(
2997	CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
2998	Args);
2999	}
3000
3001	// 5. Build if (res == 1)
3002	llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
3003	llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
3004	llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
3005	Res, llvm::ConstantInt::get(CGM.Int32Ty, /V=/1));
3006	CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3007
3008	// 6. Build then branch: where we have reduced values in the master
3009	// thread in each team.
3010	// __kmpc_end_reduce{_nowait}(<gtid>);
3011	// break;
3012	CGF.EmitBlock(ThenBB);
3013
3014	// Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3015	auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
3016	this](CodeGenFunction &CGF, PrePostActionTy &Action) {
3017	auto IPriv = Privates.begin();
3018	auto ILHS = LHSExprs.begin();
3019	auto IRHS = RHSExprs.begin();
3020	for (const Expr *E : ReductionOps) {
3021	emitSingleReductionCombiner(CGF, E, IPriv, cast<DeclRefExpr>(ILHS),
3022	cast<DeclRefExpr>(*IRHS));
3023	++IPriv;
3024	++ILHS;
3025	++IRHS;
3026	}
3027	};
3028	llvm::Value *EndArgs[] = {ThreadId};
3029	RegionCodeGenTy RCG(CodeGen);
3030	NVPTXActionTy Action(
3031	nullptr, std::nullopt,
3032	OMPBuilder.getOrCreateRuntimeFunction(
3033	CGM.getModule(), OMPRTL___kmpc_nvptx_end_reduce_nowait),
3034	EndArgs);
3035	RCG.setAction(Action);
3036	RCG(CGF);
3037	// There is no need to emit line number for unconditional branch.
3038	(void)ApplyDebugLocation::CreateEmpty(CGF);
3039	CGF.EmitBlock(ExitBB, /IsFinished=/true);
3040	}
3041
3042	const VarDecl *
3043	CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
3044	const VarDecl *NativeParam) const {
3045	if (!NativeParam->getType()->isReferenceType())
3046	return NativeParam;
3047	QualType ArgType = NativeParam->getType();
3048	QualifierCollector QC;
3049	const Type *NonQualTy = QC.strip(ArgType);
3050	QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3051	if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
3052	if (Attr->getCaptureKind() == OMPC_map) {
3053	PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
3054	LangAS::opencl_global);
3055	}
3056	}
3057	ArgType = CGM.getContext().getPointerType(PointeeTy);
3058	QC.addRestrict();
3059	enum { NVPTX_local_addr = 5 };
3060	QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
3061	ArgType = QC.apply(CGM.getContext(), ArgType);
3062	if (isa<ImplicitParamDecl>(NativeParam))
3063	return ImplicitParamDecl::Create(
3064	CGM.getContext(), /DC=/nullptr, NativeParam->getLocation(),
3065	NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
3066	return ParmVarDecl::Create(
3067	CGM.getContext(),
3068	const_cast<DeclContext *>(NativeParam->getDeclContext()),
3069	NativeParam->getBeginLoc(), NativeParam->getLocation(),
3070	NativeParam->getIdentifier(), ArgType,
3071	/TInfo=/nullptr, SC_None, /DefArg=/nullptr);
3072	}
3073
3074	Address
3075	CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
3076	const VarDecl *NativeParam,
3077	const VarDecl *TargetParam) const {
3078	assert(NativeParam != TargetParam &&(static_cast <bool> (NativeParam != TargetParam && NativeParam->getType()->isReferenceType() && "Native arg must not be the same as target arg." ) ? void (0) : __assert_fail ("NativeParam != TargetParam && NativeParam->getType()->isReferenceType() && \"Native arg must not be the same as target arg.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3080, __extension__ __PRETTY_FUNCTION__))
3079	NativeParam->getType()->isReferenceType() &&(static_cast <bool> (NativeParam != TargetParam && NativeParam->getType()->isReferenceType() && "Native arg must not be the same as target arg." ) ? void (0) : __assert_fail ("NativeParam != TargetParam && NativeParam->getType()->isReferenceType() && \"Native arg must not be the same as target arg.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3080, __extension__ __PRETTY_FUNCTION__))
3080	"Native arg must not be the same as target arg.")(static_cast <bool> (NativeParam != TargetParam && NativeParam->getType()->isReferenceType() && "Native arg must not be the same as target arg." ) ? void (0) : __assert_fail ("NativeParam != TargetParam && NativeParam->getType()->isReferenceType() && \"Native arg must not be the same as target arg.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3080, __extension__ __PRETTY_FUNCTION__));
3081	Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
3082	QualType NativeParamType = NativeParam->getType();
3083	QualifierCollector QC;
3084	const Type *NonQualTy = QC.strip(NativeParamType);
3085	QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3086	unsigned NativePointeeAddrSpace =
3087	CGF.getTypes().getTargetAddressSpace(NativePointeeTy);
3088	QualType TargetTy = TargetParam->getType();
3089	llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
3090	LocalAddr, /Volatile=/false, TargetTy, SourceLocation());
3091	// First cast to generic.
3092	TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3093	TargetAddr, llvm::PointerType::getWithSamePointeeType(
3094	cast<llvm::PointerType>(TargetAddr->getType()), /AddrSpace=/0));
3095	// Cast from generic to native address space.
3096	TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3097	TargetAddr, llvm::PointerType::getWithSamePointeeType(
3098	cast<llvm::PointerType>(TargetAddr->getType()),
3099	NativePointeeAddrSpace));
3100	Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3101	CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /Volatile=/false,
3102	NativeParamType);
3103	return NativeParamAddr;
3104	}
3105
3106	void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3107	CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3108	ArrayRef<llvm::Value *> Args) const {
3109	SmallVector<llvm::Value *, 4> TargetArgs;
3110	TargetArgs.reserve(Args.size());
3111	auto *FnType = OutlinedFn.getFunctionType();
3112	for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3113	if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3114	TargetArgs.append(std::next(Args.begin(), I), Args.end());
3115	break;
3116	}
3117	llvm::Type *TargetType = FnType->getParamType(I);
3118	llvm::Value *NativeArg = Args[I];
3119	if (!TargetType->isPointerTy()) {
3120	TargetArgs.emplace_back(NativeArg);
3121	continue;
3122	}
3123	llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3124	NativeArg, llvm::PointerType::getWithSamePointeeType(
3125	cast<llvm::PointerType>(NativeArg->getType()), /AddrSpace/ 0));
3126	TargetArgs.emplace_back(
3127	CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
3128	}
3129	CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
3130	}
3131
3132	/// Emit function which wraps the outline parallel region
3133	/// and controls the arguments which are passed to this function.
3134	/// The wrapper ensures that the outlined function is called
3135	/// with the correct arguments when data is shared.
3136	llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3137	llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
3138	ASTContext &Ctx = CGM.getContext();
3139	const auto &CS = *D.getCapturedStmt(OMPD_parallel);
3140
3141	// Create a function that takes as argument the source thread.
3142	FunctionArgList WrapperArgs;
3143	QualType Int16QTy =
3144	Ctx.getIntTypeForBitwidth(/DestWidth=/16, /Signed=/false);
3145	QualType Int32QTy =
3146	Ctx.getIntTypeForBitwidth(/DestWidth=/32, /Signed=/false);
3147	ImplicitParamDecl ParallelLevelArg(Ctx, /DC=/nullptr, D.getBeginLoc(),
3148	/Id=/nullptr, Int16QTy,
3149	ImplicitParamDecl::Other);
3150	ImplicitParamDecl WrapperArg(Ctx, /DC=/nullptr, D.getBeginLoc(),
3151	/Id=/nullptr, Int32QTy,
3152	ImplicitParamDecl::Other);
3153	WrapperArgs.emplace_back(&ParallelLevelArg);
3154	WrapperArgs.emplace_back(&WrapperArg);
3155
3156	const CGFunctionInfo &CGFI =
3157	CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
3158
3159	auto *Fn = llvm::Function::Create(
3160	CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3161	Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
3162
3163	// Ensure we do not inline the function. This is trivially true for the ones
3164	// passed to __kmpc_fork_call but the ones calles in serialized regions
3165	// could be inlined. This is not a perfect but it is closer to the invariant
3166	// we want, namely, every data environment starts with a new function.
3167	// TODO: We should pass the if condition to the runtime function and do the
3168	// handling there. Much cleaner code.
3169	Fn->addFnAttr(llvm::Attribute::NoInline);
3170
3171	CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3172	Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
3173	Fn->setDoesNotRecurse();
3174
3175	CodeGenFunction CGF(CGM, /suppressNewContext=/true);
3176	CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
3177	D.getBeginLoc(), D.getBeginLoc());
3178
3179	const auto *RD = CS.getCapturedRecordDecl();
3180	auto CurField = RD->field_begin();
3181
3182	Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
3183	/Name=/".zero.addr");
3184	CGF.Builder.CreateStore(CGF.Builder.getInt32(/C/ 0), ZeroAddr);
3185	// Get the array of arguments.
3186	SmallVector<llvm::Value *, 8> Args;
3187
3188	Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
3189	Args.emplace_back(ZeroAddr.getPointer());
3190
3191	CGBuilderTy &Bld = CGF.Builder;
3192	auto CI = CS.capture_begin();
3193
3194	// Use global memory for data sharing.
3195	// Handle passing of global args to workers.
3196	Address GlobalArgs =
3197	CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
3198	llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
3199	llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3200	CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3201	CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
3202	DataSharingArgs);
3203
3204	// Retrieve the shared variables from the list of references returned
3205	// by the runtime. Pass the variables to the outlined function.
3206	Address SharedArgListAddress = Address::invalid();
3207	if (CS.capture_size() > 0 \|\|
3208	isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3209	SharedArgListAddress = CGF.EmitLoadOfPointer(
3210	GlobalArgs, CGF.getContext()
3211	.getPointerType(CGF.getContext().VoidPtrTy)
3212	.castAs<PointerType>());
3213	}
3214	unsigned Idx = 0;
3215	if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3216	Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3217	Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3218	Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3219	llvm::Value *LB = CGF.EmitLoadOfScalar(
3220	TypedAddress,
3221	/Volatile=/false,
3222	CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
3223	cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
3224	Args.emplace_back(LB);
3225	++Idx;
3226	Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3227	TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3228	Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3229	llvm::Value *UB = CGF.EmitLoadOfScalar(
3230	TypedAddress,
3231	/Volatile=/false,
3232	CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
3233	cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
3234	Args.emplace_back(UB);
3235	++Idx;
3236	}
3237	if (CS.capture_size() > 0) {
3238	ASTContext &CGFContext = CGF.getContext();
3239	for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3240	QualType ElemTy = CurField->getType();
3241	Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
3242	Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3243	Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
3244	CGF.ConvertTypeForMem(ElemTy));
3245	llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3246	/Volatile=/false,
3247	CGFContext.getPointerType(ElemTy),
3248	CI->getLocation());
3249	if (CI->capturesVariableByCopy() &&
3250	!CI->getCapturedVar()->getType()->isAnyPointerType()) {
3251	Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3252	CI->getLocation());
3253	}
3254	Args.emplace_back(Arg);
3255	}
3256	}
3257
3258	emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
3259	CGF.FinishFunction();
3260	return Fn;
3261	}
3262
3263	void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
3264	const Decl *D) {
3265	if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3266	return;
3267
3268	assert(D && "Expected function or captured\|block decl.")(static_cast <bool> (D && "Expected function or captured\|block decl." ) ? void (0) : __assert_fail ("D && \"Expected function or captured\|block decl.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3268, __extension__ __PRETTY_FUNCTION__));
3269	assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&(static_cast <bool> (FunctionGlobalizedDecls.count(CGF. CurFn) == 0 && "Function is registered already.") ? void (0) : __assert_fail ("FunctionGlobalizedDecls.count(CGF.CurFn) == 0 && \"Function is registered already.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3270, __extension__ __PRETTY_FUNCTION__))
3270	"Function is registered already.")(static_cast <bool> (FunctionGlobalizedDecls.count(CGF. CurFn) == 0 && "Function is registered already.") ? void (0) : __assert_fail ("FunctionGlobalizedDecls.count(CGF.CurFn) == 0 && \"Function is registered already.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3270, __extension__ __PRETTY_FUNCTION__));
3271	assert((!TeamAndReductions.first \|\| TeamAndReductions.first == D) &&(static_cast <bool> ((!TeamAndReductions.first \|\| TeamAndReductions .first == D) && "Team is set but not processed.") ? void (0) : __assert_fail ("(!TeamAndReductions.first \|\| TeamAndReductions.first == D) && \"Team is set but not processed.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3272, __extension__ __PRETTY_FUNCTION__))
3272	"Team is set but not processed.")(static_cast <bool> ((!TeamAndReductions.first \|\| TeamAndReductions .first == D) && "Team is set but not processed.") ? void (0) : __assert_fail ("(!TeamAndReductions.first \|\| TeamAndReductions.first == D) && \"Team is set but not processed.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3272, __extension__ __PRETTY_FUNCTION__));
3273	const Stmt *Body = nullptr;
3274	bool NeedToDelayGlobalization = false;
3275	if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
3276	Body = FD->getBody();
3277	} else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
3278	Body = BD->getBody();
3279	} else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
3280	Body = CD->getBody();
3281	NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
3282	if (NeedToDelayGlobalization &&
3283	getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
3284	return;
3285	}
3286	if (!Body)
3287	return;
3288	CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
3289	VarChecker.Visit(Body);
3290	const RecordDecl *GlobalizedVarsRecord =
3291	VarChecker.getGlobalizedRecord(IsInTTDRegion);
3292	TeamAndReductions.first = nullptr;
3293	TeamAndReductions.second.clear();
3294	ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
3295	VarChecker.getEscapedVariableLengthDecls();
3296	if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
3297	return;
3298	auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
3299	I->getSecond().MappedParams =
3300	std::make_unique<CodeGenFunction::OMPMapVars>();
3301	I->getSecond().EscapedParameters.insert(
3302	VarChecker.getEscapedParameters().begin(),
3303	VarChecker.getEscapedParameters().end());
3304	I->getSecond().EscapedVariableLengthDecls.append(
3305	EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
3306	DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3307	for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3308	assert(VD->isCanonicalDecl() && "Expected canonical declaration")(static_cast <bool> (VD->isCanonicalDecl() && "Expected canonical declaration") ? void (0) : __assert_fail ("VD->isCanonicalDecl() && \"Expected canonical declaration\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3308, __extension__ __PRETTY_FUNCTION__));
3309	Data.insert(std::make_pair(VD, MappedVarData()));
3310	}
3311	if (!NeedToDelayGlobalization) {
3312	emitGenericVarsProlog(CGF, D->getBeginLoc(), /WithSPMDCheck=/true);
3313	struct GlobalizationScope final : EHScopeStack::Cleanup {
3314	GlobalizationScope() = default;
3315
3316	void Emit(CodeGenFunction &CGF, Flags flags) override {
3317	static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
3318	.emitGenericVarsEpilog(CGF, /WithSPMDCheck=/true);
3319	}
3320	};
3321	CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
3322	}
3323	}
3324
3325	Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
3326	const VarDecl *VD) {
3327	if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
3328	const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3329	auto AS = LangAS::Default;
3330	switch (A->getAllocatorType()) {
3331	// Use the default allocator here as by default local vars are
3332	// threadlocal.
3333	case OMPAllocateDeclAttr::OMPNullMemAlloc:
3334	case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3335	case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3336	case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3337	case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3338	// Follow the user decision - use default allocation.
3339	return Address::invalid();
3340	case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3341	// TODO: implement aupport for user-defined allocators.
3342	return Address::invalid();
3343	case OMPAllocateDeclAttr::OMPConstMemAlloc:
3344	AS = LangAS::cuda_constant;
3345	break;
3346	case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3347	AS = LangAS::cuda_shared;
3348	break;
3349	case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3350	case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3351	break;
3352	}
3353	llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
3354	auto *GV = new llvm::GlobalVariable(
3355	CGM.getModule(), VarTy, /isConstant=/false,
3356	llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy),
3357	VD->getName(),
3358	/InsertBefore=/nullptr, llvm::GlobalValue::NotThreadLocal,
3359	CGM.getContext().getTargetAddressSpace(AS));
3360	CharUnits Align = CGM.getContext().getDeclAlign(VD);
3361	GV->setAlignment(Align.getAsAlign());
3362	return Address(
3363	CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3364	GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
3365	VD->getType().getAddressSpace()))),
3366	VarTy, Align);
3367	}
3368
3369	if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
3370	return Address::invalid();
3371
3372	VD = VD->getCanonicalDecl();
3373	auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
3374	if (I == FunctionGlobalizedDecls.end())
3375	return Address::invalid();
3376	auto VDI = I->getSecond().LocalVarData.find(VD);
3377	if (VDI != I->getSecond().LocalVarData.end())
3378	return VDI->second.PrivateAddr;
3379	if (VD->hasAttrs()) {
3380	for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
3381	E(VD->attr_end());
3382	IT != E; ++IT) {
3383	auto VDI = I->getSecond().LocalVarData.find(
3384	cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3385	->getCanonicalDecl());
3386	if (VDI != I->getSecond().LocalVarData.end())
3387	return VDI->second.PrivateAddr;
3388	}
3389	}
3390
3391	return Address::invalid();
3392	}
3393
3394	void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
3395	FunctionGlobalizedDecls.erase(CGF.CurFn);
3396	CGOpenMPRuntime::functionFinished(CGF);
3397	}
3398
3399	void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
3400	CodeGenFunction &CGF, const OMPLoopDirective &S,
3401	OpenMPDistScheduleClauseKind &ScheduleKind,
3402	llvm::Value *&Chunk) const {
3403	auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
3404	if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
3405	ScheduleKind = OMPC_DIST_SCHEDULE_static;
3406	Chunk = CGF.EmitScalarConversion(
3407	RT.getGPUNumThreads(CGF),
3408	CGF.getContext().getIntTypeForBitwidth(32, /Signed=/0),
3409	S.getIterationVariable()->getType(), S.getBeginLoc());
3410	return;
3411	}
3412	CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
3413	CGF, S, ScheduleKind, Chunk);
3414	}
3415
3416	void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
3417	CodeGenFunction &CGF, const OMPLoopDirective &S,
3418	OpenMPScheduleClauseKind &ScheduleKind,
3419	const Expr *&ChunkExpr) const {
3420	ScheduleKind = OMPC_SCHEDULE_static;
3421	// Chunk size is 1 in this case.
3422	llvm::APInt ChunkSize(32, 1);
3423	ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
3424	CGF.getContext().getIntTypeForBitwidth(32, /Signed=/0),
3425	SourceLocation());
3426	}
3427
3428	void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
3429	CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
3430	assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&(static_cast <bool> (isOpenMPTargetExecutionDirective(D .getDirectiveKind()) && " Expected target-based directive." ) ? void (0) : __assert_fail ("isOpenMPTargetExecutionDirective(D.getDirectiveKind()) && \" Expected target-based directive.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3431, __extension__ __PRETTY_FUNCTION__))
3431	" Expected target-based directive.")(static_cast <bool> (isOpenMPTargetExecutionDirective(D .getDirectiveKind()) && " Expected target-based directive." ) ? void (0) : __assert_fail ("isOpenMPTargetExecutionDirective(D.getDirectiveKind()) && \" Expected target-based directive.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3431, __extension__ __PRETTY_FUNCTION__));
3432	const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
3433	for (const CapturedStmt::Capture &C : CS->captures()) {
3434	// Capture variables captured by reference in lambdas for target-based
3435	// directives.
3436	if (!C.capturesVariable())
3437	continue;
3438	const VarDecl *VD = C.getCapturedVar();
3439	const auto *RD = VD->getType()
3440	.getCanonicalType()
3441	.getNonReferenceType()
3442	->getAsCXXRecordDecl();
3443	if (!RD \|\| !RD->isLambda())
3444	continue;
3445	Address VDAddr = CGF.GetAddrOfLocalVar(VD);
3446	LValue VDLVal;
3447	if (VD->getType().getCanonicalType()->isReferenceType())
3448	VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
3449	else
3450	VDLVal = CGF.MakeAddrLValue(
3451	VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
3452	llvm::DenseMap<const ValueDecl , FieldDecl > Captures;
3453	FieldDecl *ThisCapture = nullptr;
3454	RD->getCaptureFields(Captures, ThisCapture);
3455	if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
3456	LValue ThisLVal =
3457	CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
3458	llvm::Value *CXXThis = CGF.LoadCXXThis();
3459	CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
3460	}
3461	for (const LambdaCapture &LC : RD->captures()) {
3462	if (LC.getCaptureKind() != LCK_ByRef)
3463	continue;
3464	const ValueDecl *VD = LC.getCapturedVar();
3465	// FIXME: For now VD is always a VarDecl because OpenMP does not support
3466	// capturing structured bindings in lambdas yet.
3467	if (!CS->capturesVariable(cast<VarDecl>(VD)))
3468	continue;
3469	auto It = Captures.find(VD);
3470	assert(It != Captures.end() && "Found lambda capture without field.")(static_cast <bool> (It != Captures.end() && "Found lambda capture without field." ) ? void (0) : __assert_fail ("It != Captures.end() && \"Found lambda capture without field.\"" , "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp", 3470, __extension__ __PRETTY_FUNCTION__));
3471	LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
3472	Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
3473	if (VD->getType().getCanonicalType()->isReferenceType())
3474	VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
3475	VD->getType().getCanonicalType())
3476	.getAddress(CGF);
3477	CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
3478	}
3479	}
3480	}
3481
3482	bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
3483	LangAS &AS) {
3484	if (!VD \|\| !VD->hasAttr<OMPAllocateDeclAttr>())
3485	return false;
3486	const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3487	switch(A->getAllocatorType()) {
3488	case OMPAllocateDeclAttr::OMPNullMemAlloc:
3489	case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3490	// Not supported, fallback to the default mem space.
3491	case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3492	case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3493	case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3494	case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3495	case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3496	AS = LangAS::Default;
3497	return true;
3498	case OMPAllocateDeclAttr::OMPConstMemAlloc:
3499	AS = LangAS::cuda_constant;
3500	return true;
3501	case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3502	AS = LangAS::cuda_shared;
3503	return true;
3504	case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3505	llvm_unreachable("Expected predefined allocator for the variables with the "::llvm::llvm_unreachable_internal("Expected predefined allocator for the variables with the " "static storage.", "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp" , 3506)
3506	"static storage.")::llvm::llvm_unreachable_internal("Expected predefined allocator for the variables with the " "static storage.", "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp" , 3506);
3507	}
3508	return false;
3509	}
3510
3511	// Get current CudaArch and ignore any unknown values
3512	static CudaArch getCudaArch(CodeGenModule &CGM) {
3513	if (!CGM.getTarget().hasFeature("ptx"))
3514	return CudaArch::UNKNOWN;
3515	for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
3516	if (Feature.getValue()) {
3517	CudaArch Arch = StringToCudaArch(Feature.getKey());
3518	if (Arch != CudaArch::UNKNOWN)
3519	return Arch;
3520	}
3521	}
3522	return CudaArch::UNKNOWN;
3523	}
3524
3525	/// Check to see if target architecture supports unified addressing which is
3526	/// a restriction for OpenMP requires clause "unified_shared_memory".
3527	void CGOpenMPRuntimeGPU::processRequiresDirective(
3528	const OMPRequiresDecl *D) {
3529	for (const OMPClause *Clause : D->clauselists()) {
3530	if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
3531	CudaArch Arch = getCudaArch(CGM);
3532	switch (Arch) {
3533	case CudaArch::SM_20:
3534	case CudaArch::SM_21:
3535	case CudaArch::SM_30:
3536	case CudaArch::SM_32:
3537	case CudaArch::SM_35:
3538	case CudaArch::SM_37:
3539	case CudaArch::SM_50:
3540	case CudaArch::SM_52:
3541	case CudaArch::SM_53: {
3542	SmallString<256> Buffer;
3543	llvm::raw_svector_ostream Out(Buffer);
3544	Out << "Target architecture " << CudaArchToString(Arch)
3545	<< " does not support unified addressing";
3546	CGM.Error(Clause->getBeginLoc(), Out.str());
3547	return;
3548	}
3549	case CudaArch::SM_60:
3550	case CudaArch::SM_61:
3551	case CudaArch::SM_62:
3552	case CudaArch::SM_70:
3553	case CudaArch::SM_72:
3554	case CudaArch::SM_75:
3555	case CudaArch::SM_80:
3556	case CudaArch::SM_86:
3557	case CudaArch::SM_87:
3558	case CudaArch::SM_89:
3559	case CudaArch::SM_90:
3560	case CudaArch::GFX600:
3561	case CudaArch::GFX601:
3562	case CudaArch::GFX602:
3563	case CudaArch::GFX700:
3564	case CudaArch::GFX701:
3565	case CudaArch::GFX702:
3566	case CudaArch::GFX703:
3567	case CudaArch::GFX704:
3568	case CudaArch::GFX705:
3569	case CudaArch::GFX801:
3570	case CudaArch::GFX802:
3571	case CudaArch::GFX803:
3572	case CudaArch::GFX805:
3573	case CudaArch::GFX810:
3574	case CudaArch::GFX900:
3575	case CudaArch::GFX902:
3576	case CudaArch::GFX904:
3577	case CudaArch::GFX906:
3578	case CudaArch::GFX908:
3579	case CudaArch::GFX909:
3580	case CudaArch::GFX90a:
3581	case CudaArch::GFX90c:
3582	case CudaArch::GFX940:
3583	case CudaArch::GFX1010:
3584	case CudaArch::GFX1011:
3585	case CudaArch::GFX1012:
3586	case CudaArch::GFX1013:
3587	case CudaArch::GFX1030:
3588	case CudaArch::GFX1031:
3589	case CudaArch::GFX1032:
3590	case CudaArch::GFX1033:
3591	case CudaArch::GFX1034:
3592	case CudaArch::GFX1035:
3593	case CudaArch::GFX1036:
3594	case CudaArch::GFX1100:
3595	case CudaArch::GFX1101:
3596	case CudaArch::GFX1102:
3597	case CudaArch::GFX1103:
3598	case CudaArch::Generic:
3599	case CudaArch::UNUSED:
3600	case CudaArch::UNKNOWN:
3601	break;
3602	case CudaArch::LAST:
3603	llvm_unreachable("Unexpected Cuda arch.")::llvm::llvm_unreachable_internal("Unexpected Cuda arch.", "clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp" , 3603);
3604	}
3605	}
3606	}
3607	CGOpenMPRuntime::processRequiresDirective(D);
3608	}
3609
3610	void CGOpenMPRuntimeGPU::clear() {
3611
3612	if (!TeamsReductions.empty()) {
3613	ASTContext &C = CGM.getContext();
3614	RecordDecl *StaticRD = C.buildImplicitRecord(
3615	"_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
3616	StaticRD->startDefinition();
3617	for (const RecordDecl *TeamReductionRec : TeamsReductions) {
3618	QualType RecTy = C.getRecordType(TeamReductionRec);
3619	auto *Field = FieldDecl::Create(
3620	C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
3621	C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
3622	/BW=/nullptr, /Mutable=/false,
3623	/InitStyle=/ICIS_NoInit);
3624	Field->setAccess(AS_public);
3625	StaticRD->addDecl(Field);
3626	}
3627	StaticRD->completeDefinition();
3628	QualType StaticTy = C.getRecordType(StaticRD);
3629	llvm::Type *LLVMReductionsBufferTy =
3630	CGM.getTypes().ConvertTypeForMem(StaticTy);
3631	// FIXME: nvlink does not handle weak linkage correctly (object with the
3632	// different size are reported as erroneous).
3633	// Restore CommonLinkage as soon as nvlink is fixed.
3634	auto *GV = new llvm::GlobalVariable(
3635	CGM.getModule(), LLVMReductionsBufferTy,
3636	/isConstant=/false, llvm::GlobalValue::InternalLinkage,
3637	llvm::Constant::getNullValue(LLVMReductionsBufferTy),
3638	"_openmp_teams_reductions_buffer_$_");
3639	KernelTeamsReductionPtr->setInitializer(
3640	llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
3641	CGM.VoidPtrTy));
3642	}
3643	CGOpenMPRuntime::clear();
3644	}
3645
3646	llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
3647	CGBuilderTy &Bld = CGF.Builder;
3648	llvm::Module *M = &CGF.CGM.getModule();
3649	const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
3650	llvm::Function *F = M->getFunction(LocSize);
3651	if (!F) {
3652	F = llvm::Function::Create(
3653	llvm::FunctionType::get(CGF.Int32Ty, std::nullopt, false),
3654	llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
3655	}
3656	return Bld.CreateCall(F, std::nullopt, "nvptx_num_threads");
3657	}
3658
3659	llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
3660	ArrayRef<llvm::Value *> Args{};
3661	return CGF.EmitRuntimeCall(
3662	OMPBuilder.getOrCreateRuntimeFunction(
3663	CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
3664	Args);
3665	}
3666
3667	llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
3668	ArrayRef<llvm::Value *> Args{};
3669	return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3670	CGM.getModule(), OMPRTL___kmpc_get_warp_size),
3671	Args);
3672	}