Bug Summary

File: build/source/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
Warning: line 2383, column 9
Value stored to 'Err' is never read
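
The 'deadcode' checker group enabled in the invocation below includes the dead-store check, which reports an assignment whose result is never read before the variable is overwritten or goes out of scope. A minimal illustrative sketch of the pattern (a hypothetical helper using calls that appear in this file, not the code at line 2383):

    static void example(hsa_queue_t *Queue) {
      hsa_status_t Err = hsa_queue_destroy(Queue);
      if (Err != HSA_STATUS_SUCCESS)        // this read of Err is fine
        DP("Error destroying hsa queue\n");
      Err = hsa_shut_down();                // flagged: value stored to 'Err' is never read
    }

A common fix is to check the second status as well, or to cast the call to (void) when the result is deliberately ignored.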

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name rtl.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-16/lib/clang/16.0.0 -D TARGET_NAME=AMDGPU -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I projects/openmp/libomptarget/plugins/amdgpu -I /build/source/openmp/libomptarget/plugins/amdgpu -I include -I /build/source/llvm/include -I /build/source/openmp/libomptarget/plugins/amdgpu/dynamic_hsa -I /build/source/openmp/libomptarget/include -I /build/source/openmp/libomptarget/plugins/amdgpu/impl -I /build/source/openmp/libomptarget/plugins/common/elf_common -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-16/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/source/= -source-date-epoch 1668078801 -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -Wno-enum-constexpr-conversion -Wno-extra -Wno-pedantic -Wno-maybe-uninitialized -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=protected -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-11-10-135928-647445-1 -x c++ /build/source/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
1//===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// RTL for AMD hsa machine
10//
11//===----------------------------------------------------------------------===//
12
13#include "llvm/ADT/StringMap.h"
14#include "llvm/ADT/StringRef.h"
15#include "llvm/Frontend/OpenMP/OMPConstants.h"
16#include "llvm/Frontend/OpenMP/OMPGridValues.h"
17#include "llvm/Object/ELF.h"
18#include "llvm/Object/ELFObjectFile.h"
19
20#include <algorithm>
21#include <assert.h>
22#include <cstdio>
23#include <cstdlib>
24#include <cstring>
25#include <functional>
26#include <list>
27#include <memory>
28#include <mutex>
29#include <shared_mutex>
30#include <unordered_map>
31#include <vector>
32
33#include "ELFSymbols.h"
34#include "impl_runtime.h"
35#include "interop_hsa.h"
36
37#include "internal.h"
38#include "rt.h"
39
40#include "DeviceEnvironment.h"
41#include "get_elf_mach_gfx_name.h"
42#include "omptargetplugin.h"
43#include "print_tracing.h"
44
45using namespace llvm;
46using namespace llvm::object;
47using namespace llvm::ELF;
48
49// hostrpc interface, FIXME: consider moving to its own include these are
50// statically linked into amdgpu/plugin if present from hostrpc_services.a,
51// linked as --whole-archive to override the weak symbols that are used to
52// implement a fallback for toolchains that do not yet have a hostrpc library.
53extern "C" {
54uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ,
55 uint32_t DeviceId);
56hsa_status_t hostrpc_init();
57hsa_status_t hostrpc_terminate();
58
59__attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; }
60__attribute__((weak)) hsa_status_t hostrpc_terminate() {
61 return HSA_STATUS_SUCCESS;
62}
63__attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *,
64 uint32_t DeviceId) {
65 DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "{}
66 "missing\n",{}
67 DeviceId){};
68 return 0;
69}
70}
71
72// Heuristic parameters used for kernel launch
73// Number of teams per CU to allow scheduling flexibility
74static const unsigned DefaultTeamsPerCU = 4;
75
76int print_kernel_trace;
77
78#ifdef OMPTARGET_DEBUG
79#define check(msg, status){} \
80 if (status != HSA_STATUS_SUCCESS) { \
81 DP(#msg " failed\n"){}; \
82 } else { \
83 DP(#msg " succeeded\n"){}; \
84 }
85#else
86#define check(msg, status){} \
87 {}
88#endif
89
90#include "elf_common.h"
91
92namespace hsa {
93template <typename C> hsa_status_t iterate_agents(C Cb) {
94 auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t {
95 C *Unwrapped = static_cast<C *>(Data);
96 return (*Unwrapped)(Agent);
97 };
98 return hsa_iterate_agents(L, static_cast<void *>(&Cb));
99}
100
101template <typename C>
102hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) {
103 auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t {
104 C *Unwrapped = static_cast<C *>(Data);
105 return (*Unwrapped)(MemoryPool);
106 };
107
108 return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&Cb));
109}
110
111} // namespace hsa
112
113/// Keep entries table per device
114struct FuncOrGblEntryTy {
115 __tgt_target_table Table;
116 std::vector<__tgt_offload_entry> Entries;
117};
118
119struct KernelArgPool {
120private:
121 static pthread_mutex_t Mutex;
122
123public:
124 uint32_t KernargSegmentSize;
125 void *KernargRegion = nullptr;
126 std::queue<int> FreeKernargSegments;
127
128 uint32_t kernargSizeIncludingImplicit() {
129 return KernargSegmentSize + sizeof(impl_implicit_args_t);
130 }
131
132 ~KernelArgPool() {
133 if (KernargRegion) {
134 auto R = hsa_amd_memory_pool_free(KernargRegion);
135 if (R != HSA_STATUS_SUCCESS) {
136 DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R)){};
137 }
138 }
139 }
140
141 // Can't really copy or move a mutex
142 KernelArgPool() = default;
143 KernelArgPool(const KernelArgPool &) = delete;
144 KernelArgPool(KernelArgPool &&) = delete;
145
146 KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
147 : KernargSegmentSize(KernargSegmentSize) {
148
149 // impl uses one pool per kernel for all gpus, with a fixed upper size
150 // preserving that exact scheme here, including the queue<int>
151
152 hsa_status_t Err = hsa_amd_memory_pool_allocate(
153 MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS(1024 * 16), 0,
154 &KernargRegion);
155
156 if (Err != HSA_STATUS_SUCCESS) {
157 DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err)){};
158 KernargRegion = nullptr; // paranoid
159 return;
160 }
161
162 Err = core::allow_access_to_all_gpu_agents(KernargRegion);
163 if (Err != HSA_STATUS_SUCCESS) {
164 DP("hsa allow_access_to_all_gpu_agents failed: %s\n",{}
165 get_error_string(Err)){};
166 auto R = hsa_amd_memory_pool_free(KernargRegion);
167 if (R != HSA_STATUS_SUCCESS) {
168 // if free failed, can't do anything more to resolve it
169 DP("hsa memory poll free failed: %s\n", get_error_string(Err)){};
170 }
171 KernargRegion = nullptr;
172 return;
173 }
174
175 for (int I = 0; I < MAX_NUM_KERNELS(1024 * 16); I++) {
176 FreeKernargSegments.push(I);
177 }
178 }
179
180 void *allocate(uint64_t ArgNum) {
181 assert((ArgNum * sizeof(void *)) == KernargSegmentSize);
182 Lock L(&Mutex);
183 void *Res = nullptr;
184 if (!FreeKernargSegments.empty()) {
185
186 int FreeIdx = FreeKernargSegments.front();
187 Res = static_cast<void *>(static_cast<char *>(KernargRegion) +
188 (FreeIdx * kernargSizeIncludingImplicit()));
189 assert(FreeIdx == pointerToIndex(Res));
190 FreeKernargSegments.pop();
191 }
192 return Res;
193 }
194
195 void deallocate(void *Ptr) {
196 Lock L(&Mutex);
197 int Idx = pointerToIndex(Ptr);
198 FreeKernargSegments.push(Idx);
199 }
200
201private:
202 int pointerToIndex(void *Ptr) {
203 ptrdiff_t Bytes =
204 static_cast<char *>(Ptr) - static_cast<char *>(KernargRegion);
205 assert(Bytes >= 0);
206 assert(Bytes % kernargSizeIncludingImplicit() == 0);
207 return Bytes / kernargSizeIncludingImplicit();
208 }
209 struct Lock {
210 Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); }
211 ~Lock() { pthread_mutex_unlock(M); }
212 pthread_mutex_t *M;
213 };
214};
215pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER;
216
217std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
218 KernelArgPoolMap;
219
220/// Use a single entity to encode a kernel and a set of flags
221struct KernelTy {
222 llvm::omp::OMPTgtExecModeFlags ExecutionMode;
223 int16_t ConstWGSize;
224 int32_t DeviceId;
225 void *CallStackAddr = nullptr;
226 const char *Name;
227
228 KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
229 int32_t DeviceId, void *CallStackAddr, const char *Name,
230 uint32_t KernargSegmentSize,
231 hsa_amd_memory_pool_t &KernArgMemoryPool)
232 : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
233 DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
234 DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode){};
235
236 std::string N(Name);
237 if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
238 KernelArgPoolMap.insert(
239 std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
240 KernargSegmentSize, KernArgMemoryPool))));
241 }
242 }
243};
244
245/// List that contains all the kernels.
246/// FIXME: we may need this to be per device and per library.
247std::list<KernelTy> KernelsList;
248
249template <typename Callback> static hsa_status_t findAgents(Callback CB) {
250
251 hsa_status_t Err =
252 hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t {
253 hsa_device_type_t DeviceType;
254 // get_info fails iff HSA runtime not yet initialized
255 hsa_status_t Err =
256 hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
257
258 if (Err != HSA_STATUS_SUCCESS) {
259 if (print_kernel_trace > 0)
260 DP("rtl.cpp: err %s\n", get_error_string(Err)){};
261
262 return Err;
263 }
264
265 CB(DeviceType, Agent);
266 return HSA_STATUS_SUCCESS;
267 });
268
269 // iterate_agents fails iff HSA runtime not yet initialized
270 if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) {
271 DP("rtl.cpp: err %s\n", get_error_string(Err)){};
272 }
273
274 return Err;
275}
276
277static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source,
278 void *Data) {
279 if (Status != HSA_STATUS_SUCCESS) {
280 const char *StatusString;
281 if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) {
282 StatusString = "unavailable";
283 }
284 DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source,{}
285 Status, StatusString){};
286 abort();
287 }
288}
289
290namespace core {
291namespace {
292
293bool checkResult(hsa_status_t Err, const char *ErrMsg) {
294 if (Err == HSA_STATUS_SUCCESS)
295 return true;
296
297 REPORT("%s", ErrMsg);
298 REPORT("%s", get_error_string(Err));
299 return false;
300}
301
302void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) {
303 __atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE);
304}
305
306uint16_t createHeader() {
307 uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
308 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
309 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
310 return Header;
311}
312
313hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) {
314 bool AllocAllowed = false;
315 hsa_status_t Err = hsa_amd_memory_pool_get_info(
316 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
317 &AllocAllowed);
318 if (Err != HSA_STATUS_SUCCESS) {
319 DP("Alloc allowed in memory pool check failed: %s\n",{}
320 get_error_string(Err)){};
321 return Err;
322 }
323
324 size_t Size = 0;
325 Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
326 &Size);
327 if (Err != HSA_STATUS_SUCCESS) {
328 DP("Get memory pool size failed: %s\n", get_error_string(Err)){};
329 return Err;
330 }
331
332 return (AllocAllowed && Size > 0) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
333}
334
335hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
336 std::vector<hsa_amd_memory_pool_t> *Result =
337 static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data);
338
339 hsa_status_t Err;
340 if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) {
341 return Err;
342 }
343
344 Result->push_back(MemoryPool);
345 return HSA_STATUS_SUCCESS;
346}
347
348} // namespace
349} // namespace core
350
351struct EnvironmentVariables {
352 int NumTeams;
353 int TeamLimit;
354 int TeamThreadLimit;
355 int MaxTeamsDefault;
356 int DynamicMemSize;
357};
358
359template <uint32_t wavesize>
360static constexpr const llvm::omp::GV &getGridValue() {
361 return llvm::omp::getAMDGPUGridValues<wavesize>();
362}
363
364struct HSALifetime {
365 // Wrapper around HSA used to ensure it is constructed before other types
366 // and destructed after, which means said other types can use raii for
367 // cleanup without risking running outside of the lifetime of HSA
368 const hsa_status_t S;
369
370 bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; }
371 HSALifetime() : S(hsa_init()) {}
372
373 ~HSALifetime() {
374 if (S == HSA_STATUS_SUCCESS) {
375 hsa_status_t Err = hsa_shut_down();
376 if (Err != HSA_STATUS_SUCCESS) {
377 // Can't call into HSA to get a string from the integer
378 DP("Shutting down HSA failed: %d\n", Err){};
379 }
380 }
381 }
382};
383
384// Handle scheduling of multiple hsa_queue's per device to
385// multiple threads (one scheduler per device)
386class HSAQueueScheduler {
387public:
388 HSAQueueScheduler() : Current(0) {}
389
390 HSAQueueScheduler(const HSAQueueScheduler &) = delete;
391
392 HSAQueueScheduler(HSAQueueScheduler &&Q) {
393 Current = Q.Current.load();
394 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
395 HSAQueues[I] = Q.HSAQueues[I];
396 Q.HSAQueues[I] = nullptr;
397 }
398 }
399
400 // \return false if any HSA queue creation fails
401 bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) {
402 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
403 hsa_queue_t *Q = nullptr;
404 hsa_status_t Rc =
405 hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI,
406 callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
407 if (Rc != HSA_STATUS_SUCCESS) {
408 DP("Failed to create HSA queue %d\n", I){};
409 return false;
410 }
411 HSAQueues[I] = Q;
412 }
413 return true;
414 }
415
416 ~HSAQueueScheduler() {
417 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
418 if (HSAQueues[I]) {
419 hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]);
420 if (Err != HSA_STATUS_SUCCESS)
421 DP("Error destroying HSA queue"){};
422 }
423 }
424 }
425
426 // \return next queue to use for device
427 hsa_queue_t *next() {
428 return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) %
429 NUM_QUEUES_PER_DEVICE];
430 }
431
432private:
433 // Number of queues per device
434 enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 };
435 hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {};
436 std::atomic<uint8_t> Current;
437};
438
439/// Class containing all the device information
440class RTLDeviceInfoTy : HSALifetime {
441 std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
442
443 struct QueueDeleter {
444 void operator()(hsa_queue_t *Q) {
445 if (Q) {
446 hsa_status_t Err = hsa_queue_destroy(Q);
447 if (Err != HSA_STATUS_SUCCESS) {
448 DP("Error destroying hsa queue: %s\n", get_error_string(Err)){};
449 }
450 }
451 }
452 };
453
454public:
455 bool ConstructionSucceeded = false;
456
457 // load binary populates symbol tables and mutates various global state
458 // run uses those symbol tables
459 std::shared_timed_mutex LoadRunLock;
460
461 int NumberOfDevices = 0;
462
463 // GPU devices
464 std::vector<hsa_agent_t> HSAAgents;
465 std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu
466
467 // CPUs
468 std::vector<hsa_agent_t> CPUAgents;
469
470 // Device properties
471 std::vector<int> ComputeUnits;
472 std::vector<int> GroupsPerDevice;
473 std::vector<int> ThreadsPerGroup;
474 std::vector<int> WarpSize;
475 std::vector<std::string> GPUName;
476 std::vector<std::string> TargetID;
477
478 // OpenMP properties
479 std::vector<int> NumTeams;
480 std::vector<int> NumThreads;
481
482 // OpenMP Environment properties
483 EnvironmentVariables Env;
484
485 // OpenMP Requires Flags
486 int64_t RequiresFlags;
487
488 // Resource pools
489 SignalPoolT FreeSignalPool;
490
491 bool HostcallRequired = false;
492
493 std::vector<hsa_executable_t> HSAExecutables;
494
495 std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
496 std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
497
498 hsa_amd_memory_pool_t KernArgPool;
499
500 // fine grained memory pool for host allocations
501 hsa_amd_memory_pool_t HostFineGrainedMemoryPool;
502
503 // fine and coarse-grained memory pools per offloading device
504 std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools;
505 std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools;
506
507 struct ImplFreePtrDeletor {
508 void operator()(void *P) {
509 core::Runtime::Memfree(P); // ignore failure to free
510 }
511 };
512
513 // device_State shared across loaded binaries, error if inconsistent size
514 std::vector<std::pair<std::unique_ptr<void, ImplFreePtrDeletor>, uint64_t>>
515 DeviceStateStore;
516
517 static const unsigned HardTeamLimit =
518 (1 << 16) - 1; // 64K needed to fit in uint16
519 static const int DefaultNumTeams = 128;
520
521 // These need to be per-device since different devices can have different
522 // wave sizes, but are currently the same number for each so that refactor
523 // can be postponed.
524 static_assert(getGridValue<32>().GV_Max_Teams ==
525 getGridValue<64>().GV_Max_Teams,
526 "");
527 static const int MaxTeams = getGridValue<64>().GV_Max_Teams;
528
529 static_assert(getGridValue<32>().GV_Max_WG_Size ==
530 getGridValue<64>().GV_Max_WG_Size,
531 "");
532 static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size;
533
534 static_assert(getGridValue<32>().GV_Default_WG_Size ==
535 getGridValue<64>().GV_Default_WG_Size,
536 "");
537 static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size;
538
539 using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size,
540 hsa_agent_t, hsa_amd_memory_pool_t);
541 hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size,
542 MemcpyFunc Func, int32_t DeviceId) {
543 hsa_agent_t Agent = HSAAgents[DeviceId];
544 hsa_signal_t S = FreeSignalPool.pop();
545 if (S.handle == 0) {
546 return HSA_STATUS_ERROR;
547 }
548 hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool);
549 FreeSignalPool.push(S);
550 return R;
551 }
552
553 hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size,
554 int32_t DeviceId) {
555 return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId);
556 }
557
558 hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size,
559 int32_t DeviceId) {
560 return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId);
561 }
562
563 static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) {
564 char TmpChar[1000];
565 uint16_t Major, Minor;
566 uint32_t TmpUInt;
567 uint32_t TmpUInt2;
568 uint32_t CacheSize[4];
569 bool TmpBool;
570 uint16_t WorkgroupMaxDim[3];
571 hsa_dim3_t GridMaxDim;
572
573 // Getting basic information about HSA and Device
574 core::checkResult(
575 hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major),
576 "Error from hsa_system_get_info when obtaining "
577 "HSA_SYSTEM_INFO_VERSION_MAJOR\n");
578 core::checkResult(
579 hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor),
580 "Error from hsa_system_get_info when obtaining "
581 "HSA_SYSTEM_INFO_VERSION_MINOR\n");
582 printf(" HSA Runtime Version: \t\t%u.%u \n", Major, Minor);
583 printf(" HSA OpenMP Device Number: \t\t%d \n", DeviceId);
584 core::checkResult(
585 hsa_agent_get_info(
586 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar),
587 "Error returned from hsa_agent_get_info when obtaining "
588 "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n");
589 printf(" Product Name: \t\t\t%s \n", TmpChar);
590 core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar),
591 "Error returned from hsa_agent_get_info when obtaining "
592 "HSA_AGENT_INFO_NAME\n");
593 printf(" Device Name: \t\t\t%s \n", TmpChar);
594 core::checkResult(
595 hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar),
596 "Error returned from hsa_agent_get_info when obtaining "
597 "HSA_AGENT_INFO_NAME\n");
598 printf(" Vendor Name: \t\t\t%s \n", TmpChar);
599 hsa_device_type_t DevType;
600 core::checkResult(
601 hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType),
602 "Error returned from hsa_agent_get_info when obtaining "
603 "HSA_AGENT_INFO_DEVICE\n");
604 printf(" Device Type: \t\t\t%s \n",
605 DevType == HSA_DEVICE_TYPE_CPU
606 ? "CPU"
607 : (DevType == HSA_DEVICE_TYPE_GPU
608 ? "GPU"
609 : (DevType == HSA_DEVICE_TYPE_DSP ? "DSP" : "UNKNOWN")));
610 core::checkResult(
611 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt),
612 "Error returned from hsa_agent_get_info when obtaining "
613 "HSA_AGENT_INFO_QUEUES_MAX\n");
614 printf(" Max Queues: \t\t\t%u \n", TmpUInt);
615 core::checkResult(
616 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt),
617 "Error returned from hsa_agent_get_info when obtaining "
618 "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n");
619 printf(" Queue Min Size: \t\t\t%u \n", TmpUInt);
620 core::checkResult(
621 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt),
622 "Error returned from hsa_agent_get_info when obtaining "
623 "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n");
624 printf(" Queue Max Size: \t\t\t%u \n", TmpUInt);
625
626 // Getting cache information
627 printf(" Cache:\n");
628
629 // FIXME: This is deprecated according to HSA documentation. But using
630 // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
631 // runtime.
632 core::checkResult(
633 hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize),
634 "Error returned from hsa_agent_get_info when obtaining "
635 "HSA_AGENT_INFO_CACHE_SIZE\n");
636
637 for (int I = 0; I < 4; I++) {
638 if (CacheSize[I]) {
639 printf(" L%u: \t\t\t\t%u bytes\n", I, CacheSize[I]);
640 }
641 }
642
643 core::checkResult(
644 hsa_agent_get_info(Agent,
645 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
646 &TmpUInt),
647 "Error returned from hsa_agent_get_info when obtaining "
648 "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n");
649 printf(" Cacheline Size: \t\t\t%u \n", TmpUInt);
650 core::checkResult(
651 hsa_agent_get_info(
652 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY,
653 &TmpUInt),
654 "Error returned from hsa_agent_get_info when obtaining "
655 "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n");
656 printf(" Max Clock Freq(MHz): \t\t%u \n", TmpUInt);
657 core::checkResult(
658 hsa_agent_get_info(
659 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
660 &TmpUInt),
661 "Error returned from hsa_agent_get_info when obtaining "
662 "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n");
663 printf(" Compute Units: \t\t\t%u \n", TmpUInt);
664 core::checkResult(hsa_agent_get_info(
665 Agent,
666 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU,
667 &TmpUInt),
668 "Error returned from hsa_agent_get_info when obtaining "
669 "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n");
670 printf(" SIMD per CU: \t\t\t%u \n", TmpUInt);
671 core::checkResult(
672 hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool),
673 "Error returned from hsa_agent_get_info when obtaining "
674 "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n");
675 printf(" Fast F16 Operation: \t\t%s \n", (TmpBool ? "TRUE" : "FALSE"));
676 core::checkResult(
677 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2),
678 "Error returned from hsa_agent_get_info when obtaining "
679 "HSA_AGENT_INFO_WAVEFRONT_SIZE\n");
680 printf(" Wavefront Size: \t\t\t%u \n", TmpUInt2);
681 core::checkResult(
682 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt),
683 "Error returned from hsa_agent_get_info when obtaining "
684 "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n");
685 printf(" Workgroup Max Size: \t\t%u \n", TmpUInt);
686 core::checkResult(hsa_agent_get_info(Agent,
687 HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
688 WorkgroupMaxDim),
689 "Error returned from hsa_agent_get_info when obtaining "
690 "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n");
691 printf(" Workgroup Max Size per Dimension:\n");
692 printf(" x: \t\t\t\t%u\n", WorkgroupMaxDim[0]);
693 printf(" y: \t\t\t\t%u\n", WorkgroupMaxDim[1]);
694 printf(" z: \t\t\t\t%u\n", WorkgroupMaxDim[2]);
695 core::checkResult(hsa_agent_get_info(
696 Agent,
697 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU,
698 &TmpUInt),
699 "Error returned from hsa_agent_get_info when obtaining "
700 "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n");
701 printf(" Max Waves Per CU: \t\t\t%u \n", TmpUInt);
702 printf(" Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2);
703 core::checkResult(
704 hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt),
705 "Error returned from hsa_agent_get_info when obtaining "
706 "HSA_AGENT_INFO_GRID_MAX_SIZE\n");
707 printf(" Grid Max Size: \t\t\t%u \n", TmpUInt);
708 core::checkResult(
709 hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim),
710 "Error returned from hsa_agent_get_info when obtaining "
711 "HSA_AGENT_INFO_GRID_MAX_DIM\n");
712 printf(" Grid Max Size per Dimension: \t\t\n");
713 printf(" x: \t\t\t\t%u\n", GridMaxDim.x);
714 printf(" y: \t\t\t\t%u\n", GridMaxDim.y);
715 printf(" z: \t\t\t\t%u\n", GridMaxDim.z);
716 core::checkResult(
717 hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt),
718 "Error returned from hsa_agent_get_info when obtaining "
719 "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n");
720 printf(" Max fbarriers/Workgrp: \t\t%u\n", TmpUInt);
721
722 printf(" Memory Pools:\n");
723 auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t {
724 std::string TmpStr;
725 size_t Size;
726 bool Alloc, Access;
727 hsa_amd_segment_t Segment;
728 hsa_amd_memory_pool_global_flag_t GlobalFlags;
729 core::checkResult(
730 hsa_amd_memory_pool_get_info(
731 Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags),
732 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
733 "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n");
734 core::checkResult(hsa_amd_memory_pool_get_info(
735 Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment),
736 "Error returned from hsa_amd_memory_pool_get_info when "
737 "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n");
738
739 switch (Segment) {
740 case HSA_AMD_SEGMENT_GLOBAL:
741 TmpStr = "GLOBAL; FLAGS: ";
742 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags)
743 TmpStr += "KERNARG, ";
744 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags)
745 TmpStr += "FINE GRAINED, ";
746 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags)
747 TmpStr += "COARSE GRAINED, ";
748 break;
749 case HSA_AMD_SEGMENT_READONLY:
750 TmpStr = "READONLY";
751 break;
752 case HSA_AMD_SEGMENT_PRIVATE:
753 TmpStr = "PRIVATE";
754 break;
755 case HSA_AMD_SEGMENT_GROUP:
756 TmpStr = "GROUP";
757 break;
758 }
759 printf(" Pool %s: \n", TmpStr.c_str());
760
761 core::checkResult(hsa_amd_memory_pool_get_info(
762 Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size),
763 "Error returned from hsa_amd_memory_pool_get_info when "
764 "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n");
765 printf(" Size: \t\t\t\t %zu bytes\n", Size);
766 core::checkResult(
767 hsa_amd_memory_pool_get_info(
768 Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc),
769 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
770 "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n");
771 printf(" Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE"));
772 core::checkResult(
773 hsa_amd_memory_pool_get_info(
774 Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size),
775 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
776 "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n");
777 printf(" Runtime Alloc Granule: \t\t %zu bytes\n", Size);
778 core::checkResult(
779 hsa_amd_memory_pool_get_info(
780 Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size),
781 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
782 "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n");
783 printf(" Runtime Alloc alignment: \t %zu bytes\n", Size);
784 core::checkResult(
785 hsa_amd_memory_pool_get_info(
786 Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access),
787 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
788 "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n");
789 printf(" Accessable by all: \t\t %s\n",
790 (Access ? "TRUE" : "FALSE"));
791
792 return HSA_STATUS_SUCCESS;
793 };
794 // Iterate over all the memory regions for this agent. Get the memory region
795 // type and size
796 hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr);
797
798 printf(" ISAs:\n");
799 auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t {
800 char TmpChar[1000];
801 core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar),
802 "Error returned from hsa_isa_get_info_alt when "
803 "obtaining HSA_ISA_INFO_NAME\n");
804 printf(" Name: \t\t\t\t %s\n", TmpChar);
805
806 return HSA_STATUS_SUCCESS;
807 };
808 // Iterate over all the memory regions for this agent. Get the memory region
809 // type and size
810 hsa_agent_iterate_isas(Agent, CBIsas, nullptr);
811 }
812
813 // Record entry point associated with device
814 void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) {
815 assert(DeviceId < (int32_t)FuncGblEntries.size() &&
816 "Unexpected device id!");
817 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
818
819 E.Entries.push_back(Entry);
820 }
821
822 // Return true if the entry is associated with device
823 bool findOffloadEntry(int32_t DeviceId, void *Addr) {
824 assert(DeviceId < (int32_t)FuncGblEntries.size() &&
825 "Unexpected device id!");
826 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
827
828 for (auto &It : E.Entries) {
829 if (It.addr == Addr)
830 return true;
831 }
832
833 return false;
834 }
835
836 // Return the pointer to the target entries table
837 __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) {
838 assert(DeviceId < (int32_t)FuncGblEntries.size() &&
839 "Unexpected device id!");
840 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
841
842 int32_t Size = E.Entries.size();
843
844 // Table is empty
845 if (!Size)
846 return 0;
847
848 __tgt_offload_entry *Begin = &E.Entries[0];
849 __tgt_offload_entry *End = &E.Entries[Size - 1];
850
851 // Update table info according to the entries and return the pointer
852 E.Table.EntriesBegin = Begin;
853 E.Table.EntriesEnd = ++End;
854
855 return &E.Table;
856 }
857
858 // Clear entries table for a device
859 void clearOffloadEntriesTable(int DeviceId) {
860 assert(DeviceId < (int32_t)FuncGblEntries.size() &&
861 "Unexpected device id!");
862 FuncGblEntries[DeviceId].emplace_back();
863 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
864 // KernelArgPoolMap.clear();
865 E.Entries.clear();
866 E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
867 }
868
869 hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
870 unsigned int DeviceId) {
871 assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here.");
872 uint32_t GlobalFlags = 0;
873 hsa_status_t Err = hsa_amd_memory_pool_get_info(
874 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
875
876 if (Err != HSA_STATUS_SUCCESS) {
877 return Err;
878 }
879
880 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
881 DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool;
882 } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
883 DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
884 }
885
886 return HSA_STATUS_SUCCESS;
887 }
888
889 hsa_status_t setupDevicePools(const std::vector<hsa_agent_t> &Agents) {
890 for (unsigned int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
891 hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
892 Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
893 hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool);
894 if (ValidStatus != HSA_STATUS_SUCCESS) {
895 DP("Alloc allowed in memory pool check failed: %s\n",{}
896 get_error_string(ValidStatus)){};
897 return HSA_STATUS_SUCCESS;
898 }
899 return addDeviceMemoryPool(MemoryPool, DeviceId);
900 });
901
902 if (Err != HSA_STATUS_SUCCESS) {
903 DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,{}
904 "Iterate all memory pools", get_error_string(Err)){};
905 return Err;
906 }
907 }
908 return HSA_STATUS_SUCCESS;
909 }
910
911 hsa_status_t setupHostMemoryPools(std::vector<hsa_agent_t> &Agents) {
912 std::vector<hsa_amd_memory_pool_t> HostPools;
913
914 // collect all the "valid" pools for all the given agents.
915 for (const auto &Agent : Agents) {
916 hsa_status_t Err = hsa_amd_agent_iterate_memory_pools(
917 Agent, core::addMemoryPool, static_cast<void *>(&HostPools));
918 if (Err != HSA_STATUS_SUCCESS) {
919 DP("addMemoryPool returned %s, continuing\n", get_error_string(Err)){};
920 }
921 }
922
923 // We need two fine-grained pools.
924 // 1. One with kernarg flag set for storing kernel arguments
925 // 2. Second for host allocations
926 bool FineGrainedMemoryPoolSet = false;
927 bool KernArgPoolSet = false;
928 for (const auto &MemoryPool : HostPools) {
929 hsa_status_t Err = HSA_STATUS_SUCCESS;
930 uint32_t GlobalFlags = 0;
931 Err = hsa_amd_memory_pool_get_info(
932 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
933 if (Err != HSA_STATUS_SUCCESS) {
934 DP("Get memory pool info failed: %s\n", get_error_string(Err)){};
935 return Err;
936 }
937
938 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
939 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
940 KernArgPool = MemoryPool;
941 KernArgPoolSet = true;
942 }
943 HostFineGrainedMemoryPool = MemoryPool;
944 FineGrainedMemoryPoolSet = true;
945 }
946 }
947
948 if (FineGrainedMemoryPoolSet && KernArgPoolSet)
949 return HSA_STATUS_SUCCESS;
950
951 return HSA_STATUS_ERROR;
952 }
953
954 hsa_amd_memory_pool_t getDeviceMemoryPool(unsigned int DeviceId) {
955 assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() &&
956 "Invalid device Id");
957 return DeviceCoarseGrainedMemoryPools[DeviceId];
958 }
959
960 hsa_amd_memory_pool_t getHostMemoryPool() {
961 return HostFineGrainedMemoryPool;
962 }
963
964 static int readEnv(const char *Env, int Default = -1) {
965 const char *EnvStr = getenv(Env);
966 int Res = Default;
967 if (EnvStr) {
968 Res = std::stoi(EnvStr);
969 DP("Parsed %s=%d\n", Env, Res){};
970 }
971 return Res;
972 }
973
974 RTLDeviceInfoTy() {
975 DP("Start initializing " GETNAME(TARGET_NAME) "\n"){};
976
977 // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr
978 // anytime. You do not need a debug library build.
979 // 0 => no tracing
980 // 1 => tracing dispatch only
981 // >1 => verbosity increase
982
983 if (!HSAInitSuccess()) {
984 DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n"){};
985 return;
986 }
987
988 if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE"))
989 print_kernel_trace = atoi(EnvStr);
990 else
991 print_kernel_trace = 0;
992
993 hsa_status_t Err = core::atl_init_gpu_context();
994 if (Err != HSA_STATUS_SUCCESS) {
995 DP("Error when initializing " GETNAME(TARGET_NAME) "\n"){};
996 return;
997 }
998
999 // Init hostcall soon after initializing hsa
1000 hostrpc_init();
1001
1002 Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) {
1003 if (DeviceType == HSA_DEVICE_TYPE_CPU) {
1004 CPUAgents.push_back(Agent);
1005 } else {
1006 HSAAgents.push_back(Agent);
1007 }
1008 });
1009 if (Err != HSA_STATUS_SUCCESS)
1010 return;
1011
1012 NumberOfDevices = (int)HSAAgents.size();
1013
1014 if (NumberOfDevices == 0) {
1015 DP("There are no devices supporting HSA.\n"){};
1016 return;
1017 }
1018 DP("There are %d devices supporting HSA.\n", NumberOfDevices){};
1019
1020 // Init the device info
1021 HSAQueueSchedulers.reserve(NumberOfDevices);
1022 FuncGblEntries.resize(NumberOfDevices);
1023 ThreadsPerGroup.resize(NumberOfDevices);
1024 ComputeUnits.resize(NumberOfDevices);
1025 GPUName.resize(NumberOfDevices);
1026 GroupsPerDevice.resize(NumberOfDevices);
1027 WarpSize.resize(NumberOfDevices);
1028 NumTeams.resize(NumberOfDevices);
1029 NumThreads.resize(NumberOfDevices);
1030 DeviceStateStore.resize(NumberOfDevices);
1031 KernelInfoTable.resize(NumberOfDevices);
1032 SymbolInfoTable.resize(NumberOfDevices);
1033 DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
1034 DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
1035
1036 Err = setupDevicePools(HSAAgents);
1037 if (Err != HSA_STATUS_SUCCESS) {
1038 DP("Setup for Device Memory Pools failed\n"){};
1039 return;
1040 }
1041
1042 Err = setupHostMemoryPools(CPUAgents);
1043 if (Err != HSA_STATUS_SUCCESS) {
1044 DP("Setup for Host Memory Pools failed\n"){};
1045 return;
1046 }
1047
1048 for (int I = 0; I < NumberOfDevices; I++) {
1049 uint32_t QueueSize = 0;
1050 {
1051 hsa_status_t Err = hsa_agent_get_info(
1052 HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize);
1053 if (Err != HSA_STATUS_SUCCESS) {
1054 DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I){};
1055 return;
1056 }
1057 enum { MaxQueueSize = 4096 };
1058 if (QueueSize > MaxQueueSize) {
1059 QueueSize = MaxQueueSize;
1060 }
1061 }
1062
1063 {
1064 HSAQueueScheduler QSched;
1065 if (!QSched.createQueues(HSAAgents[I], QueueSize))
1066 return;
1067 HSAQueueSchedulers.emplace_back(std::move(QSched));
1068 }
1069
1070 DeviceStateStore[I] = {nullptr, 0};
1071 }
1072
1073 for (int I = 0; I < NumberOfDevices; I++) {
1074 ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize;
1075 GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams;
1076 ComputeUnits[I] = 1;
1077 DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I,{}
1078 GroupsPerDevice[I], ThreadsPerGroup[I]){};
1079 }
1080
1081 // Get environment variables regarding teams
1082 Env.TeamLimit = readEnv("OMP_TEAM_LIMIT");
1083 Env.NumTeams = readEnv("OMP_NUM_TEAMS");
1084 Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT");
1085 Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT");
1086 Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0);
1087
1088 // Default state.
1089 RequiresFlags = OMP_REQ_UNDEFINED;
1090
1091 ConstructionSucceeded = true;
1092 }
1093
1094 ~RTLDeviceInfoTy() {
1095 DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n"){};
1096 if (!HSAInitSuccess()) {
1097 // Then none of these can have been set up and they can't be torn down
1098 return;
1099 }
1100 // Run destructors on types that use HSA before
1101 // impl_finalize removes access to it
1102 DeviceStateStore.clear();
1103 KernelArgPoolMap.clear();
1104 // Terminate hostrpc before finalizing hsa
1105 hostrpc_terminate();
1106
1107 hsa_status_t Err;
1108 for (uint32_t I = 0; I < HSAExecutables.size(); I++) {
1109 Err = hsa_executable_destroy(HSAExecutables[I]);
1110 if (Err != HSA_STATUS_SUCCESS) {
1111 DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,{}
1112 "Destroying executable", get_error_string(Err)){};
1113 }
1114 }
1115 }
1116};
1117
1118pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
1119
1120// Putting accesses to DeviceInfo global behind a function call prior
1121// to changing to use init_plugin/deinit_plugin calls
1122static RTLDeviceInfoTy DeviceInfoState;
1123static RTLDeviceInfoTy &DeviceInfo() { return DeviceInfoState; }
1124
1125namespace {
1126
1127int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
1128 __tgt_async_info *AsyncInfo) {
1129 assert(AsyncInfo && "AsyncInfo is nullptr");
1130 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
1131 // Return success if we are not copying back to host from target.
1132 if (!HstPtr)
1133 return OFFLOAD_SUCCESS(0);
1134 hsa_status_t Err;
1135 DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,{}
1136 (long long unsigned)(Elf64_Addr)TgtPtr,{}
1137 (long long unsigned)(Elf64_Addr)HstPtr){};
1138
1139 Err = DeviceInfo().freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size,
1140 DeviceId);
1141
1142 if (Err != HSA_STATUS_SUCCESS) {
1143 DP("Error when copying data from device to host. Pointers: "{}
1144 "host = 0x%016lx, device = 0x%016lx, size = %lld\n",{}
1145 (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size){};
1146 return OFFLOAD_FAIL(~0);
1147 }
1148 DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,{}
1149 (long long unsigned)(Elf64_Addr)TgtPtr,{}
1150 (long long unsigned)(Elf64_Addr)HstPtr){};
1151 return OFFLOAD_SUCCESS(0);
1152}
1153
1154int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
1155 __tgt_async_info *AsyncInfo) {
1156 assert(AsyncInfo && "AsyncInfo is nullptr");
1157 hsa_status_t Err;
1158 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
1159 // Return success if we are not doing host to target.
1160 if (!HstPtr)
1161 return OFFLOAD_SUCCESS(0);
1162
1163 DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,{}
1164 (long long unsigned)(Elf64_Addr)HstPtr,{}
1165 (long long unsigned)(Elf64_Addr)TgtPtr){};
1166 Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size,
1167 DeviceId);
1168 if (Err != HSA_STATUS_SUCCESS) {
1169 DP("Error when copying data from host to device. Pointers: "{}
1170 "host = 0x%016lx, device = 0x%016lx, size = %lld\n",{}
1171 (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size){};
1172 return OFFLOAD_FAIL(~0);
1173 }
1174 return OFFLOAD_SUCCESS(0);
1175}
1176
1177// Async.
1178// The implementation was written with cuda streams in mind. The semantics of
1179// that are to execute kernels on a queue in order of insertion. A synchronise
1180// call then makes writes visible between host and device. This means a series
1181// of N data_submit_async calls are expected to execute serially. HSA offers
1182// various options to run the data copies concurrently. This may require changes
1183// to libomptarget.
1184
1185// __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that
1186// there are no outstanding kernels that need to be synchronized. Any async call
1187// may be passed a Queue==0, at which point the cuda implementation will set it
1188// to non-null (see getStream). The cuda streams are per-device. Upstream may
1189// change this interface to explicitly initialize the AsyncInfo_pointer, but
1190// until then hsa lazily initializes it as well.
1191
1192void initAsyncInfo(__tgt_async_info *AsyncInfo) {
1193 // set non-null while using async calls, return to null to indicate completion
1194 assert(AsyncInfo);
1195 if (!AsyncInfo->Queue) {
1196 AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX(18446744073709551615UL));
1197 }
1198}
1199void finiAsyncInfo(__tgt_async_info *AsyncInfo) {
1200 assert(AsyncInfo);
1201 assert(AsyncInfo->Queue);
1202 AsyncInfo->Queue = 0;
1203}
1204
1205// Determine launch values for kernel.
1206struct LaunchVals {
1207 int WorkgroupSize;
1208 int GridSize;
1209};
1210LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env,
1211 int ConstWGSize,
1212 llvm::omp::OMPTgtExecModeFlags ExecutionMode,
1213 int NumTeams, int ThreadLimit, uint64_t LoopTripcount,
1214 int DeviceNumTeams) {
1215
1216 int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize;
1217 int NumGroups = 0;
1218
1219 int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams;
1220 if (MaxTeams > static_cast<int>(RTLDeviceInfoTy::HardTeamLimit))
1221 MaxTeams = RTLDeviceInfoTy::HardTeamLimit;
1222
1223 if (print_kernel_trace & STARTUP_DETAILS) {
1224 DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams){};
1225 DP("Max_Teams: %d\n", MaxTeams){};
1226 DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize){};
1227 DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize){};
1228 DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",{}
1229 RTLDeviceInfoTy::DefaultWgSize){};
1230 DP("thread_limit: %d\n", ThreadLimit){};
1231 DP("threadsPerGroup: %d\n", ThreadsPerGroup){};
1232 DP("ConstWGSize: %d\n", ConstWGSize){};
1233 }
1234 // check for thread_limit() clause
1235 if (ThreadLimit > 0) {
1236 ThreadsPerGroup = ThreadLimit;
1237 DP("Setting threads per block to requested %d\n", ThreadLimit){};
1238 // Add master warp for GENERIC
1239 if (ExecutionMode ==
1240 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
1241 ThreadsPerGroup += WarpSize;
1242 DP("Adding master wavefront: +%d threads\n", WarpSize){};
1243 }
1244 if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max
1245 ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize;
1246 DP("Setting threads per block to maximum %d\n", ThreadsPerGroup){};
1247 }
1248 }
1249 // check flat_max_work_group_size attr here
1250 if (ThreadsPerGroup > ConstWGSize) {
1251 ThreadsPerGroup = ConstWGSize;
1252 DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",{}
1253 ThreadsPerGroup){};
1254 }
1255 if (print_kernel_trace & STARTUP_DETAILS)
1256 DP("threadsPerGroup: %d\n", ThreadsPerGroup){};
1257 DP("Preparing %d threads\n", ThreadsPerGroup){};
1258
1259 // Set default num_groups (teams)
1260 if (Env.TeamLimit > 0)
1261 NumGroups = (MaxTeams < Env.TeamLimit) ? MaxTeams : Env.TeamLimit;
1262 else
1263 NumGroups = MaxTeams;
1264 DP("Set default num of groups %d\n", NumGroups){};
1265
1266 if (print_kernel_trace & STARTUP_DETAILS) {
1267 DP("num_groups: %d\n", NumGroups){};
1268 DP("num_teams: %d\n", NumTeams){};
1269 }
1270
1271 // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size
1272 // This reduction is typical for default case (no thread_limit clause).
1273 // or when user goes crazy with num_teams clause.
1274 // FIXME: We cant distinguish between a constant or variable thread limit.
1275 // So we only handle constant thread_limits.
1276 if (ThreadsPerGroup >
1277 RTLDeviceInfoTy::DefaultWgSize) // 256 < threadsPerGroup <= 1024
1278 // Should we round threadsPerGroup up to nearest WarpSize
1279 // here?
1280 NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup;
1281
1282 // check for num_teams() clause
1283 if (NumTeams > 0) {
1284 NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups;
1285 }
1286 if (print_kernel_trace & STARTUP_DETAILS) {
1287 DP("num_groups: %d\n", NumGroups){};
1288 DP("Env.NumTeams %d\n", Env.NumTeams){};
1289 DP("Env.TeamLimit %d\n", Env.TeamLimit){};
1290 }
1291
1292 if (Env.NumTeams > 0) {
1293 NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups;
1294 DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams){};
1295 } else if (Env.TeamLimit > 0) {
1296 NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups;
1297 DP("Modifying teams based on Env.TeamLimit%d\n", Env.TeamLimit){};
1298 } else {
1299 if (NumTeams <= 0) {
1300 if (LoopTripcount > 0) {
1301 if (ExecutionMode ==
1302 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) {
1303 // round up to the nearest integer
1304 NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1;
1305 } else if (ExecutionMode ==
1306 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
1307 NumGroups = LoopTripcount;
1308 } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ {
1309 // This is a generic kernel that was transformed to use SPMD-mode
1310 // execution but uses Generic-mode semantics for scheduling.
1311 NumGroups = LoopTripcount;
1312 }
1313 DP("Using %d teams due to loop trip count %" PRIu64 " and number of "{}
1314 "threads per block %d\n",{}
1315 NumGroups, LoopTripcount, ThreadsPerGroup){};
1316 }
1317 } else {
1318 NumGroups = NumTeams;
1319 }
1320 if (NumGroups > MaxTeams) {
1321 NumGroups = MaxTeams;
1322 if (print_kernel_trace & STARTUP_DETAILS)
1323 DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams){};
1324 }
1325 if (NumGroups > NumTeams && NumTeams > 0) {
1326 NumGroups = NumTeams;
1327 if (print_kernel_trace & STARTUP_DETAILS)
1328 DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups,{}
1329 NumTeams){};
1330 }
1331 }
1332
1333 // num_teams clause always honored, no matter what, unless DEFAULT is active.
1334 if (NumTeams > 0) {
1335 NumGroups = NumTeams;
1336 // Cap num_groups to EnvMaxTeamsDefault if set.
1337 if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault)
1338 NumGroups = Env.MaxTeamsDefault;
1339 }
1340 if (print_kernel_trace & STARTUP_DETAILS) {
1341 DP("threadsPerGroup: %d\n", ThreadsPerGroup){};
1342 DP("num_groups: %d\n", NumGroups){};
1343 DP("loop_tripcount: %ld\n", LoopTripcount){};
1344 }
1345 DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups,{}
1346 ThreadsPerGroup){};
1347
1348 LaunchVals Res;
1349 Res.WorkgroupSize = ThreadsPerGroup;
1350 Res.GridSize = ThreadsPerGroup * NumGroups;
1351 return Res;
1352}
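A standalone sketch, not part of rtl.cpp, of the SPMD trip-count rounding used in getLaunchVals above; the trip count and workgroup size are assumed values for the example.

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t LoopTripcount = 1000; // assumed trip count
  int ThreadsPerGroup = 256;     // assumed workgroup size
  // Same rounding as the SPMD branch above: one work-item per iteration,
  // rounded up to whole workgroups.
  int NumGroups = static_cast<int>(((LoopTripcount - 1) / ThreadsPerGroup) + 1);
  // As in LaunchVals, the grid size is expressed in work-items, not groups.
  int GridSize = ThreadsPerGroup * NumGroups;
  printf("num_groups=%d grid_size=%d\n", NumGroups, GridSize); // 4 and 1024
  return 0;
}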
1353
1354static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) {
1355 uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);
1356 bool Full = true;
1357 while (Full) {
1358 Full =
1359 PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue));
1360 }
1361 return PacketId;
1362}
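The busy-wait above enforces the HSA user-mode queue invariant that at most Queue->size packets may be outstanding. A minimal host-side sketch of that bound, with all names and values assumed for illustration:

#include <cstdint>
#include <cstdio>

// A write index (PacketId) is usable once fewer than QueueSize packets
// separate it from the read index; this mirrors the loop condition above.
static bool packetSlotAvailable(uint64_t PacketId, uint64_t ReadIndex,
                                uint64_t QueueSize) {
  return PacketId < ReadIndex + QueueSize;
}

int main() {
  printf("%d\n", packetSlotAvailable(1023, 0, 1024)); // 1: slot free
  printf("%d\n", packetSlotAvailable(1024, 0, 1024)); // 0: queue still full
  return 0;
}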
1363
1364int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
1365 ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
1366 int32_t ThreadLimit, uint64_t LoopTripcount) {
1367 // Set the context we are using.
1368 // Update the thread limit content in gpu memory if it is uninitialized or
1369 // was specified from the host.
1370
1371 DP("Run target team region thread_limit %d\n", ThreadLimit){};
1372
1373 // All args are references.
1374 std::vector<void *> Args(ArgNum);
1375 std::vector<void *> Ptrs(ArgNum);
1376
1377 DP("Arg_num: %d\n", ArgNum){};
1378 for (int32_t I = 0; I < ArgNum; ++I) {
1379 Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
1380 Args[I] = &Ptrs[I];
1381 DP("Offseted base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I])){};
1382 }
1383
1384 KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr;
1385
1386 std::string KernelName = std::string(KernelInfo->Name);
1387 auto &KernelInfoTable = DeviceInfo().KernelInfoTable;
1388 if (KernelInfoTable[DeviceId].find(KernelName) ==
1389 KernelInfoTable[DeviceId].end()) {
1390 DP("Kernel %s not found\n", KernelName.c_str()){};
1391 return OFFLOAD_FAIL(~0);
1392 }
1393
1394 const atl_kernel_info_t KernelInfoEntry =
1395 KernelInfoTable[DeviceId][KernelName];
1396 const uint32_t GroupSegmentSize =
1397 KernelInfoEntry.group_segment_size + DeviceInfo().Env.DynamicMemSize;
1398 const uint32_t SgprCount = KernelInfoEntry.sgpr_count;
1399 const uint32_t VgprCount = KernelInfoEntry.vgpr_count;
1400 const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count;
1401 const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count;
1402
1403 assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count);
1404
1405 /*
1406 * Set limit based on ThreadsPerGroup and GroupsPerDevice
1407 */
1408 LaunchVals LV =
1409 getLaunchVals(DeviceInfo().WarpSize[DeviceId], DeviceInfo().Env,
1410 KernelInfo->ConstWGSize, KernelInfo->ExecutionMode,
1411 NumTeams, // From run_region arg
1412 ThreadLimit, // From run_region arg
1413 LoopTripcount, // From run_region arg
1414 DeviceInfo().NumTeams[KernelInfo->DeviceId]);
1415 const int GridSize = LV.GridSize;
1416 const int WorkgroupSize = LV.WorkgroupSize;
1417
1418 if (print_kernel_trace >= LAUNCH) {
1419 int NumGroups = GridSize / WorkgroupSize;
1420 // enum modes are SPMD, GENERIC, NONE 0,1,2
1421 // if doing rtl timing, print to stderr, unless stdout requested.
1422 bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING);
1423 fprintf(TraceToStdout ? stdout : stderr,
1424 "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
1425 "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
1426 "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n",
1427 DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
1428 ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit,
1429 GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount,
1430 VgprSpillCount, LoopTripcount, KernelInfo->Name);
1431 }
1432
1433 // Run on the device.
1434 {
1435 hsa_queue_t *Queue = DeviceInfo().HSAQueueSchedulers[DeviceId].next();
1436 if (!Queue) {
1437 return OFFLOAD_FAIL(~0);
1438 }
1439 uint64_t PacketId = acquireAvailablePacketId(Queue);
1440
1441 const uint32_t Mask = Queue->size - 1; // size is a power of 2
1442 hsa_kernel_dispatch_packet_t *Packet =
1443 (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);
1444
1445 // packet->header is written last
1446 Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
1447 Packet->workgroup_size_x = WorkgroupSize;
1448 Packet->workgroup_size_y = 1;
1449 Packet->workgroup_size_z = 1;
1450 Packet->reserved0 = 0;
1451 Packet->grid_size_x = GridSize;
1452 Packet->grid_size_y = 1;
1453 Packet->grid_size_z = 1;
1454 Packet->private_segment_size = KernelInfoEntry.private_segment_size;
1455 Packet->group_segment_size = GroupSegmentSize;
1456 Packet->kernel_object = KernelInfoEntry.kernel_object;
1457 Packet->kernarg_address = 0; // use the block allocator
1458 Packet->reserved2 = 0; // impl writes id_ here
1459 Packet->completion_signal = {0}; // may want a pool of signals
1460
1461 KernelArgPool *ArgPool = nullptr;
1462 void *KernArg = nullptr;
1463 {
1464 auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name));
1465 if (It != KernelArgPoolMap.end()) {
1466 ArgPool = (It->second).get();
1467 }
1468 }
1469 if (!ArgPool) {
1470 DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name,{}
1471 DeviceId){};
1472 }
1473 {
1474 if (ArgPool) {
1475 assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *)));
1476 KernArg = ArgPool->allocate(ArgNum);
1477 }
1478 if (!KernArg) {
1479 DP("Allocate kernarg failed\n"){};
1480 return OFFLOAD_FAIL(~0);
1481 }
1482
1483 // Copy explicit arguments
1484 for (int I = 0; I < ArgNum; I++) {
1485 memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *));
1486 }
1487
1488 // Initialize implicit arguments. TODO: Which of these can be dropped
1489 impl_implicit_args_t *ImplArgs = reinterpret_cast<impl_implicit_args_t *>(
1490 static_cast<char *>(KernArg) + ArgPool->KernargSegmentSize);
1491 memset(ImplArgs, 0,
1492 sizeof(impl_implicit_args_t)); // may not be necessary
1493 ImplArgs->offset_x = 0;
1494 ImplArgs->offset_y = 0;
1495 ImplArgs->offset_z = 0;
1496
1497 // assign a hostcall buffer for the selected Q
1498 if (__atomic_load_n(&DeviceInfo().HostcallRequired, __ATOMIC_ACQUIRE)) {
1499 // hostrpc_assign_buffer is not thread safe, and this function is
1500 // under a multiple reader lock, not a writer lock.
1501 static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER;
1502 pthread_mutex_lock(&HostcallInitLock);
1503 uint64_t Buffer = hostrpc_assign_buffer(
1504 DeviceInfo().HSAAgents[DeviceId], Queue, DeviceId);
1505 pthread_mutex_unlock(&HostcallInitLock);
1506 if (!Buffer) {
1507 DP("hostrpc_assign_buffer failed, gpu would dereference null and "{}
1508 "error\n"){};
1509 return OFFLOAD_FAIL(~0);
1510 }
1511
1512 DP("Implicit argument count: %d\n",{}
1513 KernelInfoEntry.implicit_argument_count){};
1514 if (KernelInfoEntry.implicit_argument_count >= 4) {
1515 // Initialise the pointer for the implicit_argument_count != 0 ABI.
1516 // Guess that the right implicit argument is at offset 24 after
1517 // the explicit arguments. In the future, we should be able to read
1518 // the offset from msgpack; Clang is not annotating it at present.
1519 uint64_t Offset =
1520 sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3);
1521 if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) {
1522 DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit "{}
1523 "args: %d\n",{}
1524 Offset + 8, ArgPool->kernargSizeIncludingImplicit()){};
1525 } else {
1526 memcpy(static_cast<char *>(KernArg) + Offset, &Buffer, 8);
1527 }
1528 }
1529
1530 // initialise pointer for implicit_argument_count == 0 ABI
1531 ImplArgs->hostcall_ptr = Buffer;
1532 }
1533
1534 Packet->kernarg_address = KernArg;
1535 }
1536
1537 hsa_signal_t S = DeviceInfo().FreeSignalPool.pop();
1538 if (S.handle == 0) {
1539 DP("Failed to get signal instance\n"){};
1540 return OFFLOAD_FAIL(~0);
1541 }
1542 Packet->completion_signal = S;
1543 hsa_signal_store_relaxed(Packet->completion_signal, 1);
1544
1545 // Publish the packet indicating it is ready to be processed
1546 core::packetStoreRelease(reinterpret_cast<uint32_t *>(Packet),
1547 core::createHeader(), Packet->setup);
1548
1549 // Since the packet is already published, its contents must not be
1550 // accessed any more
1551 hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
1552
1553 while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
1554 HSA_WAIT_STATE_BLOCKED) != 0)
1555 ;
1556
1557 assert(ArgPool);
1558 ArgPool->deallocate(KernArg);
1559 DeviceInfo().FreeSignalPool.push(S);
1560 }
1561
1562 DP("Kernel completed\n"){};
1563 return OFFLOAD_SUCCESS(0);
1564}
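A worked example, with assumed values and not part of rtl.cpp, of the hostcall-pointer offset guessed in runRegionLocked above: since the argument copy loop stores each explicit argument in one pointer-sized slot, the guess lands 24 bytes past the explicit arguments on an LP64 host.

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t ExplicitArgumentCount = 5; // assumed kernel with 5 explicit args
  // Same formula as above: sizeof(void *) * (explicit_argument_count + 3).
  uint64_t Offset = sizeof(void *) * (ExplicitArgumentCount + 3);
  // 5 * 8 bytes of explicit args + 24 bytes = 64 on an LP64 host.
  printf("hostcall pointer offset = %llu\n", (unsigned long long)Offset);
  return 0;
}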
1565
1566bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) {
1567 const uint16_t AmdgcnMachineID = EM_AMDGPU;
1568 const int32_t R = elf_check_machine(Image, AmdgcnMachineID);
1569 if (!R) {
1570 DP("Supported machine ID not found\n"){};
1571 }
1572 return R;
1573}
1574
1575uint32_t elfEFlags(__tgt_device_image *Image) {
1576 const char *ImgBegin = (char *)Image->ImageStart;
1577 size_t ImgSize = (char *)Image->ImageEnd - ImgBegin;
1578
1579 StringRef Buffer = StringRef(ImgBegin, ImgSize);
1580 auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
1581 /*InitContent=*/false);
1582 if (!ElfOrErr) {
1583 consumeError(ElfOrErr.takeError());
1584 return 0;
1585 }
1586
1587 if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get()))
1588 return ELFObj->getPlatformFlags();
1589 return 0;
1590}
1591
1592template <typename T> bool enforceUpperBound(T *Value, T Upper) {
1593 bool Changed = *Value > Upper;
1594 if (Changed) {
1595 *Value = Upper;
1596 }
1597 return Changed;
1598}
1599
1600struct SymbolInfo {
1601 const void *Addr = nullptr;
1602 uint32_t Size = UINT32_MAX(4294967295U);
1603 uint32_t ShType = SHT_NULL;
1604};
1605
1606int getSymbolInfoWithoutLoading(const ELFObjectFile<ELF64LE> &ELFObj,
1607 StringRef SymName, SymbolInfo *Res) {
1608 auto SymOrErr = getELFSymbol(ELFObj, SymName);
1609 if (!SymOrErr) {
1610 std::string ErrorString = toString(SymOrErr.takeError());
1611 DP("Failed ELF lookup: %s\n", ErrorString.c_str()){};
1612 return 1;
1613 }
1614 if (!*SymOrErr)
1615 return 1;
1616
1617 auto SymSecOrErr = ELFObj.getELFFile().getSection((*SymOrErr)->st_shndx);
1618 if (!SymSecOrErr) {
1619 std::string ErrorString = toString(SymSecOrErr.takeError());
1620 DP("Failed ELF lookup: %s\n", ErrorString.c_str()){};
1621 return 1;
1622 }
1623
1624 Res->Addr = (*SymOrErr)->st_value + ELFObj.getELFFile().base();
1625 Res->Size = static_cast<uint32_t>((*SymOrErr)->st_size);
1626 Res->ShType = static_cast<uint32_t>((*SymSecOrErr)->sh_type);
1627 return 0;
1628}
1629
1630int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *SymName,
1631 SymbolInfo *Res) {
1632 StringRef Buffer = StringRef(Base, ImgSize);
1633 auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
1634 /*InitContent=*/false);
1635 if (!ElfOrErr) {
1636 REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str())do { fprintf(stderr, "AMDGPU" " error: "); fprintf(stderr, "Failed to load ELF: %s\n"
, toString(ElfOrErr.takeError()).c_str()); } while (0);
;
1637 return 1;
1638 }
1639
1640 if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get()))
1641 return getSymbolInfoWithoutLoading(*ELFObj, SymName, Res);
1642 return 1;
1643}
1644
1645hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize,
1646 const char *SymName, const void **VarAddr,
1647 uint32_t *VarSize) {
1648 SymbolInfo SI;
1649 int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI);
1650 if (Rc == 0) {
1651 *VarAddr = SI.Addr;
1652 *VarSize = SI.Size;
1653 return HSA_STATUS_SUCCESS;
1654 }
1655 return HSA_STATUS_ERROR;
1656}
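A hedged usage sketch of interopGetSymbolInfo; the helper below is hypothetical and not part of rtl.cpp, and assumes ImageStart/ImgSize describe an ELF image already resident in host memory.

// Returns true when the device image contains "omptarget_device_environment".
static bool imageHasDeviceEnvironment(char *ImageStart, size_t ImgSize) {
  const void *VarAddr = nullptr;
  uint32_t VarSize = 0;
  return interopGetSymbolInfo(ImageStart, ImgSize,
                              "omptarget_device_environment", &VarAddr,
                              &VarSize) == HSA_STATUS_SUCCESS;
}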
1657
1658template <typename C>
1659hsa_status_t moduleRegisterFromMemoryToPlace(
1660 std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
1661 std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
1662 void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb,
1663 std::vector<hsa_executable_t> &HSAExecutables) {
1664 auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t {
1665 C *Unwrapped = static_cast<C *>(CbState);
1666 return (*Unwrapped)(Data, Size);
1667 };
1668 return core::RegisterModuleFromMemory(
1669 KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize,
1670 DeviceInfo().HSAAgents[DeviceId], L, static_cast<void *>(&Cb),
1671 HSAExecutables);
1672}
1673
1674uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) {
1675 uint64_t DeviceStateBytes = 0;
1676 {
1677 // If this is the deviceRTL, get the state variable size
1678 SymbolInfo SizeSi;
1679 int Rc = getSymbolInfoWithoutLoading(
1680 ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi);
1681
1682 if (Rc == 0) {
1683 if (SizeSi.Size != sizeof(uint64_t)) {
1684 DP("Found device_State_size variable with wrong size\n"){};
1685 return 0;
1686 }
1687
1688 // Read number of bytes directly from the elf
1689 memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t));
1690 }
1691 }
1692 return DeviceStateBytes;
1693}
1694
1695struct DeviceEnvironment {
1696 // Initialises a DeviceEnvironmentTy in the deviceRTL. It patches around
1697 // differences in the deviceRTL between trunk, aomp and rocmcc. Over time
1698 // these differences will tend to zero and this class can be
1699 // simplified.
1700 // The symbol may be in .data or .bss, and may be missing fields. TODO:
1701 // review aomp/trunk/rocm and simplify the following.
1702
1703 // The symbol may also have been deadstripped because the device side
1704 // accessors were unused.
1705
1706 // If the symbol is in .data (aomp, rocm) it can be written directly.
1707 // If it is in .bss, we must wait for it to be allocated space on the
1708 // gpu (trunk) and initialize after loading.
1709 const char *sym() { return "omptarget_device_environment"; }
1710
1711 DeviceEnvironmentTy HostDeviceEnv;
1712 SymbolInfo SI;
1713 bool Valid = false;
1714
1715 __tgt_device_image *Image;
1716 const size_t ImgSize;
1717
1718 DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize,
1719 __tgt_device_image *Image, const size_t ImgSize)
1720 : Image(Image), ImgSize(ImgSize) {
1721
1722 HostDeviceEnv.NumDevices = NumberDevices;
1723 HostDeviceEnv.DeviceNum = DeviceId;
1724 HostDeviceEnv.DebugKind = 0;
1725 HostDeviceEnv.DynamicMemSize = DynamicMemSize;
1726 if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
1727 HostDeviceEnv.DebugKind = std::stoi(EnvStr);
1728
1729 int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize,
1730 sym(), &SI);
1731 if (Rc != 0) {
1732 DP("Finding global device environment '%s' - symbol missing.\n", sym()){};
1733 return;
1734 }
1735
1736 if (SI.Size > sizeof(HostDeviceEnv)) {
1737 DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size,{}
1738 sizeof(HostDeviceEnv)){};
1739 return;
1740 }
1741
1742 Valid = true;
1743 }
1744
1745 bool inImage() { return SI.ShType != SHT_NOBITS; }
1746
1747 hsa_status_t beforeLoading(void *Data, size_t Size) {
1748 if (Valid) {
1749 if (inImage()) {
1750 DP("Setting global device environment before load (%u bytes)\n",{}
1751 SI.Size){};
1752 uint64_t Offset = (char *)SI.Addr - (char *)Image->ImageStart;
1753 void *Pos = (char *)Data + Offset;
1754 memcpy(Pos, &HostDeviceEnv, SI.Size);
1755 }
1756 }
1757 return HSA_STATUS_SUCCESS;
1758 }
1759
1760 hsa_status_t afterLoading() {
1761 if (Valid) {
1762 if (!inImage()) {
1763 DP("Setting global device environment after load (%u bytes)\n",{}
1764 SI.Size){};
1765 int DeviceId = HostDeviceEnv.DeviceNum;
1766 auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
1767 void *StatePtr;
1768 uint32_t StatePtrSize;
1769 hsa_status_t Err = interop_hsa_get_symbol_info(
1770 SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize);
1771 if (Err != HSA_STATUS_SUCCESS) {
1772 DP("failed to find %s in loaded image\n", sym()){};
1773 return Err;
1774 }
1775
1776 if (StatePtrSize != SI.Size) {
1777 DP("Symbol had size %u before loading, %u after\n", StatePtrSize,{}
1778 SI.Size){};
1779 return HSA_STATUS_ERROR;
1780 }
1781
1782 return DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv,
1783 StatePtrSize, DeviceId);
1784 }
1785 }
1786 return HSA_STATUS_SUCCESS;
1787 }
1788};
1789
1790hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) {
1791 uint64_t Rounded = 4 * ((Size + 3) / 4);
1792 void *Ptr;
1793 hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId);
1794 hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr);
1795 if (Err != HSA_STATUS_SUCCESS) {
1796 return Err;
1797 }
1798
1799 hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4);
1800 if (Rc != HSA_STATUS_SUCCESS) {
1801 DP("zero fill device_state failed with %u\n", Rc){};
1802 core::Runtime::Memfree(Ptr);
1803 return HSA_STATUS_ERROR;
1804 }
1805
1806 *RetPtr = Ptr;
1807 return HSA_STATUS_SUCCESS;
1808}
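A standalone sketch, with an assumed request size and not part of rtl.cpp, of the arithmetic implCalloc uses: the allocation is rounded up to a multiple of four bytes because hsa_amd_memory_fill counts 32-bit words.

#include <cstdint>
#include <cstdio>

int main() {
  size_t Size = 10;                        // assumed requested size in bytes
  uint64_t Rounded = 4 * ((Size + 3) / 4); // 12: next multiple of 4
  uint64_t FillWords = Rounded / 4;        // 3: value passed as the fill count
  printf("rounded=%llu fill_words=%llu\n", (unsigned long long)Rounded,
         (unsigned long long)FillWords);
  return 0;
}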
1809
1810bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) {
1811 SymbolInfo SI;
1812 int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI);
1813 return (Rc == 0) && (SI.Addr != nullptr);
1814}
1815
1816} // namespace
1817
1818namespace core {
1819hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) {
1820 return hsa_amd_agents_allow_access(DeviceInfo().HSAAgents.size(),
1821 &DeviceInfo().HSAAgents[0], NULL__null, Ptr);
1822}
1823} // namespace core
1824
1825static hsa_status_t GetIsaInfo(hsa_isa_t isa, void *data) {
1826 hsa_status_t err;
1827 uint32_t name_len;
1828 err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME_LENGTH, &name_len);
1829 if (err != HSA_STATUS_SUCCESS) {
1830 DP("Error getting ISA info length\n"){};
1831 return err;
1832 }
1833
1834 char TargetID[name_len];
1835 err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, TargetID);
1836 if (err != HSA_STATUS_SUCCESS) {
1837 DP("Error getting ISA info name\n"){};
1838 return err;
1839 }
1840
1841 auto TripleTargetID = llvm::StringRef(TargetID);
1842 if (TripleTargetID.consume_front("amdgcn-amd-amdhsa")) {
1843 DeviceInfo().TargetID.push_back(TripleTargetID.ltrim('-').str());
1844 }
1845 return HSA_STATUS_SUCCESS;
1846}
1847
1848 /// Parse a TargetID to get the processor arch and feature map.
1849 /// Returns the processor subarch.
1850 /// Returns the TargetID features in the \p FeatureMap argument.
1851 /// If the \p TargetID contains feature+, map it to true in \p FeatureMap.
1852 /// If the \p TargetID contains feature-, map it to false in \p FeatureMap.
1853 /// If the \p TargetID does not contain a feature (default), do not map it.
1854StringRef parseTargetID(StringRef TargetID, StringMap<bool> &FeatureMap) {
1855 if (TargetID.empty())
1856 return llvm::StringRef();
1857
1858 auto ArchFeature = TargetID.split(":");
1859 auto Arch = ArchFeature.first;
1860 auto Features = ArchFeature.second;
1861 if (Features.empty())
1862 return Arch;
1863
1864 if (Features.contains("sramecc+")) {
1865 FeatureMap.insert(std::pair<std::string, bool>("sramecc", true));
1866 } else if (Features.contains("sramecc-")) {
1867 FeatureMap.insert(std::pair<std::string, bool>("sramecc", false));
1868 }
1869 if (Features.contains("xnack+")) {
1870 FeatureMap.insert(std::pair<std::string, bool>("xnack", true));
1871 } else if (Features.contains("xnack-")) {
1872 FeatureMap.insert(std::pair<std::string, bool>("xnack", false));
1873 }
1874
1875 return Arch;
1876}
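A small sketch of what parseTargetID returns for a fully specified TargetID string; the "gfx90a" TargetID and the helper below are assumptions for illustration, not part of rtl.cpp.

static void parseTargetIDExample() {
  StringMap<bool> Features;
  StringRef Arch = parseTargetID("gfx90a:sramecc+:xnack-", Features);
  // Arch == "gfx90a"; Features["sramecc"] == true; Features["xnack"] == false.
  (void)Arch;
}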
1877
1878 /// Checks if an image \p ImgInfo is compatible with the current
1879 /// system's environment \p EnvInfo.
1880bool IsImageCompatibleWithEnv(const char *ImgInfo, std::string EnvInfo) {
1881 llvm::StringRef ImgTID(ImgInfo), EnvTID(EnvInfo);
1882
1883 // Compatible in case of exact match
1884 if (ImgTID == EnvTID) {
1885 DP("Compatible: Exact match \t[Image: %s]\t:\t[Environment: %s]\n",{}
1886 ImgTID.data(), EnvTID.data()){};
1887 return true;
1888 }
1889
1890 // Incompatible if Archs mismatch.
1891 StringMap<bool> ImgMap, EnvMap;
1892 StringRef ImgArch = parseTargetID(ImgTID, ImgMap);
1893 StringRef EnvArch = parseTargetID(EnvTID, EnvMap);
1894
1895 // Both EnvArch and ImgArch can't be empty here.
1896 if (EnvArch.empty() || ImgArch.empty() || !ImgArch.contains(EnvArch)) {
1897 DP("Incompatible: Processor mismatch \t[Image: %s]\t:\t[Environment: %s]\n",{}
1898 ImgTID.data(), EnvTID.data()){};
1899 return false;
1900 }
1901
1902 // Incompatible if image has more features than the environment, irrespective
1903 // of type or sign of features.
1904 if (ImgMap.size() > EnvMap.size()) {
1905 DP("Incompatible: Image has more features than the environment \t[Image: "{}
1906 "%s]\t:\t[Environment: %s]\n",{}
1907 ImgTID.data(), EnvTID.data()){};
1908 return false;
1909 }
1910
1911 // Compatible if each target feature specified by the environment is
1912 // compatible with the corresponding target feature of the image. A target
1913 // feature is compatible if the image does not specify it (meaning Any), or
1914 // if it specifies it with the same value (meaning On or Off).
1915 for (const auto &ImgFeature : ImgMap) {
1916 auto EnvFeature = EnvMap.find(ImgFeature.first());
1917 if (EnvFeature == EnvMap.end()) {
1918 DP("Incompatible: Value of Image's non-ANY feature is not matching with "{}
1919 "the Environment feature's ANY value \t[Image: %s]\t:\t[Environment: "{}
1920 "%s]\n",{}
1921 ImgTID.data(), EnvTID.data()){};
1922 return false;
1923 } else if (EnvFeature->first() == ImgFeature.first() &&
1924 EnvFeature->second != ImgFeature.second) {
1925 DP("Incompatible: Value of Image's non-ANY feature is not matching with "{}
1926 "the Environment feature's non-ANY value \t[Image: "{}
1927 "%s]\t:\t[Environment: %s]\n",{}
1928 ImgTID.data(), EnvTID.data()){};
1929 return false;
1930 }
1931 }
1932
1933 // Image is compatible if all features of Environment are:
1934 // - either, present in the Image's features map with the same sign,
1935 // - or, the feature is missing from Image's features map i.e. it is
1936 // set to ANY
1937 DP("Compatible: Target IDs are compatible \t[Image: %s]\t:\t[Environment: "{}
1938 "%s]\n",{}
1939 ImgTID.data(), EnvTID.data()){};
1940 return true;
1941}
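A few expected outcomes of IsImageCompatibleWithEnv for assumed TargetID pairs, following the rules implemented above; the helper is illustrative only and not part of rtl.cpp.

static void imageCompatibilityExamples() {
  // Exact match: compatible.
  bool A = IsImageCompatibleWithEnv("gfx90a:xnack-", "gfx90a:xnack-"); // true
  // Image leaves xnack as ANY: compatible with either environment setting.
  bool B = IsImageCompatibleWithEnv("gfx90a", "gfx90a:xnack+");        // true
  // Same feature with opposite signs: incompatible.
  bool C = IsImageCompatibleWithEnv("gfx90a:xnack+", "gfx90a:xnack-"); // false
  (void)A; (void)B; (void)C;
}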
1942
1943extern "C" {
1944int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
1945 return elfMachineIdIsAmdgcn(Image);
1946}
1947
1948int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image,
1949 __tgt_image_info *info) {
1950 if (!__tgt_rtl_is_valid_binary(image))
1951 return false;
1952
1953 // A subarchitecture was not specified. Assume it is compatible.
1954 if (!info->Arch)
1955 return true;
1956
1957 int32_t NumberOfDevices = __tgt_rtl_number_of_devices();
1958
1959 for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
1960 __tgt_rtl_init_device(DeviceId);
1961 hsa_agent_t agent = DeviceInfo().HSAAgents[DeviceId];
1962 hsa_status_t err = hsa_agent_iterate_isas(agent, GetIsaInfo, &DeviceId);
1963 if (err != HSA_STATUS_SUCCESS) {
1964 DP("Error iterating ISAs\n"){};
1965 return false;
1966 }
1967 if (!IsImageCompatibleWithEnv(info->Arch, DeviceInfo().TargetID[DeviceId]))
1968 return false;
1969 }
1970 DP("Image has Target ID compatible with the current environment: %s\n",{}
1971 info->Arch){};
1972 return true;
1973}
1974
1975int32_t __tgt_rtl_init_plugin() { return OFFLOAD_SUCCESS(0); }
1976int32_t __tgt_rtl_deinit_plugin() { return OFFLOAD_SUCCESS(0); }
1977
1978int __tgt_rtl_number_of_devices() {
1979 // If the construction failed, no methods are safe to call
1980 if (DeviceInfo().ConstructionSucceeded) {
1981 return DeviceInfo().NumberOfDevices;
1982 }
1983 DP("AMDGPU plugin construction failed. Zero devices available\n"){};
1984 return 0;
1985}
1986
1987int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
1988 DP("Init requires flags to %ld\n", RequiresFlags){};
1989 DeviceInfo().RequiresFlags = RequiresFlags;
1990 return RequiresFlags;
1991}
1992
1993int32_t __tgt_rtl_init_device(int DeviceId) {
1994 hsa_status_t Err = hsa_init();
1995 if (Err != HSA_STATUS_SUCCESS) {
1996 DP("HSA Initialization Failed.\n"){};
1997 return HSA_STATUS_ERROR;
1998 }
1999 // this is per device id init
2000 DP("Initialize the device id: %d\n", DeviceId){};
2001
2002 hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId];
2003
2004 // Get number of Compute Unit
2005 uint32_t ComputeUnits = 0;
2006 Err = hsa_agent_get_info(
2007 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
2008 &ComputeUnits);
2009 if (Err != HSA_STATUS_SUCCESS) {
2010 DeviceInfo().ComputeUnits[DeviceId] = 1;
2011 DP("Error getting compute units : settiing to 1\n"){};
2012 } else {
2013 DeviceInfo().ComputeUnits[DeviceId] = ComputeUnits;
2014 DP("Using %d compute unis per grid\n", DeviceInfo().ComputeUnits[DeviceId]){};
2015 }
2016
2017 char GetInfoName[64]; // 64 max size returned by get info
2018 Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
2019 (void *)GetInfoName);
2020 if (Err)
2021 DeviceInfo().GPUName[DeviceId] = "--unknown gpu--";
2022 else {
2023 DeviceInfo().GPUName[DeviceId] = GetInfoName;
2024 }
2025
2026 if (print_kernel_trace & STARTUP_DETAILS)
2027 DP("Device#%-2d CU's: %2d %s\n", DeviceId,{}
2028 DeviceInfo().ComputeUnits[DeviceId],{}
2029 DeviceInfo().GPUName[DeviceId].c_str()){};
2030
2031 // Query attributes to determine number of threads/block and blocks/grid.
2032 uint16_t WorkgroupMaxDim[3];
2033 Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
2034 &WorkgroupMaxDim);
2035 if (Err != HSA_STATUS_SUCCESS) {
2036 DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams;
2037 DP("Error getting grid dims: num groups : %d\n",{}
2038 RTLDeviceInfoTy::DefaultNumTeams){};
2039 } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
2040 DeviceInfo().GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0];
2041 DP("Using %d ROCm blocks per grid\n",{}
2042 DeviceInfo().GroupsPerDevice[DeviceId]){};
2043 } else {
2044 DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit;
2045 DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "{}
2046 "at the hard limit\n",{}
2047 WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit){};
2048 }
2049
2050 // Get thread limit
2051 hsa_dim3_t GridMaxDim;
2052 Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim);
2053 if (Err == HSA_STATUS_SUCCESS) {
2054 DeviceInfo().ThreadsPerGroup[DeviceId] =
2055 reinterpret_cast<uint32_t *>(&GridMaxDim)[0] /
2056 DeviceInfo().GroupsPerDevice[DeviceId];
2057
2058 if (DeviceInfo().ThreadsPerGroup[DeviceId] == 0) {
2059 DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
2060 DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize){};
2061 } else if (enforceUpperBound(&DeviceInfo().ThreadsPerGroup[DeviceId],
2062 RTLDeviceInfoTy::MaxWgSize)) {
2063 DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize){};
2064 } else {
2065 DP("Using ROCm Queried thread limit: %d\n",{}
2066 DeviceInfo().ThreadsPerGroup[DeviceId]){};
2067 }
2068 } else {
2069 DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
2070 DP("Error getting max block dimension, use default:%d \n",{}
2071 RTLDeviceInfoTy::MaxWgSize){};
2072 }
2073
2074 // Get wavefront size
2075 uint32_t WavefrontSize = 0;
2076 Err =
2077 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize);
2078 if (Err == HSA_STATUS_SUCCESS) {
2079 DP("Queried wavefront size: %d\n", WavefrontSize){};
2080 DeviceInfo().WarpSize[DeviceId] = WavefrontSize;
2081 } else {
2082 // TODO: Burn the wavefront size into the code object
2083 DP("Warning: Unknown wavefront size, assuming 64\n"){};
2084 DeviceInfo().WarpSize[DeviceId] = 64;
2085 }
2086
2087 // Adjust teams to the env variables
2088
2089 if (DeviceInfo().Env.TeamLimit > 0 &&
2090 (enforceUpperBound(&DeviceInfo().GroupsPerDevice[DeviceId],
2091 DeviceInfo().Env.TeamLimit))) {
2092 DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",{}
2093 DeviceInfo().Env.TeamLimit){};
2094 }
2095
2096 // Set default number of teams
2097 if (DeviceInfo().Env.NumTeams > 0) {
2098 DeviceInfo().NumTeams[DeviceId] = DeviceInfo().Env.NumTeams;
2099 DP("Default number of teams set according to environment %d\n",{}
2100 DeviceInfo().Env.NumTeams){};
2101 } else {
2102 char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC");
2103 int TeamsPerCU = DefaultTeamsPerCU;
2104 if (TeamsPerCUEnvStr) {
2105 TeamsPerCU = std::stoi(TeamsPerCUEnvStr);
2106 }
2107
2108 DeviceInfo().NumTeams[DeviceId] =
2109 TeamsPerCU * DeviceInfo().ComputeUnits[DeviceId];
2110 DP("Default number of teams = %d * number of compute units %d\n",{}
2111 TeamsPerCU, DeviceInfo().ComputeUnits[DeviceId]){};
2112 }
2113
2114 if (enforceUpperBound(&DeviceInfo().NumTeams[DeviceId],
2115 DeviceInfo().GroupsPerDevice[DeviceId])) {
2116 DP("Default number of teams exceeds device limit, capping at %d\n",{}
2117 DeviceInfo().GroupsPerDevice[DeviceId]){};
2118 }
2119
2120 // Adjust threads to the env variables
2121 if (DeviceInfo().Env.TeamThreadLimit > 0 &&
2122 (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId],
2123 DeviceInfo().Env.TeamThreadLimit))) {
2124 DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n",{}
2125 DeviceInfo().Env.TeamThreadLimit){};
2126 }
2127
2128 // Set default number of threads
2129 DeviceInfo().NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize;
2130 DP("Default number of threads set according to library's default %d\n",{}
2131 RTLDeviceInfoTy::DefaultWgSize){};
2132 if (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId],
2133 DeviceInfo().ThreadsPerGroup[DeviceId])) {
2134 DP("Default number of threads exceeds device limit, capping at %d\n",{}
2135 DeviceInfo().ThreadsPerGroup[DeviceId]){};
2136 }
2137
2138 DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",{}
2139 DeviceId, DeviceInfo().GroupsPerDevice[DeviceId],{}
2140 DeviceInfo().ThreadsPerGroup[DeviceId]){};
2141
2142 DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId,{}
2143 DeviceInfo().WarpSize[DeviceId], DeviceInfo().ThreadsPerGroup[DeviceId],{}
2144 DeviceInfo().GroupsPerDevice[DeviceId],{}
2145 DeviceInfo().GroupsPerDevice[DeviceId] *{}
2146 DeviceInfo().ThreadsPerGroup[DeviceId]){};
2147
2148 return OFFLOAD_SUCCESS(0);
2149}
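A worked example, with assumed values and not part of rtl.cpp, of the default team count computed above when Env.NumTeams is not set: teams-per-CU multiplied by the agent's compute-unit count.

#include <cstdio>

int main() {
  int TeamsPerCU = 4;     // assumed default, overridable via OMP_TARGET_TEAMS_PER_PROC
  int ComputeUnits = 104; // assumed CU count reported by the agent
  printf("default teams = %d\n", TeamsPerCU * ComputeUnits); // 416
  return 0;
}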
2150
2151static __tgt_target_table *
2152__tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image);
2153
2154__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
2155 __tgt_device_image *Image) {
2156 DeviceInfo().LoadRunLock.lock();
2157 __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image);
2158 DeviceInfo().LoadRunLock.unlock();
2159 return Res;
2160}
2161
2162__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
2163 __tgt_device_image *Image) {
2164 // This function loads the device image onto gpu[DeviceId] and does other
2165 // per-image initialization work. Specifically:
2166 //
2167 // - Initialize a DeviceEnvironmentTy instance embedded in the
2168 // image at the symbol "omptarget_device_environment"
2169 // Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL.
2170 //
2171 // - Allocate a large array per-gpu (could be moved to init_device)
2172 // - Read a uint64_t at symbol omptarget_nvptx_device_State_size
2173 // - Allocate at least that many bytes of gpu memory
2174 // - Zero initialize it
2175 // - Write the pointer to the symbol omptarget_nvptx_device_State
2176 //
2177 // - Pulls some per-kernel information together from various sources and
2178 // records it in the KernelsList for quicker access later
2179 //
2180 // The initialization can be done before or after loading the image onto the
2181 // gpu. This function presently does a mixture. Using the hsa api to get/set
2182 // the information is simpler to implement, in exchange for more complicated
2183 // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes
2184 // back from the gpu vs a hashtable lookup on the host.
2185
2186 const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart;
2187
2188 DeviceInfo().clearOffloadEntriesTable(DeviceId);
2189
2190 // We do not need to set the ELF version because the caller of this function
2191 // had to do that to decide the right runtime to use
2192
2193 if (!elfMachineIdIsAmdgcn(Image))
2194 return NULL__null;
2195
2196 {
2197 auto Env =
2198 DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
2199 DeviceInfo().Env.DynamicMemSize, Image, ImgSize);
2200
2201 auto &KernelInfo = DeviceInfo().KernelInfoTable[DeviceId];
2202 auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
2203 hsa_status_t Err = moduleRegisterFromMemoryToPlace(
2204 KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId,
2205 [&](void *Data, size_t Size) {
2206 if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) {
2207 __atomic_store_n(&DeviceInfo().HostcallRequired, true,
2208 __ATOMIC_RELEASE);
2209 }
2210 return Env.beforeLoading(Data, Size);
2211 },
2212 DeviceInfo().HSAExecutables);
2213
2214 check("Module registering", Err){};
2215 if (Err != HSA_STATUS_SUCCESS) {
2216 const char *DeviceName = DeviceInfo().GPUName[DeviceId].c_str();
2217 const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image));
2218
2219 if (strcmp(DeviceName, ElfName) != 0) {
2220 DP("Possible gpu arch mismatch: device:%s, image:%s please check"{}
2221 " compiler flag: -march=<gpu>\n",{}
2222 DeviceName, ElfName){};
2223 } else {
2224 DP("Error loading image onto GPU: %s\n", get_error_string(Err)){};
2225 }
2226
2227 return NULL__null;
2228 }
2229
2230 Err = Env.afterLoading();
2231 if (Err != HSA_STATUS_SUCCESS) {
2232 return NULL__null;
2233 }
2234 }
2235
2236 DP("AMDGPU module successfully loaded!\n"){};
2237
2238 {
2239 // The device_State array is either a large value in .bss or a void* that
2240 // needs to be assigned a pointer to an array of device_state_bytes bytes.
2241 // If absent, it has been deadstripped and needs no setup.
2242
2243 void *StatePtr;
2244 uint32_t StatePtrSize;
2245 auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
2246 hsa_status_t Err = interop_hsa_get_symbol_info(
2247 SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr,
2248 &StatePtrSize);
2249
2250 if (Err != HSA_STATUS_SUCCESS) {
2251 DP("No device_state symbol found, skipping initialization\n"){};
2252 } else {
2253 if (StatePtrSize < sizeof(void *)) {
2254 DP("unexpected size of state_ptr %u != %zu\n", StatePtrSize,{}
2255 sizeof(void *)){};
2256 return NULL__null;
2257 }
2258
2259 // If it's larger than a void*, assume it's a .bss array and no further
2260 // initialization is required. Only try to set up a pointer when the size
2261 // equals sizeof(void*).
2262 if (StatePtrSize == sizeof(void *)) {
2263 uint64_t DeviceStateBytes =
2264 getDeviceStateBytes((char *)Image->ImageStart, ImgSize);
2265 if (DeviceStateBytes == 0) {
2266 DP("Can't initialize device_State, missing size information\n"){};
2267 return NULL__null;
2268 }
2269
2270 auto &DSS = DeviceInfo().DeviceStateStore[DeviceId];
2271 if (DSS.first.get() == nullptr) {
2272 assert(DSS.second == 0);
2273 void *Ptr = NULL__null;
2274 hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId);
2275 if (Err != HSA_STATUS_SUCCESS) {
2276 DP("Failed to allocate device_state array\n"){};
2277 return NULL__null;
2278 }
2279 DSS = {
2280 std::unique_ptr<void, RTLDeviceInfoTy::ImplFreePtrDeletor>{Ptr},
2281 DeviceStateBytes,
2282 };
2283 }
2284
2285 void *Ptr = DSS.first.get();
2286 if (DeviceStateBytes != DSS.second) {
2287 DP("Inconsistent sizes of device_State unsupported\n"){};
2288 return NULL__null;
2289 }
2290
2291 // write ptr to device memory so it can be used by later kernels
2292 Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr,
2293 sizeof(void *), DeviceId);
2294 if (Err != HSA_STATUS_SUCCESS) {
2295 DP("memcpy install of state_ptr failed\n"){};
2296 return NULL__null;
2297 }
2298 }
2299 }
2300 }
2301
2302 // Here, we take advantage of the data that is appended after img_end to get
2303 // the names of the symbols we need to load. This data consists of the host
2304 // entries begin and end as well as the target name (see the offloading
2305 // linker script creation in the clang compiler).
2306
2307 // Find the symbols in the module by name. The name can be obtained by
2308 // concatenating the host entry name with the target name.
2309
2310 __tgt_offload_entry *HostBegin = Image->EntriesBegin;
2311 __tgt_offload_entry *HostEnd = Image->EntriesEnd;
2312
2313 for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
2314
2315 if (!E->addr) {
2316 // The host should always have something in the address to
2317 // uniquely identify the target region.
2318 DP("Analyzing host entry '<null>' (size = %lld)...\n",{}
2319 (unsigned long long)E->size){};
2320 return NULL__null;
2321 }
2322
2323 if (E->size) {
2324 __tgt_offload_entry Entry = *E;
2325
2326 void *Varptr;
2327 uint32_t Varsize;
2328
2329 auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
2330 hsa_status_t Err = interop_hsa_get_symbol_info(
2331 SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize);
2332
2333 if (Err != HSA_STATUS_SUCCESS) {
2334 // Inform the user what symbol prevented offloading
2335 DP("Loading global '%s' (Failed)\n", E->name){};
2336 return NULL__null;
2337 }
2338
2339 if (Varsize != E->size) {
2340 DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name,{}
2341 Varsize, E->size){};
2342 return NULL__null;
2343 }
2344
2345 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",{}
2346 DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr)){};
2347 Entry.addr = (void *)Varptr;
2348
2349 DeviceInfo().addOffloadEntry(DeviceId, Entry);
2350
2351 if (DeviceInfo().RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
2352 E->flags & OMP_DECLARE_TARGET_LINK) {
2353 // If unified memory is present any target link variables
2354 // can access host addresses directly. There is no longer a
2355 // need for device copies.
2356 Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr,
2357 sizeof(void *), DeviceId);
2358 if (Err != HSA_STATUS_SUCCESS)
2359 DP("Error when copying USM\n"){};
2360 DP("Copy linked variable host address (" DPxMOD ")"{}
2361 "to device address (" DPxMOD ")\n",{}
2362 DPxPTR(*((void **)E->addr)), DPxPTR(Varptr)){};
2363 }
2364
2365 continue;
2366 }
2367
2368 DP("to find the kernel name: %s size: %lu\n", E->name, strlen(E->name)){};
2369
2370 // Errors in kernarg_segment_size were previously treated as 0 (or as undef).
2371 uint32_t KernargSegmentSize = 0;
2372 auto &KernelInfoMap = DeviceInfo().KernelInfoTable[DeviceId];
2373 hsa_status_t Err = HSA_STATUS_SUCCESS;
2374 if (!E->name) {
2375 Err = HSA_STATUS_ERROR;
2376 } else {
2377 std::string KernelStr = std::string(E->name);
2378 auto It = KernelInfoMap.find(KernelStr);
2379 if (It != KernelInfoMap.end()) {
2380 atl_kernel_info_t Info = It->second;
2381 KernargSegmentSize = Info.kernel_segment_size;
2382 } else {
2383 Err = HSA_STATUS_ERROR;
Value stored to 'Err' is never read
2384 }
2385 }
2386
2387 // Default value GENERIC (in case the symbol is missing from the code object)
2388 llvm::omp::OMPTgtExecModeFlags ExecModeVal =
2389 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
2390
2391 // get flat group size if present, else Default_WG_Size
2392 int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
2393
2394 // get Kernel Descriptor if present.
2395 // Keep struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp
2396 struct KernDescValType {
2397 uint16_t Version;
2398 uint16_t TSize;
2399 uint16_t WGSize;
2400 };
2401 struct KernDescValType KernDescVal;
2402 std::string KernDescNameStr(E->name);
2403 KernDescNameStr += "_kern_desc";
2404 const char *KernDescName = KernDescNameStr.c_str();
2405
2406 const void *KernDescPtr;
2407 uint32_t KernDescSize;
2408 void *CallStackAddr = nullptr;
2409 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, KernDescName,
2410 &KernDescPtr, &KernDescSize);
2411
2412 if (Err == HSA_STATUS_SUCCESS) {
2413 if ((size_t)KernDescSize != sizeof(KernDescVal))
2414 DP("Loading global computation properties '%s' - size mismatch (%u != "{}
2415 "%lu)\n",{}
2416 KernDescName, KernDescSize, sizeof(KernDescVal)){};
2417
2418 memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);
2419
2420 // Check structure size against recorded size.
2421 if ((size_t)KernDescSize != KernDescVal.TSize)
2422 DP("KernDescVal size %lu does not match advertized size %d for '%s'\n",{}
2423 sizeof(KernDescVal), KernDescVal.TSize, KernDescName){};
2424
2425 DP("After loading global for %s KernDesc \n", KernDescName){};
2426 DP("KernDesc: Version: %d\n", KernDescVal.Version){};
2427 DP("KernDesc: TSize: %d\n", KernDescVal.TSize){};
2428 DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize){};
2429
2430 if (KernDescVal.WGSize == 0) {
2431 KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize;
2432 DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WGSize){};
2433 }
2434 WGSizeVal = KernDescVal.WGSize;
2435 DP("WGSizeVal %d\n", WGSizeVal){};
2436 check("Loading KernDesc computation property", Err){};
2437 } else {
2438 DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName){};
2439
2440 // Flat group size
2441 std::string WGSizeNameStr(E->name);
2442 WGSizeNameStr += "_wg_size";
2443 const char *WGSizeName = WGSizeNameStr.c_str();
2444
2445 const void *WGSizePtr;
2446 uint32_t WGSize;
2447 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, WGSizeName,
2448 &WGSizePtr, &WGSize);
2449
2450 if (Err == HSA_STATUS_SUCCESS) {
2451 if ((size_t)WGSize != sizeof(int16_t)) {
2452 DP("Loading global computation properties '%s' - size mismatch (%u "{}
2453 "!= "{}
2454 "%lu)\n",{}
2455 WGSizeName, WGSize, sizeof(int16_t)){};
2456 return NULL__null;
2457 }
2458
2459 memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize);
2460
2461 DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal){};
2462
2463 if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize ||
2464 WGSizeVal > RTLDeviceInfoTy::MaxWgSize) {
2465 DP("Error wrong WGSize value specified in HSA code object file: "{}
2466 "%d\n",{}
2467 WGSizeVal){};
2468 WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
2469 }
2470 } else {
2471 DP("Warning: Loading WGSize '%s' - symbol not found, "{}
2472 "using default value %d\n",{}
2473 WGSizeName, WGSizeVal){};
2474 }
2475
2476 check("Loading WGSize computation property", Err){};
2477 }
2478
2479 // Read execution mode from global in binary
2480 std::string ExecModeNameStr(E->name);
2481 ExecModeNameStr += "_exec_mode";
2482 const char *ExecModeName = ExecModeNameStr.c_str();
2483
2484 const void *ExecModePtr;
2485 uint32_t VarSize;
2486 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, ExecModeName,
2487 &ExecModePtr, &VarSize);
2488
2489 if (Err == HSA_STATUS_SUCCESS) {
2490 if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
2491 DP("Loading global computation properties '%s' - size mismatch(%u != "{}
2492 "%lu)\n",{}
2493 ExecModeName, VarSize, sizeof(llvm::omp::OMPTgtExecModeFlags)){};
2494 return NULL__null;
2495 }
2496
2497 memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize);
2498
2499 DP("After loading global for %s ExecMode = %d\n", ExecModeName,{}
2500 ExecModeVal){};
2501
2502 if (ExecModeVal < 0 ||
2503 ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) {
2504 DP("Error wrong exec_mode value specified in HSA code object file: "{}
2505 "%d\n",{}
2506 ExecModeVal){};
2507 return NULL__null;
2508 }
2509 } else {
2510 DP("Loading global exec_mode '%s' - symbol missing, using default "{}
2511 "value "{}
2512 "GENERIC (1)\n",{}
2513 ExecModeName){};
2514 }
2515 check("Loading computation property", Err){};
2516
2517 KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
2518 CallStackAddr, E->name, KernargSegmentSize,
2519 DeviceInfo().KernArgPool));
2520 __tgt_offload_entry Entry = *E;
2521 Entry.addr = (void *)&KernelsList.back();
2522 DeviceInfo().addOffloadEntry(DeviceId, Entry);
2523 DP("Entry point %ld maps to %s\n", E - HostBegin, E->name){};
2524 }
2525
2526 return DeviceInfo().getOffloadEntriesTable(DeviceId);
2527}
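A sketch of the per-kernel metadata symbols the loader above looks up by appending suffixes to the host entry name; the entry name and helper are hypothetical, not part of rtl.cpp.

#include <string>

static void kernelMetadataNamesExample() {
  // Assumed offload entry name; real names are generated by the compiler.
  std::string Name = "__omp_offloading_fd00_1_foo_l10";
  std::string KernDesc = Name + "_kern_desc"; // {Version, TSize, WGSize} record
  std::string WGSize = Name + "_wg_size";     // int16_t flat workgroup size
  std::string ExecMode = Name + "_exec_mode"; // OMPTgtExecModeFlags value
  (void)KernDesc; (void)WGSize; (void)ExecMode;
}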
2528
2529void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) {
2530 void *Ptr = NULL__null;
2531 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2532
2533 hsa_amd_memory_pool_t MemoryPool;
2534 switch (Kind) {
2535 case TARGET_ALLOC_DEFAULT:
2536 case TARGET_ALLOC_DEVICE:
2537 // GPU memory
2538 MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId);
2539 break;
2540 case TARGET_ALLOC_HOST:
2541 // non-migratable memory accessible by host and device(s)
2542 MemoryPool = DeviceInfo().getHostMemoryPool();
2543 break;
2544 default:
2545 REPORT("Invalid target data allocation kind or requested allocator not "do { fprintf(stderr, "AMDGPU" " error: "); fprintf(stderr, "Invalid target data allocation kind or requested allocator not "
"implemented yet\n"); } while (0);
2546 "implemented yet\n")do { fprintf(stderr, "AMDGPU" " error: "); fprintf(stderr, "Invalid target data allocation kind or requested allocator not "
"implemented yet\n"); } while (0);
;
2547 return NULL__null;
2548 }
2549
2550 hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr);
2551 DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size,{}
2552 (long long unsigned)(Elf64_Addr)Ptr){};
2553 Ptr = (Err == HSA_STATUS_SUCCESS) ? Ptr : NULL__null;
2554 return Ptr;
2555}
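A hedged usage sketch of the allocation kinds handled above; the device id, size, and wrapper function are assumptions for illustration and not part of rtl.cpp. Any other kind falls through to the error path.

static void dataAllocExample() {
  void *DevPtr = __tgt_rtl_data_alloc(/*DeviceId=*/0, /*Size=*/1024, nullptr,
                                      TARGET_ALLOC_DEVICE); // GPU memory pool
  void *HostPtr = __tgt_rtl_data_alloc(0, 1024, nullptr,
                                       TARGET_ALLOC_HOST);  // host-accessible pool
  (void)DevPtr; (void)HostPtr;
}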
2556
2557int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr,
2558 int64_t Size) {
2559 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2560 __tgt_async_info AsyncInfo;
2561 int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
2562 if (Rc != OFFLOAD_SUCCESS(0))
2563 return OFFLOAD_FAIL(~0);
2564
2565 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
2566}
2567
2568int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr,
2569 int64_t Size, __tgt_async_info *AsyncInfo) {
2570 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2571 if (AsyncInfo) {
2572 initAsyncInfo(AsyncInfo);
2573 return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo);
2574 }
2575 return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size);
2576}
2577
2578int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr,
2579 int64_t Size) {
2580 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2581 __tgt_async_info AsyncInfo;
2582 int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
2583 if (Rc != OFFLOAD_SUCCESS(0))
2584 return OFFLOAD_FAIL(~0);
2585
2586 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
2587}
2588
2589int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr,
2590 int64_t Size,
2591 __tgt_async_info *AsyncInfo) {
2592 assert(AsyncInfo && "AsyncInfo is nullptr");
2593 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2594 initAsyncInfo(AsyncInfo);
2595 return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo);
2596}
2597
2598int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr, int32_t) {
2599 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2600 // HSA can free pointers allocated from different types of memory pool.
2601 hsa_status_t Err;
2602 DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr){};
2603 Err = core::Runtime::Memfree(TgtPtr);
2604 if (Err != HSA_STATUS_SUCCESS) {
2605 DP("Error when freeing CUDA memory\n"){};
2606 return OFFLOAD_FAIL(~0);
2607 }
2608 return OFFLOAD_SUCCESS(0);
2609}
2610
2611int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
2612 void **TgtArgs, ptrdiff_t *TgtOffsets,
2613 int32_t ArgNum, int32_t NumTeams,
2614 int32_t ThreadLimit,
2615 uint64_t LoopTripcount) {
2616
2617 DeviceInfo().LoadRunLock.lock_shared();
2618 int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
2619 ArgNum, NumTeams, ThreadLimit, LoopTripcount);
2620
2621 DeviceInfo().LoadRunLock.unlock_shared();
2622 return Res;
2623}
2624
2625int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr,
2626 void **TgtArgs, ptrdiff_t *TgtOffsets,
2627 int32_t ArgNum) {
2628 // use one team and one thread
2629 // fix thread num
2630 int32_t TeamNum = 1;
2631 int32_t ThreadLimit = 0; // use default
2632 return __tgt_rtl_run_target_team_region(DeviceId, TgtEntryPtr, TgtArgs,
2633 TgtOffsets, ArgNum, TeamNum,
2634 ThreadLimit, 0);
2635}
2636
2637int32_t __tgt_rtl_run_target_team_region_async(
2638 int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
2639 int32_t ArgNum, int32_t NumTeams, int32_t ThreadLimit,
2640 uint64_t LoopTripcount, __tgt_async_info *AsyncInfo) {
2641 assert(AsyncInfo && "AsyncInfo is nullptr");
2642 initAsyncInfo(AsyncInfo);
2643
2644 DeviceInfo().LoadRunLock.lock_shared();
2645 int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
2646 ArgNum, NumTeams, ThreadLimit, LoopTripcount);
2647
2648 DeviceInfo().LoadRunLock.unlock_shared();
2649 return Res;
2650}
2651
2652int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr,
2653 void **TgtArgs, ptrdiff_t *TgtOffsets,
2654 int32_t ArgNum,
2655 __tgt_async_info *AsyncInfo) {
2656 // use one team and one thread
2657 // fix thread num
2658 int32_t TeamNum = 1;
2659 int32_t ThreadLimit = 0; // use default
2660 return __tgt_rtl_run_target_team_region_async(DeviceId, TgtEntryPtr, TgtArgs,
2661 TgtOffsets, ArgNum, TeamNum,
2662 ThreadLimit, 0, AsyncInfo);
2663}
2664
2665int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) {
2666 assert(AsyncInfo && "AsyncInfo is nullptr");
2667
2668 // The CUDA plugin asserts that AsyncInfo->Queue is non-null, but this
2669 // invariant is not ensured by devices.cpp for amdgcn.
2670 // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr");
2671 if (AsyncInfo->Queue) {
2672 finiAsyncInfo(AsyncInfo);
2673 }
2674 return OFFLOAD_SUCCESS(0);
2675}
2676
2677void __tgt_rtl_print_device_info(int32_t DeviceId) {
2678 // TODO: Assertion to see if DeviceId is correct
2679 // NOTE: We don't need to set context for print device info.
2680
2681 DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]);
2682}
2683
2684} // extern "C"