Bug Summary

File: build/source/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
Warning: line 2320, column 9
Value stored to 'Err' is never read
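
For context, a minimal, self-contained sketch (illustrative only; it is not the flagged code at rtl.cpp:2320, which lies outside this excerpt) of the pattern the deadcode.DeadStores checker reports as "Value stored to 'X' is never read": a status value is assigned to a local variable that is then discarded without ever being examined.

    #include <cstdio>

    enum Status { Success, Failure };
    static Status doWork() { return Failure; }

    int flagged() {
      Status Err = doWork(); // warning: value stored to 'Err' is never read
      return 0;              // 'Err' is discarded without ever being examined
    }

    int fixed() {
      Status Err = doWork();
      if (Err != Success)    // reading 'Err' here silences the checker
        std::fprintf(stderr, "doWork failed\n");
      return 0;
    }

Typical fixes are either to test the stored status value or to drop the dead assignment.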

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name rtl.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-17/lib/clang/17 -D OMPT_SUPPORT=1 -D TARGET_NAME=AMDGPU -D _DEBUG -D _GLIBCXX_ASSERTIONS -D _GNU_SOURCE -D _LIBCPP_ENABLE_ASSERTIONS -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I projects/openmp/libomptarget/plugins/amdgpu -I /build/source/openmp/libomptarget/plugins/amdgpu -I include -I /build/source/llvm/include -I projects/openmp/runtime/src -I /build/source/openmp/libomptarget/plugins/amdgpu/dynamic_hsa -I /build/source/openmp/libomptarget/include -I /build/source/openmp/libomptarget/plugins/amdgpu/impl -I /build/source/openmp/libomptarget/plugins/amdgpu/../../plugins-nextgen/amdgpu/utils -I /build/source/openmp/libomptarget/plugins/common/elf_common -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-17/lib/clang/17/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/source/= -source-date-epoch 1683717183 -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -Wno-extra -Wno-pedantic -Wno-maybe-uninitialized -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=protected -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig 
-D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2023-05-10-133810-16478-1 -x c++ /build/source/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
1//===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// RTL for AMD hsa machine
10//
11//===----------------------------------------------------------------------===//
12
13#include "llvm/ADT/StringRef.h"
14#include "llvm/Frontend/OpenMP/OMPConstants.h"
15#include "llvm/Frontend/OpenMP/OMPGridValues.h"
16#include "llvm/Object/ELF.h"
17#include "llvm/Object/ELFObjectFile.h"
18
19#include <algorithm>
20#include <assert.h>
21#include <cstdio>
22#include <cstdlib>
23#include <cstring>
24#include <functional>
25#include <list>
26#include <memory>
27#include <mutex>
28#include <shared_mutex>
29#include <unordered_map>
30#include <vector>
31
32#include "ELFSymbols.h"
33#include "impl_runtime.h"
34#include "interop_hsa.h"
35
36#include "UtilitiesRTL.h"
37#include "internal.h"
38#include "rt.h"
39
40#include "DeviceEnvironment.h"
41#include "get_elf_mach_gfx_name.h"
42#include "omptargetplugin.h"
43#include "print_tracing.h"
44
45using namespace llvm;
46using namespace llvm::object;
47using namespace llvm::ELF;
48using namespace llvm::omp::target::plugin::utils;
49
50// hostrpc interface, FIXME: consider moving to its own include these are
51// statically linked into amdgpu/plugin if present from hostrpc_services.a,
52// linked as --whole-archive to override the weak symbols that are used to
53// implement a fallback for toolchains that do not yet have a hostrpc library.
54extern "C" {
55uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ,
56 uint32_t DeviceId);
57hsa_status_t hostrpc_init();
58hsa_status_t hostrpc_terminate();
59
60__attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; }
61__attribute__((weak)) hsa_status_t hostrpc_terminate() {
62 return HSA_STATUS_SUCCESS;
63}
64__attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *,
65 uint32_t DeviceId) {
66 DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "{}
67 "missing\n",{}
68 DeviceId){};
69 return 0;
70}
71}
72
73// Heuristic parameters used for kernel launch
74// Number of teams per CU to allow scheduling flexibility
75static const unsigned DefaultTeamsPerCU = 4;
76
77int print_kernel_trace;
78
79#ifdef OMPTARGET_DEBUG
80#define check(msg, status) \
81 if (status != HSA_STATUS_SUCCESS) { \
82 DP(#msg " failed\n"){}; \
83 } else { \
84 DP(#msg " succeeded\n"){}; \
85 }
86#else
87#define check(msg, status) \
88 {}
89#endif
90
91#include "elf_common.h"
92
93namespace hsa {
94template <typename C> hsa_status_t iterate_agents(C Cb) {
95 auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t {
96 C *Unwrapped = static_cast<C *>(Data);
97 return (*Unwrapped)(Agent);
98 };
99 return hsa_iterate_agents(L, static_cast<void *>(&Cb));
100}
101
102template <typename C>
103hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) {
104 auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t {
105 C *Unwrapped = static_cast<C *>(Data);
106 return (*Unwrapped)(MemoryPool);
107 };
108
109 return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&Cb));
110}
111
112} // namespace hsa
113
114/// Keep entries table per device
115struct FuncOrGblEntryTy {
116 __tgt_target_table Table;
117 std::vector<__tgt_offload_entry> Entries;
118};
119
120struct KernelArgPool {
121private:
122 static pthread_mutex_t Mutex;
123
124public:
125 uint32_t KernargSegmentSize;
126 void *KernargRegion = nullptr;
127 std::queue<int> FreeKernargSegments;
128
129 uint32_t kernargSizeIncludingImplicit() {
130 return KernargSegmentSize + sizeof(AMDGPUImplicitArgsTy);
131 }
132
133 ~KernelArgPool() {
134 if (KernargRegion) {
135 auto R = hsa_amd_memory_pool_free(KernargRegion);
136 if (R != HSA_STATUS_SUCCESS) {
137 DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R)){};
138 }
139 }
140 }
141
142 // Can't really copy or move a mutex
143 KernelArgPool() = default;
144 KernelArgPool(const KernelArgPool &) = delete;
145 KernelArgPool(KernelArgPool &&) = delete;
146
147 KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
148 : KernargSegmentSize(KernargSegmentSize) {
149
150 // impl uses one pool per kernel for all gpus, with a fixed upper size
151 // preserving that exact scheme here, including the queue<int>
152
153 hsa_status_t Err = hsa_amd_memory_pool_allocate(
154 MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS(1024 * 16), 0,
155 &KernargRegion);
156
157 if (Err != HSA_STATUS_SUCCESS) {
158 DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err)){};
159 KernargRegion = nullptr; // paranoid
160 return;
161 }
162
163 Err = core::allow_access_to_all_gpu_agents(KernargRegion);
164 if (Err != HSA_STATUS_SUCCESS) {
165 DP("hsa allow_access_to_all_gpu_agents failed: %s\n",{}
166 get_error_string(Err)){};
167 auto R = hsa_amd_memory_pool_free(KernargRegion);
168 if (R != HSA_STATUS_SUCCESS) {
169 // if free failed, can't do anything more to resolve it
170 DP("hsa memory poll free failed: %s\n", get_error_string(Err)){};
171 }
172 KernargRegion = nullptr;
173 return;
174 }
175
176 for (int I = 0; I < MAX_NUM_KERNELS(1024 * 16); I++) {
177 FreeKernargSegments.push(I);
178 }
179 }
180
181 void *allocate(uint64_t ArgNum) {
182 assert((ArgNum * sizeof(void *)) == KernargSegmentSize);
183 Lock L(&Mutex);
184 void *Res = nullptr;
185 if (!FreeKernargSegments.empty()) {
186
187 int FreeIdx = FreeKernargSegments.front();
188 Res = static_cast<void *>(static_cast<char *>(KernargRegion) +
189 (FreeIdx * kernargSizeIncludingImplicit()));
190 assert(FreeIdx == pointerToIndex(Res));
191 FreeKernargSegments.pop();
192 }
193 return Res;
194 }
195
196 void deallocate(void *Ptr) {
197 Lock L(&Mutex);
198 int Idx = pointerToIndex(Ptr);
199 FreeKernargSegments.push(Idx);
200 }
201
202private:
203 int pointerToIndex(void *Ptr) {
204 ptrdiff_t Bytes =
205 static_cast<char *>(Ptr) - static_cast<char *>(KernargRegion);
206 assert(Bytes >= 0);
207 assert(Bytes % kernargSizeIncludingImplicit() == 0);
208 return Bytes / kernargSizeIncludingImplicit();
209 }
210 struct Lock {
211 Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); }
212 ~Lock() { pthread_mutex_unlock(M); }
213 pthread_mutex_t *M;
214 };
215};
216pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER;
217
218std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
219 KernelArgPoolMap;
220
221/// Use a single entity to encode a kernel and a set of flags
222struct KernelTy {
223 llvm::omp::OMPTgtExecModeFlags ExecutionMode;
224 int16_t ConstWGSize;
225 int32_t DeviceId;
226 void *CallStackAddr = nullptr;
227 const char *Name;
228
229 KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
230 int32_t DeviceId, void *CallStackAddr, const char *Name,
231 uint32_t KernargSegmentSize,
232 hsa_amd_memory_pool_t &KernArgMemoryPool)
233 : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
234 DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
235 DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode){};
236
237 std::string N(Name);
238 if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
239 KernelArgPoolMap.insert(
240 std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
241 KernargSegmentSize, KernArgMemoryPool))));
242 }
243 }
244};
245
246/// List that contains all the kernels.
247/// FIXME: we may need this to be per device and per library.
248std::list<KernelTy> KernelsList;
249
250template <typename Callback> static hsa_status_t findAgents(Callback CB) {
251
252 hsa_status_t Err =
253 hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t {
254 hsa_device_type_t DeviceType;
255 // get_info fails iff HSA runtime not yet initialized
256 hsa_status_t Err =
257 hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
258
259 if (Err != HSA_STATUS_SUCCESS) {
260 if (print_kernel_trace > 0)
261 DP("rtl.cpp: err %s\n", get_error_string(Err)){};
262
263 return Err;
264 }
265
266 CB(DeviceType, Agent);
267 return HSA_STATUS_SUCCESS;
268 });
269
270 // iterate_agents fails iff HSA runtime not yet initialized
271 if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) {
272 DP("rtl.cpp: err %s\n", get_error_string(Err)){};
273 }
274
275 return Err;
276}
277
278static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source,
279 void *Data) {
280 if (Status != HSA_STATUS_SUCCESS) {
281 const char *StatusString;
282 if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) {
283 StatusString = "unavailable";
284 }
285 DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source,{}
286 Status, StatusString){};
287 abort();
288 }
289}
290
291namespace core {
292namespace {
293
294bool checkResult(hsa_status_t Err, const char *ErrMsg) {
295 if (Err == HSA_STATUS_SUCCESS)
296 return true;
297
298 REPORT("%s", ErrMsg);
299 REPORT("%s", get_error_string(Err));
300 return false;
301}
302
303void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) {
304 __atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE);
305}
306
307uint16_t createHeader() {
308 uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
309 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
310 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
311 return Header;
312}
313
314hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) {
315 bool AllocAllowed = false;
316 hsa_status_t Err = hsa_amd_memory_pool_get_info(
317 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
318 &AllocAllowed);
319 if (Err != HSA_STATUS_SUCCESS) {
320 DP("Alloc allowed in memory pool check failed: %s\n",{}
321 get_error_string(Err)){};
322 return Err;
323 }
324
325 size_t Size = 0;
326 Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
327 &Size);
328 if (Err != HSA_STATUS_SUCCESS) {
329 DP("Get memory pool size failed: %s\n", get_error_string(Err)){};
330 return Err;
331 }
332
333 return (AllocAllowed && Size > 0) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
334}
335
336hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
337 std::vector<hsa_amd_memory_pool_t> *Result =
338 static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data);
339
340 hsa_status_t Err;
341 if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) {
342 return Err;
343 }
344
345 Result->push_back(MemoryPool);
346 return HSA_STATUS_SUCCESS;
347}
348
349} // namespace
350} // namespace core
351
352struct EnvironmentVariables {
353 int NumTeams;
354 int TeamLimit;
355 int TeamThreadLimit;
356 int MaxTeamsDefault;
357 int DynamicMemSize;
358};
359
360template <uint32_t wavesize>
361static constexpr const llvm::omp::GV &getGridValue() {
362 return llvm::omp::getAMDGPUGridValues<wavesize>();
363}
364
365struct HSALifetime {
366 // Wrapper around HSA used to ensure it is constructed before other types
367 // and destructed after, which means said other types can use raii for
368 // cleanup without risking running outside of the lifetime of HSA
369 const hsa_status_t S;
370
371 bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; }
372 HSALifetime() : S(hsa_init()) {}
373
374 ~HSALifetime() {
375 if (S == HSA_STATUS_SUCCESS) {
376 hsa_status_t Err = hsa_shut_down();
377 if (Err != HSA_STATUS_SUCCESS) {
378 // Can't call into HSA to get a string from the integer
379 DP("Shutting down HSA failed: %d\n", Err){};
380 }
381 }
382 }
383};
384
385// Handle scheduling of multiple hsa_queue's per device to
386// multiple threads (one scheduler per device)
387class HSAQueueScheduler {
388public:
389 HSAQueueScheduler() : Current(0) {}
390
391 HSAQueueScheduler(const HSAQueueScheduler &) = delete;
392
393 HSAQueueScheduler(HSAQueueScheduler &&Q) {
394 Current = Q.Current.load();
395 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
396 HSAQueues[I] = Q.HSAQueues[I];
397 Q.HSAQueues[I] = nullptr;
398 }
399 }
400
401 // \return false if any HSA queue creation fails
402 bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) {
403 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
404 hsa_queue_t *Q = nullptr;
405 hsa_status_t Rc =
406 hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI,
407 callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
408 if (Rc != HSA_STATUS_SUCCESS) {
409 DP("Failed to create HSA queue %d\n", I){};
410 return false;
411 }
412 HSAQueues[I] = Q;
413 }
414 return true;
415 }
416
417 ~HSAQueueScheduler() {
418 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
419 if (HSAQueues[I]) {
420 hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]);
421 if (Err != HSA_STATUS_SUCCESS)
422 DP("Error destroying HSA queue"){};
423 }
424 }
425 }
426
427 // \return next queue to use for device
428 hsa_queue_t *next() {
429 return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) %
430 NUM_QUEUES_PER_DEVICE];
431 }
432
433private:
434 // Number of queues per device
435 enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 };
436 hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {};
437 std::atomic<uint8_t> Current;
438};
439
440/// Class containing all the device information
441class RTLDeviceInfoTy : HSALifetime {
442 std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
443
444 struct QueueDeleter {
445 void operator()(hsa_queue_t *Q) {
446 if (Q) {
447 hsa_status_t Err = hsa_queue_destroy(Q);
448 if (Err != HSA_STATUS_SUCCESS) {
449 DP("Error destroying hsa queue: %s\n", get_error_string(Err)){};
450 }
451 }
452 }
453 };
454
455public:
456 bool ConstructionSucceeded = false;
457
458 // load binary populates symbol tables and mutates various global state
459 // run uses those symbol tables
460 std::shared_timed_mutex LoadRunLock;
461
462 int NumberOfDevices = 0;
463
464 // GPU devices
465 std::vector<hsa_agent_t> HSAAgents;
466 std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu
467
468 // CPUs
469 std::vector<hsa_agent_t> CPUAgents;
470
471 // Device properties
472 std::vector<int> ComputeUnits;
473 std::vector<int> GroupsPerDevice;
474 std::vector<int> ThreadsPerGroup;
475 std::vector<int> WarpSize;
476 std::vector<std::string> GPUName;
477 std::vector<std::string> TargetID;
478
479 // OpenMP properties
480 std::vector<int> NumTeams;
481 std::vector<int> NumThreads;
482
483 // OpenMP Environment properties
484 EnvironmentVariables Env;
485
486 // OpenMP Requires Flags
487 int64_t RequiresFlags;
488
489 // Resource pools
490 SignalPoolT FreeSignalPool;
491
492 bool HostcallRequired = false;
493
494 std::vector<hsa_executable_t> HSAExecutables;
495
496 std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
497 std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
498
499 hsa_amd_memory_pool_t KernArgPool;
500
501 // fine grained memory pool for host allocations
502 hsa_amd_memory_pool_t HostFineGrainedMemoryPool;
503
504 // fine and coarse-grained memory pools per offloading device
505 std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools;
506 std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools;
507
508 struct ImplFreePtrDeletor {
509 void operator()(void *P) {
510 core::Runtime::Memfree(P); // ignore failure to free
511 }
512 };
513
514 // device_State shared across loaded binaries, error if inconsistent size
515 std::vector<std::pair<std::unique_ptr<void, ImplFreePtrDeletor>, uint64_t>>
516 DeviceStateStore;
517
518 static const unsigned HardTeamLimit =
519 (1 << 16) - 1; // 64K needed to fit in uint16
520 static const int DefaultNumTeams = 128;
521
522 // These need to be per-device since different devices can have different
523 // wave sizes, but are currently the same number for each so that refactor
524 // can be postponed.
525 static_assert(getGridValue<32>().GV_Max_Teams ==
526 getGridValue<64>().GV_Max_Teams,
527 "");
528 static const int MaxTeams = getGridValue<64>().GV_Max_Teams;
529
530 static_assert(getGridValue<32>().GV_Max_WG_Size ==
531 getGridValue<64>().GV_Max_WG_Size,
532 "");
533 static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size;
534
535 static_assert(getGridValue<32>().GV_Default_WG_Size ==
536 getGridValue<64>().GV_Default_WG_Size,
537 "");
538 static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size;
539
540 using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size,
541 hsa_agent_t, hsa_amd_memory_pool_t);
542 hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size,
543 MemcpyFunc Func, int32_t DeviceId) {
544 hsa_agent_t Agent = HSAAgents[DeviceId];
545 hsa_signal_t S = FreeSignalPool.pop();
546 if (S.handle == 0) {
547 return HSA_STATUS_ERROR;
548 }
549 hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool);
550 FreeSignalPool.push(S);
551 return R;
552 }
553
554 hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size,
555 int32_t DeviceId) {
556 return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId);
557 }
558
559 hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size,
560 int32_t DeviceId) {
561 return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId);
562 }
563
564 static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) {
565 char TmpChar[1000];
566 uint16_t Major, Minor;
567 uint32_t TmpUInt;
568 uint32_t TmpUInt2;
569 uint32_t CacheSize[4];
570 bool TmpBool;
571 uint16_t WorkgroupMaxDim[3];
572 hsa_dim3_t GridMaxDim;
573
574 // Getting basic information about HSA and Device
575 core::checkResult(
576 hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major),
577 "Error from hsa_system_get_info when obtaining "
578 "HSA_SYSTEM_INFO_VERSION_MAJOR\n");
579 core::checkResult(
580 hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor),
581 "Error from hsa_system_get_info when obtaining "
582 "HSA_SYSTEM_INFO_VERSION_MINOR\n");
583 printf(" HSA Runtime Version: \t\t%u.%u \n", Major, Minor);
584 printf(" HSA OpenMP Device Number: \t\t%d \n", DeviceId);
585 core::checkResult(
586 hsa_agent_get_info(
587 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar),
588 "Error returned from hsa_agent_get_info when obtaining "
589 "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n");
590 printf(" Product Name: \t\t\t%s \n", TmpChar);
591 core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar),
592 "Error returned from hsa_agent_get_info when obtaining "
593 "HSA_AGENT_INFO_NAME\n");
594 printf(" Device Name: \t\t\t%s \n", TmpChar);
595 core::checkResult(
596 hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar),
597 "Error returned from hsa_agent_get_info when obtaining "
598 "HSA_AGENT_INFO_NAME\n");
599 printf(" Vendor Name: \t\t\t%s \n", TmpChar);
600 hsa_device_type_t DevType;
601 core::checkResult(
602 hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType),
603 "Error returned from hsa_agent_get_info when obtaining "
604 "HSA_AGENT_INFO_DEVICE\n");
605 printf(" Device Type: \t\t\t%s \n",
606 DevType == HSA_DEVICE_TYPE_CPU
607 ? "CPU"
608 : (DevType == HSA_DEVICE_TYPE_GPU
609 ? "GPU"
610 : (DevType == HSA_DEVICE_TYPE_DSP ? "DSP" : "UNKNOWN")));
611 core::checkResult(
612 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt),
613 "Error returned from hsa_agent_get_info when obtaining "
614 "HSA_AGENT_INFO_QUEUES_MAX\n");
615 printf(" Max Queues: \t\t\t%u \n", TmpUInt);
616 core::checkResult(
617 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt),
618 "Error returned from hsa_agent_get_info when obtaining "
619 "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n");
620 printf(" Queue Min Size: \t\t\t%u \n", TmpUInt);
621 core::checkResult(
622 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt),
623 "Error returned from hsa_agent_get_info when obtaining "
624 "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n");
625 printf(" Queue Max Size: \t\t\t%u \n", TmpUInt);
626
627 // Getting cache information
628 printf(" Cache:\n");
629
630 // FIXME: This is deprecated according to HSA documentation. But using
631 // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
632 // runtime.
633 core::checkResult(
634 hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize),
635 "Error returned from hsa_agent_get_info when obtaining "
636 "HSA_AGENT_INFO_CACHE_SIZE\n");
637
638 for (int I = 0; I < 4; I++) {
639 if (CacheSize[I]) {
640 printf(" L%u: \t\t\t\t%u bytes\n", I, CacheSize[I]);
641 }
642 }
643
644 core::checkResult(
645 hsa_agent_get_info(Agent,
646 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
647 &TmpUInt),
648 "Error returned from hsa_agent_get_info when obtaining "
649 "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n");
650 printf(" Cacheline Size: \t\t\t%u \n", TmpUInt);
651 core::checkResult(
652 hsa_agent_get_info(
653 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY,
654 &TmpUInt),
655 "Error returned from hsa_agent_get_info when obtaining "
656 "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n");
657 printf(" Max Clock Freq(MHz): \t\t%u \n", TmpUInt);
658 core::checkResult(
659 hsa_agent_get_info(
660 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
661 &TmpUInt),
662 "Error returned from hsa_agent_get_info when obtaining "
663 "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n");
664 printf(" Compute Units: \t\t\t%u \n", TmpUInt);
665 core::checkResult(hsa_agent_get_info(
666 Agent,
667 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU,
668 &TmpUInt),
669 "Error returned from hsa_agent_get_info when obtaining "
670 "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n");
671 printf(" SIMD per CU: \t\t\t%u \n", TmpUInt);
672 core::checkResult(
673 hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool),
674 "Error returned from hsa_agent_get_info when obtaining "
675 "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n");
676 printf(" Fast F16 Operation: \t\t%s \n", (TmpBool ? "TRUE" : "FALSE"));
677 core::checkResult(
678 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2),
679 "Error returned from hsa_agent_get_info when obtaining "
680 "HSA_AGENT_INFO_WAVEFRONT_SIZE\n");
681 printf(" Wavefront Size: \t\t\t%u \n", TmpUInt2);
682 core::checkResult(
683 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt),
684 "Error returned from hsa_agent_get_info when obtaining "
685 "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n");
686 printf(" Workgroup Max Size: \t\t%u \n", TmpUInt);
687 core::checkResult(hsa_agent_get_info(Agent,
688 HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
689 WorkgroupMaxDim),
690 "Error returned from hsa_agent_get_info when obtaining "
691 "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n");
692 printf(" Workgroup Max Size per Dimension:\n");
693 printf(" x: \t\t\t\t%u\n", WorkgroupMaxDim[0]);
694 printf(" y: \t\t\t\t%u\n", WorkgroupMaxDim[1]);
695 printf(" z: \t\t\t\t%u\n", WorkgroupMaxDim[2]);
696 core::checkResult(hsa_agent_get_info(
697 Agent,
698 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU,
699 &TmpUInt),
700 "Error returned from hsa_agent_get_info when obtaining "
701 "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n");
702 printf(" Max Waves Per CU: \t\t\t%u \n", TmpUInt);
703 printf(" Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2);
704 core::checkResult(
705 hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt),
706 "Error returned from hsa_agent_get_info when obtaining "
707 "HSA_AGENT_INFO_GRID_MAX_SIZE\n");
708 printf(" Grid Max Size: \t\t\t%u \n", TmpUInt);
709 core::checkResult(
710 hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim),
711 "Error returned from hsa_agent_get_info when obtaining "
712 "HSA_AGENT_INFO_GRID_MAX_DIM\n");
713 printf(" Grid Max Size per Dimension: \t\t\n");
714 printf(" x: \t\t\t\t%u\n", GridMaxDim.x);
715 printf(" y: \t\t\t\t%u\n", GridMaxDim.y);
716 printf(" z: \t\t\t\t%u\n", GridMaxDim.z);
717 core::checkResult(
718 hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt),
719 "Error returned from hsa_agent_get_info when obtaining "
720 "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n");
721 printf(" Max fbarriers/Workgrp: \t\t%u\n", TmpUInt);
722
723 printf(" Memory Pools:\n");
724 auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t {
725 std::string TmpStr;
726 size_t Size;
727 bool Alloc, Access;
728 hsa_amd_segment_t Segment;
729 hsa_amd_memory_pool_global_flag_t GlobalFlags;
730 core::checkResult(
731 hsa_amd_memory_pool_get_info(
732 Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags),
733 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
734 "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n");
735 core::checkResult(hsa_amd_memory_pool_get_info(
736 Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment),
737 "Error returned from hsa_amd_memory_pool_get_info when "
738 "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n");
739
740 switch (Segment) {
741 case HSA_AMD_SEGMENT_GLOBAL:
742 TmpStr = "GLOBAL; FLAGS: ";
743 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags)
744 TmpStr += "KERNARG, ";
745 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags)
746 TmpStr += "FINE GRAINED, ";
747 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags)
748 TmpStr += "COARSE GRAINED, ";
749 break;
750 case HSA_AMD_SEGMENT_READONLY:
751 TmpStr = "READONLY";
752 break;
753 case HSA_AMD_SEGMENT_PRIVATE:
754 TmpStr = "PRIVATE";
755 break;
756 case HSA_AMD_SEGMENT_GROUP:
757 TmpStr = "GROUP";
758 break;
759 }
760 printf(" Pool %s: \n", TmpStr.c_str());
761
762 core::checkResult(hsa_amd_memory_pool_get_info(
763 Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size),
764 "Error returned from hsa_amd_memory_pool_get_info when "
765 "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n");
766 printf(" Size: \t\t\t\t %zu bytes\n", Size);
767 core::checkResult(
768 hsa_amd_memory_pool_get_info(
769 Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc),
770 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
771 "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n");
772 printf(" Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE"));
773 core::checkResult(
774 hsa_amd_memory_pool_get_info(
775 Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size),
776 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
777 "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n");
778 printf(" Runtime Alloc Granule: \t\t %zu bytes\n", Size);
779 core::checkResult(
780 hsa_amd_memory_pool_get_info(
781 Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size),
782 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
783 "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n");
784 printf(" Runtime Alloc alignment: \t %zu bytes\n", Size);
785 core::checkResult(
786 hsa_amd_memory_pool_get_info(
787 Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access),
788 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
789 "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n");
790 printf(" Accessable by all: \t\t %s\n",
791 (Access ? "TRUE" : "FALSE"));
792
793 return HSA_STATUS_SUCCESS;
794 };
795 // Iterate over all the memory regions for this agent. Get the memory region
796 // type and size
797 hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr);
798
799 printf(" ISAs:\n");
800 auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t {
801 char TmpChar[1000];
802 core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar),
803 "Error returned from hsa_isa_get_info_alt when "
804 "obtaining HSA_ISA_INFO_NAME\n");
805 printf(" Name: \t\t\t\t %s\n", TmpChar);
806
807 return HSA_STATUS_SUCCESS;
808 };
809 // Iterate over all the memory regions for this agent. Get the memory region
810 // type and size
811 hsa_agent_iterate_isas(Agent, CBIsas, nullptr);
812 }
813
814 // Record entry point associated with device
815 void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) {
816 assert(DeviceId < (int32_t)FuncGblEntries.size() &&
817 "Unexpected device id!");
818 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
819
820 E.Entries.push_back(Entry);
821 }
822
823 // Return true if the entry is associated with device
824 bool findOffloadEntry(int32_t DeviceId, void *Addr) {
825 assert(DeviceId < (int32_t)FuncGblEntries.size() &&
826 "Unexpected device id!");
827 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
828
829 for (auto &It : E.Entries) {
830 if (It.addr == Addr)
831 return true;
832 }
833
834 return false;
835 }
836
837 // Return the pointer to the target entries table
838 __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) {
839 assert(DeviceId < (int32_t)FuncGblEntries.size() &&
840 "Unexpected device id!");
841 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
842
843 int32_t Size = E.Entries.size();
844
845 // Table is empty
846 if (!Size)
847 return 0;
848
849 __tgt_offload_entry *Begin = &E.Entries[0];
850 __tgt_offload_entry *End = &E.Entries[Size - 1];
851
852 // Update table info according to the entries and return the pointer
853 E.Table.EntriesBegin = Begin;
854 E.Table.EntriesEnd = ++End;
855
856 return &E.Table;
857 }
858
859 // Clear entries table for a device
860 void clearOffloadEntriesTable(int DeviceId) {
861 assert(DeviceId < (int32_t)FuncGblEntries.size() &&
862 "Unexpected device id!");
863 FuncGblEntries[DeviceId].emplace_back();
864 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
865 // KernelArgPoolMap.clear();
866 E.Entries.clear();
867 E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
868 }
869
870 hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
871 unsigned int DeviceId) {
872 assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here.");
873 uint32_t GlobalFlags = 0;
874 hsa_status_t Err = hsa_amd_memory_pool_get_info(
875 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
876
877 if (Err != HSA_STATUS_SUCCESS) {
878 return Err;
879 }
880
881 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
882 DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool;
883 } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
884 DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
885 }
886
887 return HSA_STATUS_SUCCESS;
888 }
889
890 hsa_status_t setupDevicePools(const std::vector<hsa_agent_t> &Agents) {
891 for (unsigned int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
892 hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
893 Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
894 hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool);
895 if (ValidStatus != HSA_STATUS_SUCCESS) {
896 DP("Alloc allowed in memory pool check failed: %s\n",{}
897 get_error_string(ValidStatus)){};
898 return HSA_STATUS_SUCCESS;
899 }
900 return addDeviceMemoryPool(MemoryPool, DeviceId);
901 });
902
903 if (Err != HSA_STATUS_SUCCESS) {
904 DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,{}
905 "Iterate all memory pools", get_error_string(Err)){};
906 return Err;
907 }
908 }
909 return HSA_STATUS_SUCCESS;
910 }
911
912 hsa_status_t setupHostMemoryPools(std::vector<hsa_agent_t> &Agents) {
913 std::vector<hsa_amd_memory_pool_t> HostPools;
914
915 // collect all the "valid" pools for all the given agents.
916 for (const auto &Agent : Agents) {
917 hsa_status_t Err = hsa_amd_agent_iterate_memory_pools(
918 Agent, core::addMemoryPool, static_cast<void *>(&HostPools));
919 if (Err != HSA_STATUS_SUCCESS) {
920 DP("addMemoryPool returned %s, continuing\n", get_error_string(Err)){};
921 }
922 }
923
924 // We need two fine-grained pools.
925 // 1. One with kernarg flag set for storing kernel arguments
926 // 2. Second for host allocations
927 bool FineGrainedMemoryPoolSet = false;
928 bool KernArgPoolSet = false;
929 for (const auto &MemoryPool : HostPools) {
930 hsa_status_t Err = HSA_STATUS_SUCCESS;
931 uint32_t GlobalFlags = 0;
932 Err = hsa_amd_memory_pool_get_info(
933 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
934 if (Err != HSA_STATUS_SUCCESS) {
935 DP("Get memory pool info failed: %s\n", get_error_string(Err)){};
936 return Err;
937 }
938
939 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
940 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
941 KernArgPool = MemoryPool;
942 KernArgPoolSet = true;
943 } else {
944 HostFineGrainedMemoryPool = MemoryPool;
945 FineGrainedMemoryPoolSet = true;
946 }
947 }
948 }
949
950 if (FineGrainedMemoryPoolSet && KernArgPoolSet)
951 return HSA_STATUS_SUCCESS;
952
953 return HSA_STATUS_ERROR;
954 }
955
956 hsa_amd_memory_pool_t getDeviceMemoryPool(unsigned int DeviceId) {
957 assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() &&
958 "Invalid device Id");
959 return DeviceCoarseGrainedMemoryPools[DeviceId];
960 }
961
962 hsa_amd_memory_pool_t getHostMemoryPool() {
963 return HostFineGrainedMemoryPool;
964 }
965
966 static int readEnv(const char *Env, int Default = -1) {
967 const char *EnvStr = getenv(Env);
968 int Res = Default;
969 if (EnvStr) {
970 Res = std::stoi(EnvStr);
971 DP("Parsed %s=%d\n", Env, Res){};
972 }
973 return Res;
974 }
975
976 RTLDeviceInfoTy() {
977 DP("Start initializing " GETNAME(TARGET_NAME) "\n"){};
978
979 // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr
980 // anytime. You do not need a debug library build.
981 // 0 => no tracing
982 // 1 => tracing dispatch only
983 // >1 => verbosity increase
984
985 if (!HSAInitSuccess()) {
986 DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n"){};
987 return;
988 }
989
990 if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE"))
991 print_kernel_trace = atoi(EnvStr);
992 else
993 print_kernel_trace = 0;
994
995 hsa_status_t Err = core::atl_init_gpu_context();
996 if (Err != HSA_STATUS_SUCCESS) {
997 DP("Error when initializing " GETNAME(TARGET_NAME) "\n"){};
998 return;
999 }
1000
1001 // Init hostcall soon after initializing hsa
1002 hostrpc_init();
1003
1004 Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) {
1005 if (DeviceType == HSA_DEVICE_TYPE_CPU) {
1006 CPUAgents.push_back(Agent);
1007 } else {
1008 HSAAgents.push_back(Agent);
1009 }
1010 });
1011 if (Err != HSA_STATUS_SUCCESS)
1012 return;
1013
1014 NumberOfDevices = (int)HSAAgents.size();
1015
1016 if (NumberOfDevices == 0) {
1017 DP("There are no devices supporting HSA.\n"){};
1018 return;
1019 }
1020 DP("There are %d devices supporting HSA.\n", NumberOfDevices){};
1021
1022 // Init the device info
1023 HSAQueueSchedulers.reserve(NumberOfDevices);
1024 FuncGblEntries.resize(NumberOfDevices);
1025 ThreadsPerGroup.resize(NumberOfDevices);
1026 ComputeUnits.resize(NumberOfDevices);
1027 GPUName.resize(NumberOfDevices);
1028 GroupsPerDevice.resize(NumberOfDevices);
1029 WarpSize.resize(NumberOfDevices);
1030 NumTeams.resize(NumberOfDevices);
1031 NumThreads.resize(NumberOfDevices);
1032 DeviceStateStore.resize(NumberOfDevices);
1033 KernelInfoTable.resize(NumberOfDevices);
1034 SymbolInfoTable.resize(NumberOfDevices);
1035 DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
1036 DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
1037
1038 Err = setupDevicePools(HSAAgents);
1039 if (Err != HSA_STATUS_SUCCESS) {
1040 DP("Setup for Device Memory Pools failed\n"){};
1041 return;
1042 }
1043
1044 Err = setupHostMemoryPools(CPUAgents);
1045 if (Err != HSA_STATUS_SUCCESS) {
1046 DP("Setup for Host Memory Pools failed\n"){};
1047 return;
1048 }
1049
1050 for (int I = 0; I < NumberOfDevices; I++) {
1051 uint32_t QueueSize = 0;
1052 {
1053 hsa_status_t Err = hsa_agent_get_info(
1054 HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize);
1055 if (Err != HSA_STATUS_SUCCESS) {
1056 DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I){};
1057 return;
1058 }
1059 enum { MaxQueueSize = 4096 };
1060 if (QueueSize > MaxQueueSize) {
1061 QueueSize = MaxQueueSize;
1062 }
1063 }
1064
1065 {
1066 HSAQueueScheduler QSched;
1067 if (!QSched.createQueues(HSAAgents[I], QueueSize))
1068 return;
1069 HSAQueueSchedulers.emplace_back(std::move(QSched));
1070 }
1071
1072 DeviceStateStore[I] = {nullptr, 0};
1073 }
1074
1075 for (int I = 0; I < NumberOfDevices; I++) {
1076 ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize;
1077 GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams;
1078 ComputeUnits[I] = 1;
1079 DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I,{}
1080 GroupsPerDevice[I], ThreadsPerGroup[I]){};
1081 }
1082
1083 // Get environment variables regarding teams
1084 Env.TeamLimit = readEnv("OMP_TEAM_LIMIT");
1085 Env.NumTeams = readEnv("OMP_NUM_TEAMS");
1086 Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT");
1087 Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT");
1088 Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0);
1089
1090 // Default state.
1091 RequiresFlags = OMP_REQ_UNDEFINED;
1092
1093 ConstructionSucceeded = true;
1094 }
1095
1096 ~RTLDeviceInfoTy() {
1097 DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n"){};
1098 if (!HSAInitSuccess()) {
1099 // Then none of these can have been set up and they can't be torn down
1100 return;
1101 }
1102 // Run destructors on types that use HSA before
1103 // impl_finalize removes access to it
1104 DeviceStateStore.clear();
1105 KernelArgPoolMap.clear();
1106 // Terminate hostrpc before finalizing hsa
1107 hostrpc_terminate();
1108
1109 hsa_status_t Err;
1110 for (uint32_t I = 0; I < HSAExecutables.size(); I++) {
1111 Err = hsa_executable_destroy(HSAExecutables[I]);
1112 if (Err != HSA_STATUS_SUCCESS) {
1113 DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,{}
1114 "Destroying executable", get_error_string(Err)){};
1115 }
1116 }
1117 }
1118};
1119
1120pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
1121
1122// Putting accesses to DeviceInfo global behind a function call prior
1123// to changing to use init_plugin/deinit_plugin calls
1124static RTLDeviceInfoTy DeviceInfoState;
1125static RTLDeviceInfoTy &DeviceInfo() { return DeviceInfoState; }
1126
1127namespace {
1128
1129int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
1130 __tgt_async_info *AsyncInfo) {
1131 assert(AsyncInfo && "AsyncInfo is nullptr");
1132 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
1133 // Return success if we are not copying back to host from target.
1134 if (!HstPtr)
1135 return OFFLOAD_SUCCESS(0);
1136 hsa_status_t Err;
1137 DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,{}
1138 (long long unsigned)(Elf64_Addr)TgtPtr,{}
1139 (long long unsigned)(Elf64_Addr)HstPtr){};
1140
1141 Err = DeviceInfo().freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size,
1142 DeviceId);
1143
1144 if (Err != HSA_STATUS_SUCCESS) {
1145 DP("Error when copying data from device to host. Pointers: "{}
1146 "host = 0x%016lx, device = 0x%016lx, size = %lld\n",{}
1147 (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size){};
1148 return OFFLOAD_FAIL(~0);
1149 }
1150 DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,{}
1151 (long long unsigned)(Elf64_Addr)TgtPtr,{}
1152 (long long unsigned)(Elf64_Addr)HstPtr){};
1153 return OFFLOAD_SUCCESS(0);
1154}
1155
1156int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
1157 __tgt_async_info *AsyncInfo) {
1158 assert(AsyncInfo && "AsyncInfo is nullptr");
1159 hsa_status_t Err;
1160 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
1161 // Return success if we are not doing host to target.
1162 if (!HstPtr)
1163 return OFFLOAD_SUCCESS(0);
1164
1165 DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,{}
1166 (long long unsigned)(Elf64_Addr)HstPtr,{}
1167 (long long unsigned)(Elf64_Addr)TgtPtr){};
1168 Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size,
1169 DeviceId);
1170 if (Err != HSA_STATUS_SUCCESS) {
1171 DP("Error when copying data from host to device. Pointers: "{}
1172 "host = 0x%016lx, device = 0x%016lx, size = %lld\n",{}
1173 (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size){};
1174 return OFFLOAD_FAIL(~0);
1175 }
1176 return OFFLOAD_SUCCESS(0);
1177}
1178
1179// Async.
1180// The implementation was written with cuda streams in mind. The semantics of
1181// that are to execute kernels on a queue in order of insertion. A synchronise
1182// call then makes writes visible between host and device. This means a series
1183// of N data_submit_async calls are expected to execute serially. HSA offers
1184// various options to run the data copies concurrently. This may require changes
1185// to libomptarget.
1186
1187// __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that
1188// there are no outstanding kernels that need to be synchronized. Any async call
1189// may be passed a Queue==0, at which point the cuda implementation will set it
1190// to non-null (see getStream). The cuda streams are per-device. Upstream may
1191// change this interface to explicitly initialize the AsyncInfo_pointer, but
1192// until then hsa lazily initializes it as well.
1193
1194void initAsyncInfo(__tgt_async_info *AsyncInfo) {
1195 // set non-null while using async calls, return to null to indicate completion
1196 assert(AsyncInfo);
1197 if (!AsyncInfo->Queue) {
1198 AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX(18446744073709551615UL));
1199 }
1200}
1201void finiAsyncInfo(__tgt_async_info *AsyncInfo) {
1202 assert(AsyncInfo);
1203 assert(AsyncInfo->Queue);
1204 AsyncInfo->Queue = 0;
1205}
1206
1207// Determine launch values for kernel.
1208struct LaunchVals {
1209 int WorkgroupSize;
1210 int GridSize;
1211};
1212LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env,
1213 int ConstWGSize,
1214 llvm::omp::OMPTgtExecModeFlags ExecutionMode,
1215 int NumTeams, int ThreadLimit, uint64_t LoopTripcount,
1216 int DeviceNumTeams) {
1217
1218 int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize;
1219 int NumGroups = 0;
1220
1221 int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams;
1222 if (MaxTeams > static_cast<int>(RTLDeviceInfoTy::HardTeamLimit))
1223 MaxTeams = RTLDeviceInfoTy::HardTeamLimit;
1224
1225 if (print_kernel_trace & STARTUP_DETAILS) {
1226 DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams){};
1227 DP("Max_Teams: %d\n", MaxTeams){};
1228 DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize){};
1229 DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize){};
1230 DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",{}
1231 RTLDeviceInfoTy::DefaultWgSize){};
1232 DP("thread_limit: %d\n", ThreadLimit){};
1233 DP("threadsPerGroup: %d\n", ThreadsPerGroup){};
1234 DP("ConstWGSize: %d\n", ConstWGSize){};
1235 }
1236 // check for thread_limit() clause
1237 if (ThreadLimit > 0) {
1238 ThreadsPerGroup = ThreadLimit;
1239 DP("Setting threads per block to requested %d\n", ThreadLimit){};
1240 // Add master warp for GENERIC
1241 if (ExecutionMode ==
1242 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
1243 ThreadsPerGroup += WarpSize;
1244 DP("Adding master wavefront: +%d threads\n", WarpSize){};
1245 }
1246 if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max
1247 ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize;
1248 DP("Setting threads per block to maximum %d\n", ThreadsPerGroup){};
1249 }
1250 }
1251 // check flat_max_work_group_size attr here
1252 if (ThreadsPerGroup > ConstWGSize) {
1253 ThreadsPerGroup = ConstWGSize;
1254 DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",{}
1255 ThreadsPerGroup){};
1256 }
1257 if (print_kernel_trace & STARTUP_DETAILS)
1258 DP("threadsPerGroup: %d\n", ThreadsPerGroup){};
1259 DP("Preparing %d threads\n", ThreadsPerGroup){};
1260
1261 // Set default num_groups (teams)
1262 if (Env.TeamLimit > 0)
1263 NumGroups = (MaxTeams < Env.TeamLimit) ? MaxTeams : Env.TeamLimit;
1264 else
1265 NumGroups = MaxTeams;
1266 DP("Set default num of groups %d\n", NumGroups){};
1267
1268 if (print_kernel_trace & STARTUP_DETAILS) {
1269 DP("num_groups: %d\n", NumGroups){};
1270 DP("num_teams: %d\n", NumTeams){};
1271 }
1272
1273 // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size
1274 // This reduction is typical for default case (no thread_limit clause).
1275 // or when user goes crazy with num_teams clause.
1276 // FIXME: We cant distinguish between a constant or variable thread limit.
1277 // So we only handle constant thread_limits.
1278 if (ThreadsPerGroup >
1279 RTLDeviceInfoTy::DefaultWgSize) // 256 < threadsPerGroup <= 1024
1280 // Should we round threadsPerGroup up to nearest WarpSize
1281 // here?
1282 NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup;
1283
1284 // check for num_teams() clause
1285 if (NumTeams > 0) {
1286 NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups;
1287 }
1288 if (print_kernel_trace & STARTUP_DETAILS) {
1289 DP("num_groups: %d\n", NumGroups){};
1290 DP("Env.NumTeams %d\n", Env.NumTeams){};
1291 DP("Env.TeamLimit %d\n", Env.TeamLimit){};
1292 }
1293
1294 if (Env.NumTeams > 0) {
1295 NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups;
1296 DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams){};
1297 } else if (Env.TeamLimit > 0) {
1298 NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups;
1299 DP("Modifying teams based on Env.TeamLimit%d\n", Env.TeamLimit){};
1300 } else {
1301 if (NumTeams <= 0) {
1302 if (LoopTripcount > 0) {
1303 if (ExecutionMode ==
1304 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) {
1305 // round up to the nearest integer
1306 NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1;
1307 } else if (ExecutionMode ==
1308 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
1309 NumGroups = LoopTripcount;
1310 } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ {
1311 // This is a generic kernel that was transformed to use SPMD-mode
1312 // execution but uses Generic-mode semantics for scheduling.
1313 NumGroups = LoopTripcount;
1314 }
1315 DP("Using %d teams due to loop trip count %" PRIu64 " and number of "{}
1316 "threads per block %d\n",{}
1317 NumGroups, LoopTripcount, ThreadsPerGroup){};
1318 }
1319 } else {
1320 NumGroups = NumTeams;
1321 }
1322 if (NumGroups > MaxTeams) {
1323 NumGroups = MaxTeams;
1324 if (print_kernel_trace & STARTUP_DETAILS)
1325 DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams){};
1326 }
1327 if (NumGroups > NumTeams && NumTeams > 0) {
1328 NumGroups = NumTeams;
1329 if (print_kernel_trace & STARTUP_DETAILS)
1330 DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups,{}
1331 NumTeams){};
1332 }
1333 }
1334
1335 // num_teams clause always honored, no matter what, unless DEFAULT is active.
1336 if (NumTeams > 0) {
1337 NumGroups = NumTeams;
1338 // Cap num_groups to EnvMaxTeamsDefault if set.
1339 if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault)
1340 NumGroups = Env.MaxTeamsDefault;
1341 }
1342 if (print_kernel_trace & STARTUP_DETAILS) {
1343 DP("threadsPerGroup: %d\n", ThreadsPerGroup){};
1344 DP("num_groups: %d\n", NumGroups){};
1345 DP("loop_tripcount: %ld\n", LoopTripcount){};
1346 }
1347 DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups,{}
1348 ThreadsPerGroup){};
1349
1350 LaunchVals Res;
1351 Res.WorkgroupSize = ThreadsPerGroup;
1352 Res.GridSize = ThreadsPerGroup * NumGroups;
1353 return Res;
1354}
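Note on the values returned above: WorkgroupSize is copied into the dispatch packet's workgroup_size_x and GridSize into grid_size_x, so the number of workgroups actually launched is GridSize / WorkgroupSize. A minimal sketch of the arithmetic with hypothetical numbers (illustrative only, not taken from this run):

// ThreadsPerGroup = 256, NumGroups = 960        (hypothetical values)
// Res.WorkgroupSize = 256;
// Res.GridSize      = 256 * 960;                // 245760 work-items in x
// dispatched workgroups = GridSize / WorkgroupSize = 960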
1355
1356static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) {
1357 uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);
1358 bool Full = true;
1359 while (Full) {
1360 Full =
1361 PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue));
1362 }
1363 return PacketId;
1364}
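Note: hsa_queue_add_write_index_relaxed atomically bumps the queue's write index and returns the previous value, which reserves slot PacketId. The loop then spins until the reserved slot fits inside the ring buffer, i.e. until fewer than Queue->size packets are outstanding. A sketch of the invariant being waited on (illustrative, not part of the source):

// space is available once:
//   PacketId - hsa_queue_load_read_index_scacquire(Queue) < Queue->size
// the slot itself is later selected with PacketId & (Queue->size - 1),
// which is valid because the queue size is a power of two.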
1365
1366int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
1367 ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
1368 int32_t ThreadLimit, uint64_t LoopTripcount) {
1369 // Set the context we are using
1370 // update thread limit content in gpu memory if un-initialized or specified
1371 // from host
1372
1373 DP("Run target team region thread_limit %d\n", ThreadLimit){};
1374
1375 // All args are references.
1376 std::vector<void *> Args(ArgNum);
1377 std::vector<void *> Ptrs(ArgNum);
1378
1379 DP("Arg_num: %d\n", ArgNum){};
1380 for (int32_t I = 0; I < ArgNum; ++I) {
1381 Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
1382 Args[I] = &Ptrs[I];
1383 DP("Offseted base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I])){};
1384 }
1385
1386 KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr;
1387
1388 std::string KernelName = std::string(KernelInfo->Name);
1389 auto &KernelInfoTable = DeviceInfo().KernelInfoTable;
1390 if (KernelInfoTable[DeviceId].find(KernelName) ==
1391 KernelInfoTable[DeviceId].end()) {
1392 DP("Kernel %s not found\n", KernelName.c_str()){};
1393 return OFFLOAD_FAIL(~0);
1394 }
1395
1396 const atl_kernel_info_t KernelInfoEntry =
1397 KernelInfoTable[DeviceId][KernelName];
1398 const uint32_t GroupSegmentSize =
1399 KernelInfoEntry.group_segment_size + DeviceInfo().Env.DynamicMemSize;
1400 const uint32_t SgprCount = KernelInfoEntry.sgpr_count;
1401 const uint32_t VgprCount = KernelInfoEntry.vgpr_count;
1402 const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count;
1403 const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count;
1404
1405 assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count);
1406
1407 /*
1408 * Set limit based on ThreadsPerGroup and GroupsPerDevice
1409 */
1410 LaunchVals LV =
1411 getLaunchVals(DeviceInfo().WarpSize[DeviceId], DeviceInfo().Env,
1412 KernelInfo->ConstWGSize, KernelInfo->ExecutionMode,
1413 NumTeams, // From run_region arg
1414 ThreadLimit, // From run_region arg
1415 LoopTripcount, // From run_region arg
1416 DeviceInfo().NumTeams[KernelInfo->DeviceId]);
1417 const int GridSize = LV.GridSize;
1418 const int WorkgroupSize = LV.WorkgroupSize;
1419
1420 if (print_kernel_trace >= LAUNCH) {
1421 int NumGroups = GridSize / WorkgroupSize;
1422 // enum modes are SPMD, GENERIC, NONE 0,1,2
1423 // if doing rtl timing, print to stderr, unless stdout requested.
1424 bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING);
1425 fprintf(TraceToStdout ? stdout : stderr,
1426 "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
1427 "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
1428 "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n",
1429 DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
1430 ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit,
1431 GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount,
1432 VgprSpillCount, LoopTripcount, KernelInfo->Name);
1433 }
1434
1435 // Run on the device.
1436 {
1437 hsa_queue_t *Queue = DeviceInfo().HSAQueueSchedulers[DeviceId].next();
1438 if (!Queue) {
1439 return OFFLOAD_FAIL;
1440 }
1441 uint64_t PacketId = acquireAvailablePacketId(Queue);
1442
1443 const uint32_t Mask = Queue->size - 1; // size is a power of 2
1444 hsa_kernel_dispatch_packet_t *Packet =
1445 (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);
1446
1447 // packet->header is written last
1448 Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
1449 Packet->workgroup_size_x = WorkgroupSize;
1450 Packet->workgroup_size_y = 1;
1451 Packet->workgroup_size_z = 1;
1452 Packet->reserved0 = 0;
1453 Packet->grid_size_x = GridSize;
1454 Packet->grid_size_y = 1;
1455 Packet->grid_size_z = 1;
1456 Packet->private_segment_size = KernelInfoEntry.private_segment_size;
1457 Packet->group_segment_size = GroupSegmentSize;
1458 Packet->kernel_object = KernelInfoEntry.kernel_object;
1459 Packet->kernarg_address = 0; // use the block allocator
1460 Packet->reserved2 = 0; // impl writes id_ here
1461 Packet->completion_signal = {0}; // may want a pool of signals
1462
1463 KernelArgPool *ArgPool = nullptr;
1464 void *KernArg = nullptr;
1465 {
1466 auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name));
1467 if (It != KernelArgPoolMap.end()) {
1468 ArgPool = (It->second).get();
1469 }
1470 }
1471 if (!ArgPool) {
1472 DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name,{}
1473 DeviceId){};
1474 }
1475 {
1476 if (ArgPool) {
1477 assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *)));
1478 KernArg = ArgPool->allocate(ArgNum);
1479 }
1480 if (!KernArg) {
1481 DP("Allocate kernarg failed\n"){};
1482 return OFFLOAD_FAIL(~0);
1483 }
1484
1485 // Copy explicit arguments
1486 for (int I = 0; I < ArgNum; I++) {
1487 memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *));
1488 }
1489
1490 // Initialize implicit arguments. TODO: Which of these can be dropped
1491 AMDGPUImplicitArgsTy *ImplArgs = reinterpret_cast<AMDGPUImplicitArgsTy *>(
1492 static_cast<char *>(KernArg) + ArgPool->KernargSegmentSize);
1493 memset(ImplArgs, 0,
1494 sizeof(AMDGPUImplicitArgsTy)); // may not be necessary
1495 ImplArgs->OffsetX = 0;
1496 ImplArgs->OffsetY = 0;
1497 ImplArgs->OffsetZ = 0;
1498
1499 // assign a hostcall buffer for the selected Q
1500 if (__atomic_load_n(&DeviceInfo().HostcallRequired, __ATOMIC_ACQUIRE)) {
1501 // hostrpc_assign_buffer is not thread safe, and this function is
1502 // under a multiple reader lock, not a writer lock.
1503 static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER;
1504 pthread_mutex_lock(&HostcallInitLock);
1505 uint64_t Buffer = hostrpc_assign_buffer(
1506 DeviceInfo().HSAAgents[DeviceId], Queue, DeviceId);
1507 pthread_mutex_unlock(&HostcallInitLock);
1508 if (!Buffer) {
1509 DP("hostrpc_assign_buffer failed, gpu would dereference null and "{}
1510 "error\n"){};
1511 return OFFLOAD_FAIL(~0);
1512 }
1513
1514 DP("Implicit argument count: %d\n",{}
1515 KernelInfoEntry.implicit_argument_count){};
1516 if (KernelInfoEntry.implicit_argument_count >= 4) {
1517 // Initialise pointer for implicit_argument_count != 0 ABI
1518 // Guess that the right implicit argument is at offset 24 after
1519 // the explicit arguments. In the future, should be able to read
1520 // the offset from msgpack. Clang is not annotating it at present.
1521 uint64_t Offset =
1522 sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3);
1523 if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) {
1524 DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit "{}
1525 "args: %d\n",{}
1526 Offset + 8, ArgPool->kernargSizeIncludingImplicit()){};
1527 } else {
1528 memcpy(static_cast<char *>(KernArg) + Offset, &Buffer, 8);
1529 }
1530 }
1531
1532 // initialise pointer for implicit_argument_count == 0 ABI
1533 ImplArgs->HostcallPtr = Buffer;
1534 }
1535
1536 Packet->kernarg_address = KernArg;
1537 }
1538
1539 hsa_signal_t S = DeviceInfo().FreeSignalPool.pop();
1540 if (S.handle == 0) {
1541 DP("Failed to get signal instance\n"){};
1542 return OFFLOAD_FAIL(~0);
1543 }
1544 Packet->completion_signal = S;
1545 hsa_signal_store_relaxed(Packet->completion_signal, 1);
1546
1547 // Publish the packet indicating it is ready to be processed
1548 core::packetStoreRelease(reinterpret_cast<uint32_t *>(Packet),
1549 core::createHeader(), Packet->setup);
1550
1551 // Since the packet is already published, its contents must not be
1552 // accessed any more
1553 hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
1554
1555 while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
1556 HSA_WAIT_STATE_BLOCKED) != 0)
1557 ;
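Note on the dispatch protocol above: all packet fields are written first, core::packetStoreRelease then publishes the 16-bit header and setup words with release semantics, the doorbell store makes PacketId visible to the packet processor, and the host blocks until the GPU decrements the completion signal from 1 to 0. A hedged sketch of the header core::createHeader() presumably builds, assuming system-scope fences (the actual encoding lives in that helper, not shown here):

// uint16_t Header =
//     (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
//     (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
//     (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);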
1558
1559 assert(ArgPool);
1560 ArgPool->deallocate(KernArg);
1561 DeviceInfo().FreeSignalPool.push(S);
1562 }
1563
1564 DP("Kernel completed\n"){};
1565 return OFFLOAD_SUCCESS(0);
1566}
1567
1568bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) {
1569 const uint16_t AmdgcnMachineID = EM_AMDGPU;
1570 const int32_t R = elf_check_machine(Image, AmdgcnMachineID);
1571 if (!R) {
1572 DP("Supported machine ID not found\n"){};
1573 }
1574 return R;
1575}
1576
1577uint32_t elfEFlags(__tgt_device_image *Image) {
1578 const char *ImgBegin = (char *)Image->ImageStart;
1579 size_t ImgSize = (char *)Image->ImageEnd - ImgBegin;
1580
1581 StringRef Buffer = StringRef(ImgBegin, ImgSize);
1582 auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
1583 /*InitContent=*/false);
1584 if (!ElfOrErr) {
1585 consumeError(ElfOrErr.takeError());
1586 return 0;
1587 }
1588
1589 if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get()))
1590 return ELFObj->getPlatformFlags();
1591 return 0;
1592}
1593
1594template <typename T> bool enforceUpperBound(T *Value, T Upper) {
1595 bool Changed = *Value > Upper;
1596 if (Changed) {
1597 *Value = Upper;
1598 }
1599 return Changed;
1600}
1601
1602struct SymbolInfo {
1603 const void *Addr = nullptr;
1604 uint32_t Size = UINT32_MAX;
1605 uint32_t ShType = SHT_NULL;
1606};
1607
1608int getSymbolInfoWithoutLoading(const ELFObjectFile<ELF64LE> &ELFObj,
1609 StringRef SymName, SymbolInfo *Res) {
1610 auto SymOrErr = getELFSymbol(ELFObj, SymName);
1611 if (!SymOrErr) {
1612 std::string ErrorString = toString(SymOrErr.takeError());
1613 DP("Failed ELF lookup: %s\n", ErrorString.c_str()){};
1614 return 1;
1615 }
1616 if (!*SymOrErr)
1617 return 1;
1618
1619 auto SymSecOrErr = ELFObj.getELFFile().getSection((*SymOrErr)->st_shndx);
1620 if (!SymSecOrErr) {
1621 std::string ErrorString = toString(SymSecOrErr.takeError());
1622 DP("Failed ELF lookup: %s\n", ErrorString.c_str());
1623 return 1;
1624 }
1625
1626 Res->Addr = (*SymOrErr)->st_value + ELFObj.getELFFile().base();
1627 Res->Size = static_cast<uint32_t>((*SymOrErr)->st_size);
1628 Res->ShType = static_cast<uint32_t>((*SymSecOrErr)->sh_type);
1629 return 0;
1630}
1631
1632int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *SymName,
1633 SymbolInfo *Res) {
1634 StringRef Buffer = StringRef(Base, ImgSize);
1635 auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
1636 /*InitContent=*/false);
1637 if (!ElfOrErr) {
1638 REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str())do { fprintf(stderr, "AMDGPU" " error: "); fprintf(stderr, "Failed to load ELF: %s\n"
, toString(ElfOrErr.takeError()).c_str()); } while (0);
;
1639 return 1;
1640 }
1641
1642 if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get()))
1643 return getSymbolInfoWithoutLoading(*ELFObj, SymName, Res);
1644 return 1;
1645}
1646
1647hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize,
1648 const char *SymName, const void **VarAddr,
1649 uint32_t *VarSize) {
1650 SymbolInfo SI;
1651 int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI);
1652 if (Rc == 0) {
1653 *VarAddr = SI.Addr;
1654 *VarSize = SI.Size;
1655 return HSA_STATUS_SUCCESS;
1656 }
1657 return HSA_STATUS_ERROR;
1658}
1659
1660template <typename C>
1661hsa_status_t moduleRegisterFromMemoryToPlace(
1662 std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
1663 std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
1664 void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb,
1665 std::vector<hsa_executable_t> &HSAExecutables) {
1666 auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t {
1667 C *Unwrapped = static_cast<C *>(CbState);
1668 return (*Unwrapped)(Data, Size);
1669 };
1670 return core::RegisterModuleFromMemory(
1671 KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize,
1672 DeviceInfo().HSAAgents[DeviceId], L, static_cast<void *>(&Cb),
1673 HSAExecutables);
1674}
1675
1676uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) {
1677 uint64_t DeviceStateBytes = 0;
1678 {
1679 // If this is the deviceRTL, get the state variable size
1680 SymbolInfo SizeSi;
1681 int Rc = getSymbolInfoWithoutLoading(
1682 ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi);
1683
1684 if (Rc == 0) {
1685 if (SizeSi.Size != sizeof(uint64_t)) {
1686 DP("Found device_State_size variable with wrong size\n"){};
1687 return 0;
1688 }
1689
1690 // Read number of bytes directly from the elf
1691 memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t));
1692 }
1693 }
1694 return DeviceStateBytes;
1695}
1696
1697struct DeviceEnvironment {
1698 // Initialises a DeviceEnvironmentTy in the deviceRTL and
1699 // patches around differences in the deviceRTL between trunk, aomp and
1700 // rocmcc. Over time these differences will tend to zero and this class
1701 // can be simplified.
1702 // The symbol may be in .data or .bss, and may be missing fields; todo:
1703 // review aomp/trunk/rocm and simplify the following
1704
1705 // The symbol may also have been deadstripped because the device side
1706 // accessors were unused.
1707
1708 // If the symbol is in .data (aomp, rocm) it can be written directly.
1709 // If it is in .bss, we must wait for it to be allocated space on the
1710 // gpu (trunk) and initialize after loading.
1711 const char *sym() { return "__omp_rtl_device_environment"; }
1712
1713 DeviceEnvironmentTy HostDeviceEnv;
1714 SymbolInfo SI;
1715 bool Valid = false;
1716
1717 __tgt_device_image *Image;
1718 const size_t ImgSize;
1719
1720 DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize,
1721 __tgt_device_image *Image, const size_t ImgSize)
1722 : Image(Image), ImgSize(ImgSize) {
1723
1724 HostDeviceEnv.NumDevices = NumberDevices;
1725 HostDeviceEnv.DeviceNum = DeviceId;
1726 HostDeviceEnv.DebugKind = 0;
1727 HostDeviceEnv.DynamicMemSize = DynamicMemSize;
1728 if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
1729 HostDeviceEnv.DebugKind = std::stoi(EnvStr);
1730
1731 int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize,
1732 sym(), &SI);
1733 if (Rc != 0) {
1734 DP("Finding global device environment '%s' - symbol missing.\n", sym()){};
1735 return;
1736 }
1737
1738 if (SI.Size > sizeof(HostDeviceEnv)) {
1739 DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size,{}
1740 sizeof(HostDeviceEnv)){};
1741 return;
1742 }
1743
1744 Valid = true;
1745 }
1746
1747 bool inImage() { return SI.ShType != SHT_NOBITS; }
1748
1749 hsa_status_t beforeLoading(void *Data, size_t Size) {
1750 if (Valid) {
1751 if (inImage()) {
1752 DP("Setting global device environment before load (%u bytes)\n",{}
1753 SI.Size){};
1754 uint64_t Offset = reinterpret_cast<const char *>(SI.Addr) -
1755 reinterpret_cast<const char *>(Image->ImageStart);
1756 void *Pos = reinterpret_cast<char *>(Data) + Offset;
1757 memcpy(Pos, &HostDeviceEnv, SI.Size);
1758 }
1759 }
1760 return HSA_STATUS_SUCCESS;
1761 }
1762
1763 hsa_status_t afterLoading() {
1764 if (Valid) {
1765 if (!inImage()) {
1766 DP("Setting global device environment after load (%u bytes)\n",{}
1767 SI.Size){};
1768 int DeviceId = HostDeviceEnv.DeviceNum;
1769 auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
1770 void *StatePtr;
1771 uint32_t StatePtrSize;
1772 hsa_status_t Err = interop_hsa_get_symbol_info(
1773 SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize);
1774 if (Err != HSA_STATUS_SUCCESS) {
1775 DP("failed to find %s in loaded image\n", sym()){};
1776 return Err;
1777 }
1778
1779 if (StatePtrSize != SI.Size) {
1780 DP("Symbol had size %u before loading, %u after\n", StatePtrSize,{}
1781 SI.Size){};
1782 return HSA_STATUS_ERROR;
1783 }
1784
1785 return DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv,
1786 StatePtrSize, DeviceId);
1787 }
1788 }
1789 return HSA_STATUS_SUCCESS;
1790 }
1791};
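Note: the before/after split follows directly from SI.ShType. If the symbol was placed in .data (SHT_PROGBITS) its bytes exist in the image, so HostDeviceEnv can be memcpy'd into the image buffer before loading; if it was placed in .bss (SHT_NOBITS) there are no bytes in the image to patch, so the value is written with an H2D copy once the executable is loaded and the symbol has a device address. For illustration only (the real declaration lives in the deviceRTL, not in this file), the device-side symbol behaves roughly like:

// DeviceEnvironmentTy __omp_rtl_device_environment;          // zero-init -> .bss, patched after load
// DeviceEnvironmentTy __omp_rtl_device_environment = {...};  // initialized -> .data, patched in the image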
1792
1793hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) {
1794 uint64_t Rounded = 4 * ((Size + 3) / 4);
1795 void *Ptr;
1796 hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId);
1797 hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr);
1798 if (Err != HSA_STATUS_SUCCESS) {
1799 return Err;
1800 }
1801
1802 hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4);
1803 if (Rc != HSA_STATUS_SUCCESS) {
1804 DP("zero fill device_state failed with %u\n", Rc){};
1805 core::Runtime::Memfree(Ptr);
1806 return HSA_STATUS_ERROR;
1807 }
1808
1809 *RetPtr = Ptr;
1810 return HSA_STATUS_SUCCESS;
1811}
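Note: hsa_amd_memory_fill writes 32-bit words, which is why the size is rounded up to a multiple of 4 bytes and the fill count is expressed in words. A quick worked example with illustrative values:

// Size = 10  ->  Rounded = 4 * ((10 + 3) / 4) = 12 bytes  ->  fill count = 12 / 4 = 3 words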
1812
1813bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) {
1814 SymbolInfo SI;
1815 int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI);
1816 return (Rc == 0) && (SI.Addr != nullptr);
1817}
1818
1819hsa_status_t lock_memory(void *HostPtr, size_t Size, hsa_agent_t Agent,
1820 void **LockedHostPtr) {
1821 hsa_status_t err = is_locked(HostPtr, LockedHostPtr);
1822 if (err != HSA_STATUS_SUCCESS)
1823 return err;
1824
1825 // HostPtr is already locked, just return it
1826 if (*LockedHostPtr)
1827 return HSA_STATUS_SUCCESS;
1828
1829 hsa_agent_t Agents[1] = {Agent};
1830 return hsa_amd_memory_lock(HostPtr, Size, Agents, /*num_agent=*/1,
1831 LockedHostPtr);
1832}
1833
1834hsa_status_t unlock_memory(void *HostPtr) {
1835 void *LockedHostPtr = nullptr;
1836 hsa_status_t err = is_locked(HostPtr, &LockedHostPtr);
1837 if (err != HSA_STATUS_SUCCESS)
1838 return err;
1839
1840 // if LockedHostPtr is nullptr, then HostPtr was not locked
1841 if (!LockedHostPtr)
1842 return HSA_STATUS_SUCCESS;
1843
1844 err = hsa_amd_memory_unlock(HostPtr);
1845 return err;
1846}
1847
1848} // namespace
1849
1850namespace core {
1851hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) {
1852 return hsa_amd_agents_allow_access(DeviceInfo().HSAAgents.size(),
1853 &DeviceInfo().HSAAgents[0], NULL, Ptr);
1854}
1855} // namespace core
1856
1857static hsa_status_t GetIsaInfo(hsa_isa_t isa, void *data) {
1858 hsa_status_t err;
1859 uint32_t name_len;
1860 err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME_LENGTH, &name_len);
1861 if (err != HSA_STATUS_SUCCESS) {
1862 DP("Error getting ISA info length\n"){};
1863 return err;
1864 }
1865
1866 char TargetID[name_len];
1867 err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, TargetID);
1868 if (err != HSA_STATUS_SUCCESS) {
1869 DP("Error getting ISA info name\n"){};
1870 return err;
1871 }
1872
1873 auto TripleTargetID = llvm::StringRef(TargetID);
1874 if (TripleTargetID.consume_front("amdgcn-amd-amdhsa")) {
1875 DeviceInfo().TargetID.push_back(TripleTargetID.ltrim('-').str());
1876 }
1877 return HSA_STATUS_SUCCESS;
1878}
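Note: an HSA ISA name is the full triple plus target ID. As a hypothetical example (not a value observed in this run), "amdgcn-amd-amdhsa--gfx90a:sramecc+:xnack-" would be stored as the target ID "gfx90a:sramecc+:xnack-" after consume_front and ltrim:

// "amdgcn-amd-amdhsa--gfx90a:sramecc+:xnack-"  ->  TargetID "gfx90a:sramecc+:xnack-"

That stored string is what isImageCompatibleWithEnv later compares against the image's Arch field.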
1879
1880extern "C" {
1881int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
1882 return elfMachineIdIsAmdgcn(Image);
1883}
1884
1885int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image,
1886 __tgt_image_info *info) {
1887 if (!__tgt_rtl_is_valid_binary(image))
1888 return false;
1889
1890 // A subarchitecture was not specified. Assume it is compatible.
1891 if (!info->Arch)
1892 return true;
1893
1894 int32_t NumberOfDevices = __tgt_rtl_number_of_devices();
1895
1896 for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
1897 __tgt_rtl_init_device(DeviceId);
1898 hsa_agent_t agent = DeviceInfo().HSAAgents[DeviceId];
1899 hsa_status_t err = hsa_agent_iterate_isas(agent, GetIsaInfo, &DeviceId);
1900 if (err != HSA_STATUS_SUCCESS) {
1901 DP("Error iterating ISAs\n"){};
1902 return false;
1903 }
1904 if (!isImageCompatibleWithEnv(info, DeviceInfo().TargetID[DeviceId]))
1905 return false;
1906 }
1907 DP("Image has Target ID compatible with the current environment: %s\n",{}
1908 info->Arch){};
1909 return true;
1910}
1911
1912 int32_t __tgt_rtl_init_plugin() { return OFFLOAD_SUCCESS; }
1913 int32_t __tgt_rtl_deinit_plugin() { return OFFLOAD_SUCCESS; }
1914
1915int __tgt_rtl_number_of_devices() {
1916 // If the construction failed, no methods are safe to call
1917 if (DeviceInfo().ConstructionSucceeded) {
1918 return DeviceInfo().NumberOfDevices;
1919 }
1920 DP("AMDGPU plugin construction failed. Zero devices available\n"){};
1921 return 0;
1922}
1923
1924int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
1925 DP("Init requires flags to %ld\n", RequiresFlags){};
1926 DeviceInfo().RequiresFlags = RequiresFlags;
1927 return RequiresFlags;
1928}
1929
1930int32_t __tgt_rtl_init_device(int DeviceId) {
1931 hsa_status_t Err = hsa_init();
1932 if (Err != HSA_STATUS_SUCCESS) {
1933 DP("HSA Initialization Failed.\n"){};
1934 return HSA_STATUS_ERROR;
1935 }
1936 // this is per device id init
1937 DP("Initialize the device id: %d\n", DeviceId){};
1938
1939 hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId];
1940
1941 // Get number of Compute Unit
1942 uint32_t ComputeUnits = 0;
1943 Err = hsa_agent_get_info(
1944 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
1945 &ComputeUnits);
1946 if (Err != HSA_STATUS_SUCCESS) {
1947 DeviceInfo().ComputeUnits[DeviceId] = 1;
1948 DP("Error getting compute units : settiing to 1\n"){};
1949 } else {
1950 DeviceInfo().ComputeUnits[DeviceId] = ComputeUnits;
1951 DP("Using %d compute unis per grid\n", DeviceInfo().ComputeUnits[DeviceId]){};
1952 }
1953
1954 char GetInfoName[64]; // 64 max size returned by get info
1955 Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
1956 (void *)GetInfoName);
1957 if (Err)
1958 DeviceInfo().GPUName[DeviceId] = "--unknown gpu--";
1959 else {
1960 DeviceInfo().GPUName[DeviceId] = GetInfoName;
1961 }
1962
1963 if (print_kernel_trace & STARTUP_DETAILS)
1964 DP("Device#%-2d CU's: %2d %s\n", DeviceId,{}
1965 DeviceInfo().ComputeUnits[DeviceId],{}
1966 DeviceInfo().GPUName[DeviceId].c_str()){};
1967
1968 // Query attributes to determine number of threads/block and blocks/grid.
1969 uint16_t WorkgroupMaxDim[3];
1970 Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
1971 &WorkgroupMaxDim);
1972 if (Err != HSA_STATUS_SUCCESS) {
1973 DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams;
1974 DP("Error getting grid dims: num groups : %d\n",{}
1975 RTLDeviceInfoTy::DefaultNumTeams){};
1976 } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
1977 DeviceInfo().GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0];
1978 DP("Using %d ROCm blocks per grid\n",{}
1979 DeviceInfo().GroupsPerDevice[DeviceId]){};
1980 } else {
1981 DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit;
1982 DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "{}
1983 "at the hard limit\n",{}
1984 WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit){};
1985 }
1986
1987 // Get thread limit
1988 hsa_dim3_t GridMaxDim;
1989 Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim);
1990 if (Err == HSA_STATUS_SUCCESS) {
1991 DeviceInfo().ThreadsPerGroup[DeviceId] =
1992 reinterpret_cast<uint32_t *>(&GridMaxDim)[0] /
1993 DeviceInfo().GroupsPerDevice[DeviceId];
1994
1995 if (DeviceInfo().ThreadsPerGroup[DeviceId] == 0) {
1996 DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
1997 DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize){};
1998 } else if (enforceUpperBound(&DeviceInfo().ThreadsPerGroup[DeviceId],
1999 RTLDeviceInfoTy::MaxWgSize)) {
2000 DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize){};
2001 } else {
2002 DP("Using ROCm Queried thread limit: %d\n",{}
2003 DeviceInfo().ThreadsPerGroup[DeviceId]){};
2004 }
2005 } else {
2006 DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
2007 DP("Error getting max block dimension, use default:%d \n",{}
2008 RTLDeviceInfoTy::MaxWgSize){};
2009 }
2010
2011 // Get wavefront size
2012 uint32_t WavefrontSize = 0;
2013 Err =
2014 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize);
2015 if (Err == HSA_STATUS_SUCCESS) {
2016 DP("Queried wavefront size: %d\n", WavefrontSize){};
2017 DeviceInfo().WarpSize[DeviceId] = WavefrontSize;
2018 } else {
2019 // TODO: Burn the wavefront size into the code object
2020 DP("Warning: Unknown wavefront size, assuming 64\n"){};
2021 DeviceInfo().WarpSize[DeviceId] = 64;
2022 }
2023
2024 // Adjust teams to the env variables
2025
2026 if (DeviceInfo().Env.TeamLimit > 0 &&
2027 (enforceUpperBound(&DeviceInfo().GroupsPerDevice[DeviceId],
2028 DeviceInfo().Env.TeamLimit))) {
2029 DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",{}
2030 DeviceInfo().Env.TeamLimit){};
2031 }
2032
2033 // Set default number of teams
2034 if (DeviceInfo().Env.NumTeams > 0) {
2035 DeviceInfo().NumTeams[DeviceId] = DeviceInfo().Env.NumTeams;
2036 DP("Default number of teams set according to environment %d\n",{}
2037 DeviceInfo().Env.NumTeams){};
2038 } else {
2039 char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC");
2040 int TeamsPerCU = DefaultTeamsPerCU;
2041 if (TeamsPerCUEnvStr) {
2042 TeamsPerCU = std::stoi(TeamsPerCUEnvStr);
2043 }
2044
2045 DeviceInfo().NumTeams[DeviceId] =
2046 TeamsPerCU * DeviceInfo().ComputeUnits[DeviceId];
2047 DP("Default number of teams = %d * number of compute units %d\n",{}
2048 TeamsPerCU, DeviceInfo().ComputeUnits[DeviceId]){};
2049 }
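Note: when OMP_NUM_TEAMS is not set, the default team count scales with the device: NumTeams = TeamsPerCU * ComputeUnits, with TeamsPerCU coming from OMP_TARGET_TEAMS_PER_PROC or falling back to DefaultTeamsPerCU. A worked example with hypothetical numbers:

// TeamsPerCU = 4, ComputeUnits = 104  ->  NumTeams = 4 * 104 = 416 teams
// (the enforceUpperBound call below may still cap this to GroupsPerDevice)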
2050
2051 if (enforceUpperBound(&DeviceInfo().NumTeams[DeviceId],
2052 DeviceInfo().GroupsPerDevice[DeviceId])) {
2053 DP("Default number of teams exceeds device limit, capping at %d\n",{}
2054 DeviceInfo().GroupsPerDevice[DeviceId]){};
2055 }
2056
2057 // Adjust threads to the env variables
2058 if (DeviceInfo().Env.TeamThreadLimit > 0 &&
2059 (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId],
2060 DeviceInfo().Env.TeamThreadLimit))) {
2061 DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n",{}
2062 DeviceInfo().Env.TeamThreadLimit){};
2063 }
2064
2065 // Set default number of threads
2066 DeviceInfo().NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize;
2067 DP("Default number of threads set according to library's default %d\n",{}
2068 RTLDeviceInfoTy::DefaultWgSize){};
2069 if (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId],
2070 DeviceInfo().ThreadsPerGroup[DeviceId])) {
2071 DP("Default number of threads exceeds device limit, capping at %d\n",{}
2072 DeviceInfo().ThreadsPerGroup[DeviceId]){};
2073 }
2074
2075 DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",{}
2076 DeviceId, DeviceInfo().GroupsPerDevice[DeviceId],{}
2077 DeviceInfo().ThreadsPerGroup[DeviceId]){};
2078
2079 DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId,{}
2080 DeviceInfo().WarpSize[DeviceId], DeviceInfo().ThreadsPerGroup[DeviceId],{}
2081 DeviceInfo().GroupsPerDevice[DeviceId],{}
2082 DeviceInfo().GroupsPerDevice[DeviceId] *{}
2083 DeviceInfo().ThreadsPerGroup[DeviceId]){};
2084
2085 return OFFLOAD_SUCCESS(0);
2086}
2087
2088static __tgt_target_table *
2089__tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image);
2090
2091__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
2092 __tgt_device_image *Image) {
2093 DeviceInfo().LoadRunLock.lock();
2094 __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image);
2095 DeviceInfo().LoadRunLock.unlock();
2096 return Res;
2097}
2098
2099__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
2100 __tgt_device_image *Image) {
2101 // This function loads the device image onto gpu[DeviceId] and does other
2102 // per-image initialization work. Specifically:
2103 //
2104 // - Initialize a DeviceEnvironmentTy instance embedded in the
2105 // image at the symbol "__omp_rtl_device_environment"
2106 // Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL.
2107 //
2108 // - Allocate a large array per-gpu (could be moved to init_device)
2109 // - Read a uint64_t at symbol omptarget_nvptx_device_State_size
2110 // - Allocate at least that many bytes of gpu memory
2111 // - Zero initialize it
2112 // - Write the pointer to the symbol omptarget_nvptx_device_State
2113 //
2114 // - Pulls some per-kernel information together from various sources and
2115 // records it in the KernelsList for quicker access later
2116 //
2117 // The initialization can be done before or after loading the image onto the
2118 // gpu. This function presently does a mixture. Using the hsa api to get/set
2119 // the information is simpler to implement, in exchange for more complicated
2120 // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes
2121 // back from the gpu vs a hashtable lookup on the host.
2122
2123 const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart;
2124
2125 DeviceInfo().clearOffloadEntriesTable(DeviceId);
2126
2127 // We do not need to set the ELF version because the caller of this function
2128 // had to do that to decide the right runtime to use
2129
2130 if (!elfMachineIdIsAmdgcn(Image))
2131 return NULL;
2132
2133 {
2134 auto Env =
2135 DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
2136 DeviceInfo().Env.DynamicMemSize, Image, ImgSize);
2137
2138 auto &KernelInfo = DeviceInfo().KernelInfoTable[DeviceId];
2139 auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
2140 hsa_status_t Err = moduleRegisterFromMemoryToPlace(
2141 KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId,
2142 [&](void *Data, size_t Size) {
2143 if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) {
2144 __atomic_store_n(&DeviceInfo().HostcallRequired, true,
2145 __ATOMIC_RELEASE);
2146 }
2147 return Env.beforeLoading(Data, Size);
2148 },
2149 DeviceInfo().HSAExecutables);
2150
2151 check("Module registering", Err){};
2152 if (Err != HSA_STATUS_SUCCESS) {
2153 const char *DeviceName = DeviceInfo().GPUName[DeviceId].c_str();
2154 const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image));
2155
2156 if (strcmp(DeviceName, ElfName) != 0) {
2157 DP("Possible gpu arch mismatch: device:%s, image:%s please check"{}
2158 " compiler flag: -march=<gpu>\n",{}
2159 DeviceName, ElfName){};
2160 } else {
2161 DP("Error loading image onto GPU: %s\n", get_error_string(Err)){};
2162 }
2163
2164 return NULL__null;
2165 }
2166
2167 Err = Env.afterLoading();
2168 if (Err != HSA_STATUS_SUCCESS) {
2169 return NULL;
2170 }
2171 }
2172
2173 DP("AMDGPU module successfully loaded!\n"){};
2174
2175 {
2176 // the device_State array is either a large value in bss or a void* that
2177 // needs to be assigned to a pointer to an array of size device_state_bytes.
2178 // If absent, it has been deadstripped and needs no setup.
2179
2180 void *StatePtr;
2181 uint32_t StatePtrSize;
2182 auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
2183 hsa_status_t Err = interop_hsa_get_symbol_info(
2184 SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr,
2185 &StatePtrSize);
2186
2187 if (Err != HSA_STATUS_SUCCESS) {
2188 DP("No device_state symbol found, skipping initialization\n"){};
2189 } else {
2190 if (StatePtrSize < sizeof(void *)) {
2191 DP("unexpected size of state_ptr %u != %zu\n", StatePtrSize,{}
2192 sizeof(void *)){};
2193 return NULL__null;
2194 }
2195
2196 // if it's larger than a void*, assume it's a bss array and no further
2197 // initialization is required. Only try to set up a pointer for
2198 // sizeof(void*)
2199 if (StatePtrSize == sizeof(void *)) {
2200 uint64_t DeviceStateBytes =
2201 getDeviceStateBytes((char *)Image->ImageStart, ImgSize);
2202 if (DeviceStateBytes == 0) {
2203 DP("Can't initialize device_State, missing size information\n"){};
2204 return NULL__null;
2205 }
2206
2207 auto &DSS = DeviceInfo().DeviceStateStore[DeviceId];
2208 if (DSS.first.get() == nullptr) {
2209 assert(DSS.second == 0);
2210 void *Ptr = NULL;
2211 hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId);
2212 if (Err != HSA_STATUS_SUCCESS) {
2213 DP("Failed to allocate device_state array\n");
2214 return NULL;
2215 }
2216 DSS = {
2217 std::unique_ptr<void, RTLDeviceInfoTy::ImplFreePtrDeletor>{Ptr},
2218 DeviceStateBytes,
2219 };
2220 }
2221
2222 void *Ptr = DSS.first.get();
2223 if (DeviceStateBytes != DSS.second) {
2224 DP("Inconsistent sizes of device_State unsupported\n"){};
2225 return NULL__null;
2226 }
2227
2228 // write ptr to device memory so it can be used by later kernels
2229 Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr,
2230 sizeof(void *), DeviceId);
2231 if (Err != HSA_STATUS_SUCCESS) {
2232 DP("memcpy install of state_ptr failed\n"){};
2233 return NULL__null;
2234 }
2235 }
2236 }
2237 }
2238
2239 // Here, we take advantage of the data that is appended after img_end to get
2240 // the symbols' name we need to load. This data consists of the host entries
2241 // begin and end as well as the target name (see the offloading linker script
2242 // creation in clang compiler).
2243
2244 // Find the symbols in the module by name. The name can be obtained by
2245 // concatenating the host entry name with the target name
2246
2247 __tgt_offload_entry *HostBegin = Image->EntriesBegin;
2248 __tgt_offload_entry *HostEnd = Image->EntriesEnd;
2249
2250 for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
2251
2252 if (!E->addr) {
2253 // The host should have always something in the address to
2254 // uniquely identify the target region.
2255 DP("Analyzing host entry '<null>' (size = %lld)...\n",{}
2256 (unsigned long long)E->size){};
2257 return NULL__null;
2258 }
2259
2260 if (E->size) {
2261 __tgt_offload_entry Entry = *E;
2262
2263 void *Varptr;
2264 uint32_t Varsize;
2265
2266 auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
2267 hsa_status_t Err = interop_hsa_get_symbol_info(
2268 SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize);
2269
2270 if (Err != HSA_STATUS_SUCCESS) {
2271 // Inform the user what symbol prevented offloading
2272 DP("Loading global '%s' (Failed)\n", E->name){};
2273 return NULL__null;
2274 }
2275
2276 if (Varsize != E->size) {
2277 DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name,{}
2278 Varsize, E->size){};
2279 return NULL__null;
2280 }
2281
2282 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",{}
2283 DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr)){};
2284 Entry.addr = (void *)Varptr;
2285
2286 DeviceInfo().addOffloadEntry(DeviceId, Entry);
2287
2288 if (DeviceInfo().RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
2289 E->flags & OMP_DECLARE_TARGET_LINK) {
2290 // If unified memory is present any target link variables
2291 // can access host addresses directly. There is no longer a
2292 // need for device copies.
2293 Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr,
2294 sizeof(void *), DeviceId);
2295 if (Err != HSA_STATUS_SUCCESS)
2296 DP("Error when copying USM\n"){};
2297 DP("Copy linked variable host address (" DPxMOD ")"{}
2298 "to device address (" DPxMOD ")\n",{}
2299 DPxPTR(*((void **)E->addr)), DPxPTR(Varptr)){};
2300 }
2301
2302 continue;
2303 }
2304
2305 DP("to find the kernel name: %s size: %lu\n", E->name, strlen(E->name)){};
2306
2307 // errors in kernarg_segment_size previously treated as = 0 (or as undef)
2308 uint32_t KernargSegmentSize = 0;
2309 auto &KernelInfoMap = DeviceInfo().KernelInfoTable[DeviceId];
2310 hsa_status_t Err = HSA_STATUS_SUCCESS;
2311 if (!E->name) {
2312 Err = HSA_STATUS_ERROR;
2313 } else {
2314 std::string KernelStr = std::string(E->name);
2315 auto It = KernelInfoMap.find(KernelStr);
2316 if (It != KernelInfoMap.end()) {
2317 atl_kernel_info_t Info = It->second;
2318 KernargSegmentSize = Info.kernel_segment_size;
2319 } else {
2320 Err = HSA_STATUS_ERROR;
Value stored to 'Err' is never read
2321 }
2322 }
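Note: this is the dead store the analyzer reports. The HSA_STATUS_ERROR written to 'Err' at line 2320 is never examined; the next use of 'Err' is the unconditional reassignment by the interopGetSymbolInfo call at line 2346, so a missing KernelInfoTable entry silently leaves KernargSegmentSize at 0 instead of being reported. A hedged sketch of one possible cleanup, assuming the intent was only to note the failed lookup (this is an illustration, not the upstream fix):

//   } else {
//     DP("Kernel '%s' not found in KernelInfoTable, using kernarg_segment_size 0\n",
//        KernelStr.c_str());
//   }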
2323
2324 // default value GENERIC (in case symbol is missing from cubin file)
2325 llvm::omp::OMPTgtExecModeFlags ExecModeVal =
2326 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
2327
2328 // get flat group size if present, else Default_WG_Size
2329 int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
2330
2331 // get Kernel Descriptor if present.
2332 // Keep struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp
2333 struct KernDescValType {
2334 uint16_t Version;
2335 uint16_t TSize;
2336 uint16_t WGSize;
2337 };
2338 struct KernDescValType KernDescVal;
2339 std::string KernDescNameStr(E->name);
2340 KernDescNameStr += "_kern_desc";
2341 const char *KernDescName = KernDescNameStr.c_str();
2342
2343 const void *KernDescPtr;
2344 uint32_t KernDescSize;
2345 void *CallStackAddr = nullptr;
2346 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, KernDescName,
2347 &KernDescPtr, &KernDescSize);
2348
2349 if (Err == HSA_STATUS_SUCCESS) {
2350 if ((size_t)KernDescSize != sizeof(KernDescVal))
2351 DP("Loading global computation properties '%s' - size mismatch (%u != "{}
2352 "%lu)\n",{}
2353 KernDescName, KernDescSize, sizeof(KernDescVal)){};
2354
2355 memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);
2356
2357 // Check structure size against recorded size.
2358 if ((size_t)KernDescSize != KernDescVal.TSize)
2359 DP("KernDescVal size %lu does not match advertized size %d for '%s'\n",{}
2360 sizeof(KernDescVal), KernDescVal.TSize, KernDescName){};
2361
2362 DP("After loading global for %s KernDesc \n", KernDescName){};
2363 DP("KernDesc: Version: %d\n", KernDescVal.Version){};
2364 DP("KernDesc: TSize: %d\n", KernDescVal.TSize){};
2365 DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize){};
2366
2367 if (KernDescVal.WGSize == 0) {
2368 KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize;
2369 DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WGSize){};
2370 }
2371 WGSizeVal = KernDescVal.WGSize;
2372 DP("WGSizeVal %d\n", WGSizeVal){};
2373 check("Loading KernDesc computation property", Err){};
2374 } else {
2375 DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName){};
2376
2377 // Flat group size
2378 std::string WGSizeNameStr(E->name);
2379 WGSizeNameStr += "_wg_size";
2380 const char *WGSizeName = WGSizeNameStr.c_str();
2381
2382 const void *WGSizePtr;
2383 uint32_t WGSize;
2384 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, WGSizeName,
2385 &WGSizePtr, &WGSize);
2386
2387 if (Err == HSA_STATUS_SUCCESS) {
2388 if ((size_t)WGSize != sizeof(int16_t)) {
2389 DP("Loading global computation properties '%s' - size mismatch (%u "{}
2390 "!= "{}
2391 "%lu)\n",{}
2392 WGSizeName, WGSize, sizeof(int16_t)){};
2393 return NULL__null;
2394 }
2395
2396 memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize);
2397
2398 DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal){};
2399
2400 if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize ||
2401 WGSizeVal > RTLDeviceInfoTy::MaxWgSize) {
2402 DP("Error wrong WGSize value specified in HSA code object file: "{}
2403 "%d\n",{}
2404 WGSizeVal){};
2405 WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
2406 }
2407 } else {
2408 DP("Warning: Loading WGSize '%s' - symbol not found, "{}
2409 "using default value %d\n",{}
2410 WGSizeName, WGSizeVal){};
2411 }
2412
2413 check("Loading WGSize computation property", Err){};
2414 }
2415
2416 // Read execution mode from global in binary
2417 std::string ExecModeNameStr(E->name);
2418 ExecModeNameStr += "_exec_mode";
2419 const char *ExecModeName = ExecModeNameStr.c_str();
2420
2421 const void *ExecModePtr;
2422 uint32_t VarSize;
2423 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, ExecModeName,
2424 &ExecModePtr, &VarSize);
2425
2426 if (Err == HSA_STATUS_SUCCESS) {
2427 if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
2428 DP("Loading global computation properties '%s' - size mismatch(%u != "{}
2429 "%lu)\n",{}
2430 ExecModeName, VarSize, sizeof(llvm::omp::OMPTgtExecModeFlags)){};
2431 return NULL__null;
2432 }
2433
2434 memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize);
2435
2436 DP("After loading global for %s ExecMode = %d\n", ExecModeName,{}
2437 ExecModeVal){};
2438
2439 if (ExecModeVal < 0 ||
2440 ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) {
2441 DP("Error wrong exec_mode value specified in HSA code object file: "{}
2442 "%d\n",{}
2443 ExecModeVal){};
2444 return NULL__null;
2445 }
2446 } else {
2447 DP("Loading global exec_mode '%s' - symbol missing, using default "{}
2448 "value "{}
2449 "GENERIC (1)\n",{}
2450 ExecModeName){};
2451 }
2452 check("Loading computation property", Err){};
2453
2454 KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
2455 CallStackAddr, E->name, KernargSegmentSize,
2456 DeviceInfo().KernArgPool));
2457 __tgt_offload_entry Entry = *E;
2458 Entry.addr = (void *)&KernelsList.back();
2459 DeviceInfo().addOffloadEntry(DeviceId, Entry);
2460 DP("Entry point %ld maps to %s\n", E - HostBegin, E->name){};
2461 }
2462
2463 return DeviceInfo().getOffloadEntriesTable(DeviceId);
2464}
2465
2466void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) {
2467 void *Ptr = NULL;
2468 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2469
2470 hsa_amd_memory_pool_t MemoryPool;
2471 switch (Kind) {
2472 case TARGET_ALLOC_DEFAULT:
2473 case TARGET_ALLOC_DEVICE:
2474 // GPU memory
2475 MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId);
2476 break;
2477 case TARGET_ALLOC_HOST:
2478 // non-migratable memory accessible by host and device(s)
2479 MemoryPool = DeviceInfo().getHostMemoryPool();
2480 break;
2481 default:
2482 REPORT("Invalid target data allocation kind or requested allocator not "do { fprintf(stderr, "AMDGPU" " error: "); fprintf(stderr, "Invalid target data allocation kind or requested allocator not "
"implemented yet\n"); } while (0);
2483 "implemented yet\n")do { fprintf(stderr, "AMDGPU" " error: "); fprintf(stderr, "Invalid target data allocation kind or requested allocator not "
"implemented yet\n"); } while (0);
;
2484 return NULL__null;
2485 }
2486
2487 hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr);
2488 DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size,{}
2489 (long long unsigned)(Elf64_Addr)Ptr){};
2490 Ptr = (Err == HSA_STATUS_SUCCESS) ? Ptr : NULL__null;
2491 return Ptr;
2492}
2493
2494int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr,
2495 int64_t Size) {
2496 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2497 __tgt_async_info AsyncInfo;
2498 int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
2499 if (Rc != OFFLOAD_SUCCESS)
2500 return OFFLOAD_FAIL;
2501
2502 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
2503}
2504
2505int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr,
2506 int64_t Size, __tgt_async_info *AsyncInfo) {
2507 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2508 if (AsyncInfo) {
2509 initAsyncInfo(AsyncInfo);
2510 return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo);
2511 }
2512 return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size);
2513}
2514
2515int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr,
2516 int64_t Size) {
2517 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2518 __tgt_async_info AsyncInfo;
2519 int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
2520 if (Rc != OFFLOAD_SUCCESS)
2521 return OFFLOAD_FAIL;
2522
2523 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
2524}
2525
2526int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr,
2527 int64_t Size,
2528 __tgt_async_info *AsyncInfo) {
2529 assert(AsyncInfo && "AsyncInfo is nullptr");
2530 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2531 initAsyncInfo(AsyncInfo);
2532 return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo);
2533}
2534
2535int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr, int32_t) {
2536 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2537 // HSA can free pointers allocated from different types of memory pool.
2538 hsa_status_t Err;
2539 DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr){};
2540 Err = core::Runtime::Memfree(TgtPtr);
2541 if (Err != HSA_STATUS_SUCCESS) {
2542 DP("Error when freeing CUDA memory\n"){};
2543 return OFFLOAD_FAIL(~0);
2544 }
2545 return OFFLOAD_SUCCESS(0);
2546}
2547
2548int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
2549 void **TgtArgs, ptrdiff_t *TgtOffsets,
2550 KernelArgsTy *KernelArgs,
2551 __tgt_async_info *AsyncInfo) {
2552 assert(!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] &&
2553 !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
2554 "Only one dimensional kernels supported.");
2555 assert(AsyncInfo && "AsyncInfo is nullptr");
2556 initAsyncInfo(AsyncInfo);
2557
2558 DeviceInfo().LoadRunLock.lock_shared();
2559 int32_t Res =
2560 runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
2561 KernelArgs->NumArgs, KernelArgs->NumTeams[0],
2562 KernelArgs->ThreadLimit[0], KernelArgs->Tripcount);
2563
2564 DeviceInfo().LoadRunLock.unlock_shared();
2565 return Res;
2566}
2567
2568int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) {
2569 assert(AsyncInfo && "AsyncInfo is nullptr");
2570
2571 // Cuda asserts that AsyncInfo->Queue is non-null, but this invariant
2572 // is not ensured by devices.cpp for amdgcn
2573 // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr");
2574 if (AsyncInfo->Queue) {
2575 finiAsyncInfo(AsyncInfo);
2576 }
2577 return OFFLOAD_SUCCESS;
2578}
2579
2580void __tgt_rtl_print_device_info(int32_t DeviceId) {
2581 // TODO: Assertion to see if DeviceId is correct
2582 // NOTE: We don't need to set context for print device info.
2583
2584 DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]);
2585}
2586
2587int32_t __tgt_rtl_data_lock(int32_t DeviceId, void *HostPtr, int64_t Size,
2588 void **LockedHostPtr) {
2589 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2590
2591 hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId];
2592 hsa_status_t err = lock_memory(HostPtr, Size, Agent, LockedHostPtr);
2593 if (err != HSA_STATUS_SUCCESS) {
2594 DP("Error in tgt_rtl_data_lock\n"){};
2595 return OFFLOAD_FAIL(~0);
2596 }
2597 DP("Tgt lock host data %ld bytes, (HostPtr:%016llx).\n", Size,{}
2598 (long long unsigned)(Elf64_Addr)*LockedHostPtr){};
2599 return OFFLOAD_SUCCESS(0);
2600}
2601
2602int32_t __tgt_rtl_data_unlock(int DeviceId, void *HostPtr) {
2603 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2604 hsa_status_t err = unlock_memory(HostPtr);
2605 if (err != HSA_STATUS_SUCCESS) {
2606 DP("Error in tgt_rtl_data_unlock\n"){};
2607 return OFFLOAD_FAIL(~0);
2608 }
2609
2610 DP("Tgt unlock data (tgt:%016llx).\n",{}
2611 (long long unsigned)(Elf64_Addr)HostPtr){};
2612 return OFFLOAD_SUCCESS(0);
2613}
2614
2615} // extern "C"