Bug Summary

File: build/source/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
Warning: line 2383, column 9
Value stored to 'Err' is never read
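
The 'deadcode' checker group enabled in the invocation below includes the dead-store check, which reports an assignment whose result is never read before the variable is overwritten or goes out of scope. A minimal illustrative sketch of the pattern (a hypothetical helper using calls that appear in this file, not the code at line 2383):

    static void example(hsa_queue_t *Queue) {
      hsa_status_t Err = hsa_queue_destroy(Queue);
      if (Err != HSA_STATUS_SUCCESS)        // this read of Err is fine
        DP("Error destroying hsa queue\n");
      Err = hsa_shut_down();                // flagged: value stored to 'Err' is never read
    }

A common fix is to check the second status as well, or to cast the call to (void) when the result is deliberately ignored.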

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name rtl.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-16/lib/clang/16.0.0 -D TARGET_NAME=AMDGPU -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -I projects/openmp/libomptarget/plugins/amdgpu -I /build/source/openmp/libomptarget/plugins/amdgpu -I include -I /build/source/llvm/include -I /build/source/openmp/libomptarget/plugins/amdgpu/dynamic_hsa -I /build/source/openmp/libomptarget/include -I /build/source/openmp/libomptarget/plugins/amdgpu/impl -I /build/source/openmp/libomptarget/plugins/common/elf_common -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-16/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/source/= -source-date-epoch 1668078801 -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -Wno-enum-constexpr-conversion -Wno-extra -Wno-pedantic -Wno-maybe-uninitialized -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=protected -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-11-10-135928-647445-1 -x c++ /build/source/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
1//===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// RTL for AMD hsa machine
10//
11//===----------------------------------------------------------------------===//
12
13#include "llvm/ADT/StringMap.h"
14#include "llvm/ADT/StringRef.h"
15#include "llvm/Frontend/OpenMP/OMPConstants.h"
16#include "llvm/Frontend/OpenMP/OMPGridValues.h"
17#include "llvm/Object/ELF.h"
18#include "llvm/Object/ELFObjectFile.h"
19
20#include <algorithm>
21#include <assert.h>
22#include <cstdio>
23#include <cstdlib>
24#include <cstring>
25#include <functional>
26#include <list>
27#include <memory>
28#include <mutex>
29#include <shared_mutex>
30#include <unordered_map>
31#include <vector>
32
33#include "ELFSymbols.h"
34#include "impl_runtime.h"
35#include "interop_hsa.h"
36
37#include "internal.h"
38#include "rt.h"
39
40#include "DeviceEnvironment.h"
41#include "get_elf_mach_gfx_name.h"
42#include "omptargetplugin.h"
43#include "print_tracing.h"
44
45using namespace llvm;
46using namespace llvm::object;
47using namespace llvm::ELF;
48
49// hostrpc interface, FIXME: consider moving to its own include these are
50// statically linked into amdgpu/plugin if present from hostrpc_services.a,
51// linked as --whole-archive to override the weak symbols that are used to
52// implement a fallback for toolchains that do not yet have a hostrpc library.
53extern "C" {
54uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ,
55 uint32_t DeviceId);
56hsa_status_t hostrpc_init();
57hsa_status_t hostrpc_terminate();
58
59__attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; }
60__attribute__((weak)) hsa_status_t hostrpc_terminate() {
61 return HSA_STATUS_SUCCESS;
62}
63__attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *,
64 uint32_t DeviceId) {
65 DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "{}
66 "missing\n",{}
67 DeviceId){};
68 return 0;
69}
70}
71
72// Heuristic parameters used for kernel launch
73// Number of teams per CU to allow scheduling flexibility
74static const unsigned DefaultTeamsPerCU = 4;
75
76int print_kernel_trace;
77
78#ifdef OMPTARGET_DEBUG
79#define check(msg, status){} \
80 if (status != HSA_STATUS_SUCCESS) { \
81 DP(#msg " failed\n"){}; \
82 } else { \
83 DP(#msg " succeeded\n"){}; \
84 }
85#else
86#define check(msg, status){} \
87 {}
88#endif
89
90#include "elf_common.h"
91
92namespace hsa {
93template <typename C> hsa_status_t iterate_agents(C Cb) {
94 auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t {
95 C *Unwrapped = static_cast<C *>(Data);
96 return (*Unwrapped)(Agent);
97 };
98 return hsa_iterate_agents(L, static_cast<void *>(&Cb));
99}
100
101template <typename C>
102hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) {
103 auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t {
104 C *Unwrapped = static_cast<C *>(Data);
105 return (*Unwrapped)(MemoryPool);
106 };
107
108 return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&Cb));
109}
110
111} // namespace hsa
112
113/// Keep entries table per device
114struct FuncOrGblEntryTy {
115 __tgt_target_table Table;
116 std::vector<__tgt_offload_entry> Entries;
117};
118
119struct KernelArgPool {
120private:
121 static pthread_mutex_t Mutex;
122
123public:
124 uint32_t KernargSegmentSize;
125 void *KernargRegion = nullptr;
126 std::queue<int> FreeKernargSegments;
127
128 uint32_t kernargSizeIncludingImplicit() {
129 return KernargSegmentSize + sizeof(impl_implicit_args_t);
130 }
131
132 ~KernelArgPool() {
133 if (KernargRegion) {
134 auto R = hsa_amd_memory_pool_free(KernargRegion);
135 if (R != HSA_STATUS_SUCCESS) {
136 DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R)){};
137 }
138 }
139 }
140
141 // Can't really copy or move a mutex
142 KernelArgPool() = default;
143 KernelArgPool(const KernelArgPool &) = delete;
144 KernelArgPool(KernelArgPool &&) = delete;
145
146 KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
147 : KernargSegmentSize(KernargSegmentSize) {
148
149 // impl uses one pool per kernel for all gpus, with a fixed upper size
150 // preserving that exact scheme here, including the queue<int>
151
152 hsa_status_t Err = hsa_amd_memory_pool_allocate(
153 MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS(1024 * 16), 0,
154 &KernargRegion);
155
156 if (Err != HSA_STATUS_SUCCESS) {
157 DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err)){};
158 KernargRegion = nullptr; // paranoid
159 return;
160 }
161
162 Err = core::allow_access_to_all_gpu_agents(KernargRegion);
163 if (Err != HSA_STATUS_SUCCESS) {
164 DP("hsa allow_access_to_all_gpu_agents failed: %s\n",{}
165 get_error_string(Err)){};
166 auto R = hsa_amd_memory_pool_free(KernargRegion);
167 if (R != HSA_STATUS_SUCCESS) {
168 // if free failed, can't do anything more to resolve it
169 DP("hsa memory poll free failed: %s\n", get_error_string(Err)){};
170 }
171 KernargRegion = nullptr;
172 return;
173 }
174
175 for (int I = 0; I < MAX_NUM_KERNELS(1024 * 16); I++) {
176 FreeKernargSegments.push(I);
177 }
178 }
179
180 void *allocate(uint64_t ArgNum) {
181 assert((ArgNum * sizeof(void *)) == KernargSegmentSize);
182 Lock L(&Mutex);
183 void *Res = nullptr;
184 if (!FreeKernargSegments.empty()) {
185
186 int FreeIdx = FreeKernargSegments.front();
187 Res = static_cast<void *>(static_cast<char *>(KernargRegion) +
188 (FreeIdx * kernargSizeIncludingImplicit()));
189 assert(FreeIdx == pointerToIndex(Res));
190 FreeKernargSegments.pop();
191 }
192 return Res;
193 }
194
195 void deallocate(void *Ptr) {
196 Lock L(&Mutex);
197 int Idx = pointerToIndex(Ptr);
198 FreeKernargSegments.push(Idx);
199 }
200
201private:
202 int pointerToIndex(void *Ptr) {
203 ptrdiff_t Bytes =
204 static_cast<char *>(Ptr) - static_cast<char *>(KernargRegion);
205 assert(Bytes >= 0);
206 assert(Bytes % kernargSizeIncludingImplicit() == 0);
207 return Bytes / kernargSizeIncludingImplicit();
208 }
209 struct Lock {
210 Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); }
211 ~Lock() { pthread_mutex_unlock(M); }
212 pthread_mutex_t *M;
213 };
214};
215pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER;
216
217std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
218 KernelArgPoolMap;
219
220/// Use a single entity to encode a kernel and a set of flags
221struct KernelTy {
222 llvm::omp::OMPTgtExecModeFlags ExecutionMode;
223 int16_t ConstWGSize;
224 int32_t DeviceId;
225 void *CallStackAddr = nullptr;
226 const char *Name;
227
228 KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
229 int32_t DeviceId, void *CallStackAddr, const char *Name,
230 uint32_t KernargSegmentSize,
231 hsa_amd_memory_pool_t &KernArgMemoryPool)
232 : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
233 DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
234 DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode){};
235
236 std::string N(Name);
237 if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
238 KernelArgPoolMap.insert(
239 std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
240 KernargSegmentSize, KernArgMemoryPool))));
241 }
242 }
243};
244
245/// List that contains all the kernels.
246/// FIXME: we may need this to be per device and per library.
247std::list<KernelTy> KernelsList;
248
249template <typename Callback> static hsa_status_t findAgents(Callback CB) {
250
251 hsa_status_t Err =
252 hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t {
253 hsa_device_type_t DeviceType;
254 // get_info fails iff HSA runtime not yet initialized
255 hsa_status_t Err =
256 hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
257
258 if (Err != HSA_STATUS_SUCCESS) {
259 if (print_kernel_trace > 0)
260 DP("rtl.cpp: err %s\n", get_error_string(Err)){};
261
262 return Err;
263 }
264
265 CB(DeviceType, Agent);
266 return HSA_STATUS_SUCCESS;
267 });
268
269 // iterate_agents fails iff HSA runtime not yet initialized
270 if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) {
271 DP("rtl.cpp: err %s\n", get_error_string(Err)){};
272 }
273
274 return Err;
275}
276
277static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source,
278 void *Data) {
279 if (Status != HSA_STATUS_SUCCESS) {
280 const char *StatusString;
281 if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) {
282 StatusString = "unavailable";
283 }
284 DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source,{}
285 Status, StatusString){};
286 abort();
287 }
288}
289
290namespace core {
291namespace {
292
293bool checkResult(hsa_status_t Err, const char *ErrMsg) {
294 if (Err == HSA_STATUS_SUCCESS)
295 return true;
296
297 REPORT("%s", ErrMsg);
298 REPORT("%s", get_error_string(Err));
299 return false;
300}
301
302void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) {
303 __atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE);
304}
305
306uint16_t createHeader() {
307 uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
308 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
309 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
310 return Header;
311}
312
313hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) {
314 bool AllocAllowed = false;
315 hsa_status_t Err = hsa_amd_memory_pool_get_info(
316 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
317 &AllocAllowed);
318 if (Err != HSA_STATUS_SUCCESS) {
319 DP("Alloc allowed in memory pool check failed: %s\n",{}
320 get_error_string(Err)){};
321 return Err;
322 }
323
324 size_t Size = 0;
325 Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
326 &Size);
327 if (Err != HSA_STATUS_SUCCESS) {
328 DP("Get memory pool size failed: %s\n", get_error_string(Err)){};
329 return Err;
330 }
331
332 return (AllocAllowed && Size > 0) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
333}
334
335hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
336 std::vector<hsa_amd_memory_pool_t> *Result =
337 static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data);
338
339 hsa_status_t Err;
340 if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) {
341 return Err;
342 }
343
344 Result->push_back(MemoryPool);
345 return HSA_STATUS_SUCCESS;
346}
347
348} // namespace
349} // namespace core
350
351struct EnvironmentVariables {
352 int NumTeams;
353 int TeamLimit;
354 int TeamThreadLimit;
355 int MaxTeamsDefault;
356 int DynamicMemSize;
357};
358
359template <uint32_t wavesize>
360static constexpr const llvm::omp::GV &getGridValue() {
361 return llvm::omp::getAMDGPUGridValues<wavesize>();
362}
363
364struct HSALifetime {
365 // Wrapper around HSA used to ensure it is constructed before other types
366 // and destructed after, which means said other types can use raii for
367 // cleanup without risking running outside of the lifetime of HSA
368 const hsa_status_t S;
369
370 bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; }
371 HSALifetime() : S(hsa_init()) {}
372
373 ~HSALifetime() {
374 if (S == HSA_STATUS_SUCCESS) {
375 hsa_status_t Err = hsa_shut_down();
376 if (Err != HSA_STATUS_SUCCESS) {
377 // Can't call into HSA to get a string from the integer
378 DP("Shutting down HSA failed: %d\n", Err){};
379 }
380 }
381 }
382};
383
384// Handle scheduling of multiple hsa_queue's per device to
385// multiple threads (one scheduler per device)
386class HSAQueueScheduler {
387public:
388 HSAQueueScheduler() : Current(0) {}
389
390 HSAQueueScheduler(const HSAQueueScheduler &) = delete;
391
392 HSAQueueScheduler(HSAQueueScheduler &&Q) {
393 Current = Q.Current.load();
394 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
395 HSAQueues[I] = Q.HSAQueues[I];
396 Q.HSAQueues[I] = nullptr;
397 }
398 }
399
400 // \return false if any HSA queue creation fails
401 bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) {
402 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
403 hsa_queue_t *Q = nullptr;
404 hsa_status_t Rc =
405 hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI,
406 callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
407 if (Rc != HSA_STATUS_SUCCESS) {
408 DP("Failed to create HSA queue %d\n", I){};
409 return false;
410 }
411 HSAQueues[I] = Q;
412 }
413 return true;
414 }
415
416 ~HSAQueueScheduler() {
417 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
418 if (HSAQueues[I]) {
419 hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]);
420 if (Err != HSA_STATUS_SUCCESS)
421 DP("Error destroying HSA queue"){};
422 }
423 }
424 }
425
426 // \return next queue to use for device
427 hsa_queue_t *next() {
428 return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) %
429 NUM_QUEUES_PER_DEVICE];
430 }
431
432private:
433 // Number of queues per device
434 enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 };
435 hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {};
436 std::atomic<uint8_t> Current;
437};
438
439/// Class containing all the device information
440class RTLDeviceInfoTy : HSALifetime {
441 std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
442
443 struct QueueDeleter {
444 void operator()(hsa_queue_t *Q) {
445 if (Q) {
446 hsa_status_t Err = hsa_queue_destroy(Q);
447 if (Err != HSA_STATUS_SUCCESS) {
448 DP("Error destroying hsa queue: %s\n", get_error_string(Err)){};
449 }
450 }
451 }
452 };
453
454public:
455 bool ConstructionSucceeded = false;
456
457 // load binary populates symbol tables and mutates various global state
458 // run uses those symbol tables
459 std::shared_timed_mutex LoadRunLock;
460
461 int NumberOfDevices = 0;
462
463 // GPU devices
464 std::vector<hsa_agent_t> HSAAgents;
465 std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu
466
467 // CPUs
468 std::vector<hsa_agent_t> CPUAgents;
469
470 // Device properties
471 std::vector<int> ComputeUnits;
472 std::vector<int> GroupsPerDevice;
473 std::vector<int> ThreadsPerGroup;
474 std::vector<int> WarpSize;
475 std::vector<std::string> GPUName;
476 std::vector<std::string> TargetID;
477
478 // OpenMP properties
479 std::vector<int> NumTeams;
480 std::vector<int> NumThreads;
481
482 // OpenMP Environment properties
483 EnvironmentVariables Env;
484
485 // OpenMP Requires Flags
486 int64_t RequiresFlags;
487
488 // Resource pools
489 SignalPoolT FreeSignalPool;
490
491 bool HostcallRequired = false;
492
493 std::vector<hsa_executable_t> HSAExecutables;
494
495 std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
496 std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
497
498 hsa_amd_memory_pool_t KernArgPool;
499
500 // fine grained memory pool for host allocations
501 hsa_amd_memory_pool_t HostFineGrainedMemoryPool;
502
503 // fine and coarse-grained memory pools per offloading device
504 std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools;
505 std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools;
506
507 struct ImplFreePtrDeletor {
508 void operator()(void *P) {
509 core::Runtime::Memfree(P); // ignore failure to free
510 }
511 };
512
513 // device_State shared across loaded binaries, error if inconsistent size
514 std::vector<std::pair<std::unique_ptr<void, ImplFreePtrDeletor>, uint64_t>>
515 DeviceStateStore;
516
517 static const unsigned HardTeamLimit =
518 (1 << 16) - 1; // 64K needed to fit in uint16
519 static const int DefaultNumTeams = 128;
520
521 // These need to be per-device since different devices can have different
522 // wave sizes, but are currently the same number for each so that refactor
523 // can be postponed.
524 static_assert(getGridValue<32>().GV_Max_Teams ==
525 getGridValue<64>().GV_Max_Teams,
526 "");
527 static const int MaxTeams = getGridValue<64>().GV_Max_Teams;
528
529 static_assert(getGridValue<32>().GV_Max_WG_Size ==
530 getGridValue<64>().GV_Max_WG_Size,
531 "");
532 static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size;
533
534 static_assert(getGridValue<32>().GV_Default_WG_Size ==
535 getGridValue<64>().GV_Default_WG_Size,
536 "");
537 static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size;
538
539 using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size,
540 hsa_agent_t, hsa_amd_memory_pool_t);
541 hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size,
542 MemcpyFunc Func, int32_t DeviceId) {
543 hsa_agent_t Agent = HSAAgents[DeviceId];
544 hsa_signal_t S = FreeSignalPool.pop();
545 if (S.handle == 0) {
546 return HSA_STATUS_ERROR;
547 }
548 hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool);
549 FreeSignalPool.push(S);
550 return R;
551 }
552
553 hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size,
554 int32_t DeviceId) {
555 return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId);
556 }
557
558 hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size,
559 int32_t DeviceId) {
560 return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId);
561 }
562
563 static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) {
564 char TmpChar[1000];
565 uint16_t Major, Minor;
566 uint32_t TmpUInt;
567 uint32_t TmpUInt2;
568 uint32_t CacheSize[4];
569 bool TmpBool;
570 uint16_t WorkgroupMaxDim[3];
571 hsa_dim3_t GridMaxDim;
572
573 // Getting basic information about HSA and Device
574 core::checkResult(
575 hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major),
576 "Error from hsa_system_get_info when obtaining "
577 "HSA_SYSTEM_INFO_VERSION_MAJOR\n");
578 core::checkResult(
579 hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor),
580 "Error from hsa_system_get_info when obtaining "
581 "HSA_SYSTEM_INFO_VERSION_MINOR\n");
582 printf(" HSA Runtime Version: \t\t%u.%u \n", Major, Minor);
583 printf(" HSA OpenMP Device Number: \t\t%d \n", DeviceId);
584 core::checkResult(
585 hsa_agent_get_info(
586 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar),
587 "Error returned from hsa_agent_get_info when obtaining "
588 "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n");
589 printf(" Product Name: \t\t\t%s \n", TmpChar);
590 core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar),
591 "Error returned from hsa_agent_get_info when obtaining "
592 "HSA_AGENT_INFO_NAME\n");
593 printf(" Device Name: \t\t\t%s \n", TmpChar);
594 core::checkResult(
595 hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar),
596 "Error returned from hsa_agent_get_info when obtaining "
597 "HSA_AGENT_INFO_NAME\n");
598 printf(" Vendor Name: \t\t\t%s \n", TmpChar);
599 hsa_device_type_t DevType;
600 core::checkResult(
601 hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType),
602 "Error returned from hsa_agent_get_info when obtaining "
603 "HSA_AGENT_INFO_DEVICE\n");
604 printf(" Device Type: \t\t\t%s \n",
605 DevType == HSA_DEVICE_TYPE_CPU
606 ? "CPU"
607 : (DevType == HSA_DEVICE_TYPE_GPU
608 ? "GPU"
609 : (DevType == HSA_DEVICE_TYPE_DSP ? "DSP" : "UNKNOWN")));
610 core::checkResult(
611 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt),
612 "Error returned from hsa_agent_get_info when obtaining "
613 "HSA_AGENT_INFO_QUEUES_MAX\n");
614 printf(" Max Queues: \t\t\t%u \n", TmpUInt);
615 core::checkResult(
616 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt),
617 "Error returned from hsa_agent_get_info when obtaining "
618 "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n");
619 printf(" Queue Min Size: \t\t\t%u \n", TmpUInt);
620 core::checkResult(
621 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt),
622 "Error returned from hsa_agent_get_info when obtaining "
623 "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n");
624 printf(" Queue Max Size: \t\t\t%u \n", TmpUInt);
625
626 // Getting cache information
627 printf(" Cache:\n");
628
629 // FIXME: This is deprecated according to HSA documentation. But using
630 // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
631 // runtime.
632 core::checkResult(
633 hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize),
634 "Error returned from hsa_agent_get_info when obtaining "
635 "HSA_AGENT_INFO_CACHE_SIZE\n");
636
637 for (int I = 0; I < 4; I++) {
638 if (CacheSize[I]) {
639 printf(" L%u: \t\t\t\t%u bytes\n", I, CacheSize[I]);
640 }
641 }
642
643 core::checkResult(
644 hsa_agent_get_info(Agent,
645 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
646 &TmpUInt),
647 "Error returned from hsa_agent_get_info when obtaining "
648 "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n");
649 printf(" Cacheline Size: \t\t\t%u \n", TmpUInt);
650 core::checkResult(
651 hsa_agent_get_info(
652 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY,
653 &TmpUInt),
654 "Error returned from hsa_agent_get_info when obtaining "
655 "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n");
656 printf(" Max Clock Freq(MHz): \t\t%u \n", TmpUInt);
657 core::checkResult(
658 hsa_agent_get_info(
659 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
660 &TmpUInt),
661 "Error returned from hsa_agent_get_info when obtaining "
662 "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n");
663 printf(" Compute Units: \t\t\t%u \n", TmpUInt);
664 core::checkResult(hsa_agent_get_info(
665 Agent,
666 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU,
667 &TmpUInt),
668 "Error returned from hsa_agent_get_info when obtaining "
669 "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n");
670 printf(" SIMD per CU: \t\t\t%u \n", TmpUInt);
671 core::checkResult(
672 hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool),
673 "Error returned from hsa_agent_get_info when obtaining "
674 "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n");
675 printf(" Fast F16 Operation: \t\t%s \n", (TmpBool ? "TRUE" : "FALSE"));
676 core::checkResult(
677 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2),
678 "Error returned from hsa_agent_get_info when obtaining "
679 "HSA_AGENT_INFO_WAVEFRONT_SIZE\n");
680 printf(" Wavefront Size: \t\t\t%u \n", TmpUInt2);
681 core::checkResult(
682 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt),
683 "Error returned from hsa_agent_get_info when obtaining "
684 "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n");
685 printf(" Workgroup Max Size: \t\t%u \n", TmpUInt);
686 core::checkResult(hsa_agent_get_info(Agent,
687 HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
688 WorkgroupMaxDim),
689 "Error returned from hsa_agent_get_info when obtaining "
690 "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n");
691 printf(" Workgroup Max Size per Dimension:\n");
692 printf(" x: \t\t\t\t%u\n", WorkgroupMaxDim[0]);
693 printf(" y: \t\t\t\t%u\n", WorkgroupMaxDim[1]);
694 printf(" z: \t\t\t\t%u\n", WorkgroupMaxDim[2]);
695 core::checkResult(hsa_agent_get_info(
696 Agent,
697 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU,
698 &TmpUInt),
699 "Error returned from hsa_agent_get_info when obtaining "
700 "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n");
701 printf(" Max Waves Per CU: \t\t\t%u \n", TmpUInt);
702 printf(" Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2);
703 core::checkResult(
704 hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt),
705 "Error returned from hsa_agent_get_info when obtaining "
706 "HSA_AGENT_INFO_GRID_MAX_SIZE\n");
707 printf(" Grid Max Size: \t\t\t%u \n", TmpUInt);
708 core::checkResult(
709 hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim),
710 "Error returned from hsa_agent_get_info when obtaining "
711 "HSA_AGENT_INFO_GRID_MAX_DIM\n");
712 printf(" Grid Max Size per Dimension: \t\t\n");
713 printf(" x: \t\t\t\t%u\n", GridMaxDim.x);
714 printf(" y: \t\t\t\t%u\n", GridMaxDim.y);
715 printf(" z: \t\t\t\t%u\n", GridMaxDim.z);
716 core::checkResult(
717 hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt),
718 "Error returned from hsa_agent_get_info when obtaining "
719 "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n");
720 printf(" Max fbarriers/Workgrp: \t\t%u\n", TmpUInt);
721
722 printf(" Memory Pools:\n");
723 auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t {
724 std::string TmpStr;
725 size_t Size;
726 bool Alloc, Access;
727 hsa_amd_segment_t Segment;
728 hsa_amd_memory_pool_global_flag_t GlobalFlags;
729 core::checkResult(
730 hsa_amd_memory_pool_get_info(
731 Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags),
732 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
733 "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n");
734 core::checkResult(hsa_amd_memory_pool_get_info(
735 Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment),
736 "Error returned from hsa_amd_memory_pool_get_info when "
737 "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n");
738
739 switch (Segment) {
740 case HSA_AMD_SEGMENT_GLOBAL:
741 TmpStr = "GLOBAL; FLAGS: ";
742 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags)
743 TmpStr += "KERNARG, ";
744 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags)
745 TmpStr += "FINE GRAINED, ";
746 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags)
747 TmpStr += "COARSE GRAINED, ";
748 break;
749 case HSA_AMD_SEGMENT_READONLY:
750 TmpStr = "READONLY";
751 break;
752 case HSA_AMD_SEGMENT_PRIVATE:
753 TmpStr = "PRIVATE";
754 break;
755 case HSA_AMD_SEGMENT_GROUP:
756 TmpStr = "GROUP";
757 break;
758 }
759 printf(" Pool %s: \n", TmpStr.c_str());
760
761 core::checkResult(hsa_amd_memory_pool_get_info(
762 Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size),
763 "Error returned from hsa_amd_memory_pool_get_info when "
764 "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n");
765 printf(" Size: \t\t\t\t %zu bytes\n", Size);
766 core::checkResult(
767 hsa_amd_memory_pool_get_info(
768 Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc),
769 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
770 "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n");
771 printf(" Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE"));
772 core::checkResult(
773 hsa_amd_memory_pool_get_info(
774 Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size),
775 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
776 "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n");
777 printf(" Runtime Alloc Granule: \t\t %zu bytes\n", Size);
778 core::checkResult(
779 hsa_amd_memory_pool_get_info(
780 Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size),
781 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
782 "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n");
783 printf(" Runtime Alloc alignment: \t %zu bytes\n", Size);
784 core::checkResult(
785 hsa_amd_memory_pool_get_info(
786 Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access),
787 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
788 "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n");
789 printf(" Accessable by all: \t\t %s\n",
790 (Access ? "TRUE" : "FALSE"));
791
792 return HSA_STATUS_SUCCESS;
793 };
794 // Iterate over all the memory regions for this agent. Get the memory region
795 // type and size
796 hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr);
797
798 printf(" ISAs:\n");
799 auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t {
800 char TmpChar[1000];
801 core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar),
802 "Error returned from hsa_isa_get_info_alt when "
803 "obtaining HSA_ISA_INFO_NAME\n");
804 printf(" Name: \t\t\t\t %s\n", TmpChar);
805
806 return HSA_STATUS_SUCCESS;
807 };
808 // Iterate over all the memory regions for this agent. Get the memory region
809 // type and size
810 hsa_agent_iterate_isas(Agent, CBIsas, nullptr);
811 }
812
813 // Record entry point associated with device
814 void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) {
815 assert(DeviceId < (int32_t)FuncGblEntries.size() &&
816 "Unexpected device id!");
817 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
818
819 E.Entries.push_back(Entry);
820 }
821
822 // Return true if the entry is associated with device
823 bool findOffloadEntry(int32_t DeviceId, void *Addr) {
824 assert(DeviceId < (int32_t)FuncGblEntries.size() &&
825 "Unexpected device id!");
826 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
827
828 for (auto &It : E.Entries) {
829 if (It.addr == Addr)
830 return true;
831 }
832
833 return false;
834 }
835
836 // Return the pointer to the target entries table
837 __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) {
838 assert(DeviceId < (int32_t)FuncGblEntries.size() &&
839 "Unexpected device id!");
840 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
841
842 int32_t Size = E.Entries.size();
843
844 // Table is empty
845 if (!Size)
846 return 0;
847
848 __tgt_offload_entry *Begin = &E.Entries[0];
849 __tgt_offload_entry *End = &E.Entries[Size - 1];
850
851 // Update table info according to the entries and return the pointer
852 E.Table.EntriesBegin = Begin;
853 E.Table.EntriesEnd = ++End;
854
855 return &E.Table;
856 }
857
858 // Clear entries table for a device
859 void clearOffloadEntriesTable(int DeviceId) {
860 assert(DeviceId < (int32_t)FuncGblEntries.size() &&
861 "Unexpected device id!");
862 FuncGblEntries[DeviceId].emplace_back();
863 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
864 // KernelArgPoolMap.clear();
865 E.Entries.clear();
866 E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
867 }
868
869 hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
870 unsigned int DeviceId) {
871 assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here.");
872 uint32_t GlobalFlags = 0;
873 hsa_status_t Err = hsa_amd_memory_pool_get_info(
874 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
875
876 if (Err != HSA_STATUS_SUCCESS) {
877 return Err;
878 }
879
880 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
881 DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool;
882 } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
883 DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
884 }
885
886 return HSA_STATUS_SUCCESS;
887 }
888
889 hsa_status_t setupDevicePools(const std::vector<hsa_agent_t> &Agents) {
890 for (unsigned int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
891 hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
892 Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
893 hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool);
894 if (ValidStatus != HSA_STATUS_SUCCESS) {
895 DP("Alloc allowed in memory pool check failed: %s\n",{}
896 get_error_string(ValidStatus)){};
897 return HSA_STATUS_SUCCESS;
898 }
899 return addDeviceMemoryPool(MemoryPool, DeviceId);
900 });
901
902 if (Err != HSA_STATUS_SUCCESS) {
903 DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,{}
904 "Iterate all memory pools", get_error_string(Err)){};
905 return Err;
906 }
907 }
908 return HSA_STATUS_SUCCESS;
909 }
910
911 hsa_status_t setupHostMemoryPools(std::vector<hsa_agent_t> &Agents) {
912 std::vector<hsa_amd_memory_pool_t> HostPools;
913
914 // collect all the "valid" pools for all the given agents.
915 for (const auto &Agent : Agents) {
916 hsa_status_t Err = hsa_amd_agent_iterate_memory_pools(
917 Agent, core::addMemoryPool, static_cast<void *>(&HostPools));
918 if (Err != HSA_STATUS_SUCCESS) {
919 DP("addMemoryPool returned %s, continuing\n", get_error_string(Err)){};
920 }
921 }
922
923 // We need two fine-grained pools.
924 // 1. One with kernarg flag set for storing kernel arguments
925 // 2. Second for host allocations
926 bool FineGrainedMemoryPoolSet = false;
927 bool KernArgPoolSet = false;
928 for (const auto &MemoryPool : HostPools) {
929 hsa_status_t Err = HSA_STATUS_SUCCESS;
930 uint32_t GlobalFlags = 0;
931 Err = hsa_amd_memory_pool_get_info(
932 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
933 if (Err != HSA_STATUS_SUCCESS) {
934 DP("Get memory pool info failed: %s\n", get_error_string(Err)){};
935 return Err;
936 }
937
938 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
939 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
940 KernArgPool = MemoryPool;
941 KernArgPoolSet = true;
942 }
943 HostFineGrainedMemoryPool = MemoryPool;
944 FineGrainedMemoryPoolSet = true;
945 }
946 }
947
948 if (FineGrainedMemoryPoolSet && KernArgPoolSet)
949 return HSA_STATUS_SUCCESS;
950
951 return HSA_STATUS_ERROR;
952 }
953
954 hsa_amd_memory_pool_t getDeviceMemoryPool(unsigned int DeviceId) {
955 assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() &&
956 "Invalid device Id");
957 return DeviceCoarseGrainedMemoryPools[DeviceId];
958 }
959
960 hsa_amd_memory_pool_t getHostMemoryPool() {
961 return HostFineGrainedMemoryPool;
962 }
963
964 static int readEnv(const char *Env, int Default = -1) {
965 const char *EnvStr = getenv(Env);
966 int Res = Default;
967 if (EnvStr) {
968 Res = std::stoi(EnvStr);
969 DP("Parsed %s=%d\n", Env, Res){};
970 }
971 return Res;
972 }
973
974 RTLDeviceInfoTy() {
975 DP("Start initializing " GETNAME(TARGET_NAME) "\n"){};
976
977 // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr
978 // anytime. You do not need a debug library build.
979 // 0 => no tracing
980 // 1 => tracing dispatch only
981 // >1 => verbosity increase
982
983 if (!HSAInitSuccess()) {
984 DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n"){};
985 return;
986 }
987
988 if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE"))
989 print_kernel_trace = atoi(EnvStr);
990 else
991 print_kernel_trace = 0;
992
993 hsa_status_t Err = core::atl_init_gpu_context();
994 if (Err != HSA_STATUS_SUCCESS) {
995 DP("Error when initializing " GETNAME(TARGET_NAME) "\n"){};
996 return;
997 }
998
999 // Init hostcall soon after initializing hsa
1000 hostrpc_init();
1001
1002 Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) {
1003 if (DeviceType == HSA_DEVICE_TYPE_CPU) {
1004 CPUAgents.push_back(Agent);
1005 } else {
1006 HSAAgents.push_back(Agent);
1007 }
1008 });
1009 if (Err != HSA_STATUS_SUCCESS)
1010 return;
1011
1012 NumberOfDevices = (int)HSAAgents.size();
1013
1014 if (NumberOfDevices == 0) {
1015 DP("There are no devices supporting HSA.\n"){};
1016 return;
1017 }
1018 DP("There are %d devices supporting HSA.\n", NumberOfDevices){};
1019
1020 // Init the device info
1021 HSAQueueSchedulers.reserve(NumberOfDevices);
1022 FuncGblEntries.resize(NumberOfDevices);
1023 ThreadsPerGroup.resize(NumberOfDevices);
1024 ComputeUnits.resize(NumberOfDevices);
1025 GPUName.resize(NumberOfDevices);
1026 GroupsPerDevice.resize(NumberOfDevices);
1027 WarpSize.resize(NumberOfDevices);
1028 NumTeams.resize(NumberOfDevices);
1029 NumThreads.resize(NumberOfDevices);
1030 DeviceStateStore.resize(NumberOfDevices);
1031 KernelInfoTable.resize(NumberOfDevices);
1032 SymbolInfoTable.resize(NumberOfDevices);
1033 DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
1034 DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
1035
1036 Err = setupDevicePools(HSAAgents);
1037 if (Err != HSA_STATUS_SUCCESS) {
1038 DP("Setup for Device Memory Pools failed\n"){};
1039 return;
1040 }
1041
1042 Err = setupHostMemoryPools(CPUAgents);
1043 if (Err != HSA_STATUS_SUCCESS) {
1044 DP("Setup for Host Memory Pools failed\n"){};
1045 return;
1046 }
1047
1048 for (int I = 0; I < NumberOfDevices; I++) {
1049 uint32_t QueueSize = 0;
1050 {
1051 hsa_status_t Err = hsa_agent_get_info(
1052 HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize);
1053 if (Err != HSA_STATUS_SUCCESS) {
1054 DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I){};
1055 return;
1056 }
1057 enum { MaxQueueSize = 4096 };
1058 if (QueueSize > MaxQueueSize) {
1059 QueueSize = MaxQueueSize;
1060 }
1061 }
1062
1063 {
1064 HSAQueueScheduler QSched;
1065 if (!QSched.createQueues(HSAAgents[I], QueueSize))
1066 return;
1067 HSAQueueSchedulers.emplace_back(std::move(QSched));
1068 }
1069
1070 DeviceStateStore[I] = {nullptr, 0};
1071 }
1072
1073 for (int I = 0; I < NumberOfDevices; I++) {
1074 ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize;
1075 GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams;
1076 ComputeUnits[I] = 1;
1077 DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I,{}
1078 GroupsPerDevice[I], ThreadsPerGroup[I]){};
1079 }
1080
1081 // Get environment variables regarding teams
1082 Env.TeamLimit = readEnv("OMP_TEAM_LIMIT");
1083 Env.NumTeams = readEnv("OMP_NUM_TEAMS");
1084 Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT");
1085 Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT");
1086 Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0);
1087
1088 // Default state.
1089 RequiresFlags = OMP_REQ_UNDEFINED;
1090
1091 ConstructionSucceeded = true;
1092 }
1093
1094 ~RTLDeviceInfoTy() {
1095 DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n"){};
1096 if (!HSAInitSuccess()) {
1097 // Then none of these can have been set up and they can't be torn down
1098 return;
1099 }
1100 // Run destructors on types that use HSA before
1101 // impl_finalize removes access to it
1102 DeviceStateStore.clear();
1103 KernelArgPoolMap.clear();
1104 // Terminate hostrpc before finalizing hsa
1105 hostrpc_terminate();
1106
1107 hsa_status_t Err;
1108 for (uint32_t I = 0; I < HSAExecutables.size(); I++) {
1109 Err = hsa_executable_destroy(HSAExecutables[I]);
1110 if (Err != HSA_STATUS_SUCCESS) {
1111 DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,{}
1112 "Destroying executable", get_error_string(Err)){};
1113 }
1114 }
1115 }
1116};
1117
1118pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
1119
1120// Putting accesses to DeviceInfo global behind a function call prior
1121// to changing to use init_plugin/deinit_plugin calls
1122static RTLDeviceInfoTy DeviceInfoState;
1123static RTLDeviceInfoTy &DeviceInfo() { return DeviceInfoState; }
1124
1125namespace {
1126
1127int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
1128 __tgt_async_info *AsyncInfo) {
1129 assert(AsyncInfo && "AsyncInfo is nullptr");
1130 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
1131 // Return success if we are not copying back to host from target.
1132 if (!HstPtr)
1133 return OFFLOAD_SUCCESS(0);
1134 hsa_status_t Err;
1135 DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,{}
1136 (long long unsigned)(Elf64_Addr)TgtPtr,{}
1137 (long long unsigned)(Elf64_Addr)HstPtr){};
1138
1139 Err = DeviceInfo().freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size,
1140 DeviceId);
1141
1142 if (Err != HSA_STATUS_SUCCESS) {
1143 DP("Error when copying data from device to host. Pointers: "{}
1144 "host = 0x%016lx, device = 0x%016lx, size = %lld\n",{}
1145 (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size){};
1146 return OFFLOAD_FAIL(~0);
1147 }
1148 DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,{}
1149 (long long unsigned)(Elf64_Addr)TgtPtr,{}
1150 (long long unsigned)(Elf64_Addr)HstPtr){};
1151 return OFFLOAD_SUCCESS(0);
1152}
1153
1154int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
1155 __tgt_async_info *AsyncInfo) {
1156 assert(AsyncInfo && "AsyncInfo is nullptr");
1157 hsa_status_t Err;
1158 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
1159 // Return success if we are not doing host to target.
1160 if (!HstPtr)
1161 return OFFLOAD_SUCCESS(0);
1162
1163 DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,{}
1164 (long long unsigned)(Elf64_Addr)HstPtr,{}
1165 (long long unsigned)(Elf64_Addr)TgtPtr){};
1166 Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size,
1167 DeviceId);
1168 if (Err != HSA_STATUS_SUCCESS) {
1169 DP("Error when copying data from host to device. Pointers: "{}
1170 "host = 0x%016lx, device = 0x%016lx, size = %lld\n",{}
1171 (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size){};
1172 return OFFLOAD_FAIL(~0);
1173 }
1174 return OFFLOAD_SUCCESS(0);
1175}
1176
1177// Async.
1178// The implementation was written with cuda streams in mind. The semantics of
1179// that are to execute kernels on a queue in order of insertion. A synchronise
1180// call then makes writes visible between host and device. This means a series
1181// of N data_submit_async calls are expected to execute serially. HSA offers
1182// various options to run the data copies concurrently. This may require changes
1183// to libomptarget.
1184
1185// __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that
1186// there are no outstanding kernels that need to be synchronized. Any async call
1187// may be passed a Queue==0, at which point the cuda implementation will set it
1188// to non-null (see getStream). The cuda streams are per-device. Upstream may
1189// change this interface to explicitly initialize the AsyncInfo_pointer, but
1190// until then hsa lazily initializes it as well.
1191
1192void initAsyncInfo(__tgt_async_info *AsyncInfo) {
1193 // set non-null while using async calls, return to null to indicate completion
1194 assert(AsyncInfo);
1195 if (!AsyncInfo->Queue) {
1196 AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX(18446744073709551615UL));
1197 }
1198}
1199void finiAsyncInfo(__tgt_async_info *AsyncInfo) {
1200 assert(AsyncInfo);
1201 assert(AsyncInfo->Queue);
1202 AsyncInfo->Queue = 0;
1203}
1204
1205// Determine launch values for kernel.
1206struct LaunchVals {
1207 int WorkgroupSize;
1208 int GridSize;
1209};
1210LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env,
1211 int ConstWGSize,
1212 llvm::omp::OMPTgtExecModeFlags ExecutionMode,
1213 int NumTeams, int ThreadLimit, uint64_t LoopTripcount,
1214 int DeviceNumTeams) {
1215
1216 int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize;
1217 int NumGroups = 0;
1218
1219 int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams;
1220 if (MaxTeams > static_cast<int>(RTLDeviceInfoTy::HardTeamLimit))
1221 MaxTeams = RTLDeviceInfoTy::HardTeamLimit;
1222
1223 if (print_kernel_trace & STARTUP_DETAILS) {
1224 DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams){};
1225 DP("Max_Teams: %d\n", MaxTeams){};
1226 DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize){};
1227 DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize){};
1228 DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",{}
1229 RTLDeviceInfoTy::DefaultWgSize){};
1230 DP("thread_limit: %d\n", ThreadLimit){};
1231 DP("threadsPerGroup: %d\n", ThreadsPerGroup){};
1232 DP("ConstWGSize: %d\n", ConstWGSize){};
1233 }
1234 // check for thread_limit() clause
1235 if (ThreadLimit > 0) {
1236 ThreadsPerGroup = ThreadLimit;
1237 DP("Setting threads per block to requested %d\n", ThreadLimit){};
1238 // Add master warp for GENERIC
1239 if (ExecutionMode ==
1240 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
1241 ThreadsPerGroup += WarpSize;
1242 DP("Adding master wavefront: +%d threads\n", WarpSize){};
1243 }
1244 if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max
1245 ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize;
1246 DP("Setting threads per block to maximum %d\n", ThreadsPerGroup){};
1247 }
1248 }
1249 // check flat_max_work_group_size attr here
1250 if (ThreadsPerGroup > ConstWGSize) {
1251 ThreadsPerGroup = ConstWGSize;
1252 DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",{}
1253 ThreadsPerGroup){};
1254 }
1255 if (print_kernel_trace & STARTUP_DETAILS)
1256 DP("threadsPerGroup: %d\n", ThreadsPerGroup){};
1257 DP("Preparing %d threads\n", ThreadsPerGroup){};
1258
1259 // Set default num_groups (teams)
1260 if (Env.TeamLimit > 0)
1261 NumGroups = (MaxTeams < Env.TeamLimit) ? MaxTeams : Env.TeamLimit;
1262 else
1263 NumGroups = MaxTeams;
1264 DP("Set default num of groups %d\n", NumGroups){};
1265
1266 if (print_kernel_trace & STARTUP_DETAILS) {
1267 DP("num_groups: %d\n", NumGroups){};
1268 DP("num_teams: %d\n", NumTeams){};
1269 }
1270
1271 // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size
1272 // This reduction is typical for default case (no thread_limit clause).
1273 // or when user goes crazy with num_teams clause.
1274 // FIXME: We cant distinguish between a constant or variable thread limit.
1275 // So we only handle constant thread_limits.
1276 if (ThreadsPerGroup >
1277 RTLDeviceInfoTy::DefaultWgSize) // 256 < threadsPerGroup <= 1024
1278 // Should we round threadsPerGroup up to nearest WarpSize
1279 // here?
1280 NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup;
1281
1282 // check for num_teams() clause
1283 if (NumTeams > 0) {
1284 NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups;
1285 }
1286 if (print_kernel_trace & STARTUP_DETAILS) {
1287 DP("num_groups: %d\n", NumGroups){};
1288 DP("Env.NumTeams %d\n", Env.NumTeams){};
1289 DP("Env.TeamLimit %d\n", Env.TeamLimit){};
1290 }
1291
1292 if (Env.NumTeams > 0) {
1293 NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups;
1294 DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams){};
1295 } else if (Env.TeamLimit > 0) {
1296 NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups;
1297 DP("Modifying teams based on Env.TeamLimit%d\n", Env.TeamLimit){};
1298 } else {
1299 if (NumTeams <= 0) {
1300 if (LoopTripcount > 0) {
1301 if (ExecutionMode ==
1302 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) {
1303 // round up to the nearest integer
1304 NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1;
1305 } else if (ExecutionMode ==
1306 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
1307 NumGroups = LoopTripcount;
1308 } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ {
1309 // This is a generic kernel that was transformed to use SPMD-mode
1310 // execution but uses Generic-mode semantics for scheduling.
1311 NumGroups = LoopTripcount;
1312 }
1313 DP("Using %d teams due to loop trip count %" PRIu64 " and number of "{}
1314 "threads per block %d\n",{}
1315 NumGroups, LoopTripcount, ThreadsPerGroup){};
1316 }
1317 } else {
1318 NumGroups = NumTeams;
1319 }
1320 if (NumGroups > MaxTeams) {
1321 NumGroups = MaxTeams;
1322 if (print_kernel_trace & STARTUP_DETAILS)
1323 DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams){};
1324 }
1325 if (NumGroups > NumTeams && NumTeams > 0) {
1326 NumGroups = NumTeams;
1327 if (print_kernel_trace & STARTUP_DETAILS)
1328 DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups,{}
1329 NumTeams){};
1330 }
1331 }
1332
1333 // num_teams clause always honored, no matter what, unless DEFAULT is active.
1334 if (NumTeams > 0) {
1335 NumGroups = NumTeams;
1336 // Cap num_groups to EnvMaxTeamsDefault if set.
1337 if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault)
1338 NumGroups = Env.MaxTeamsDefault;
1339 }
1340 if (print_kernel_trace & STARTUP_DETAILS) {
1341 DP("threadsPerGroup: %d\n", ThreadsPerGroup){};
1342 DP("num_groups: %d\n", NumGroups){};
1343 DP("loop_tripcount: %ld\n", LoopTripcount){};
1344 }
1345 DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups,{}
1346 ThreadsPerGroup){};
1347
1348 LaunchVals Res;
1349 Res.WorkgroupSize = ThreadsPerGroup;
1350 Res.GridSize = ThreadsPerGroup * NumGroups;
1351 return Res;
1352}
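A standalone sketch, not part of rtl.cpp, of the SPMD trip-count rounding used in getLaunchVals above; the trip count and workgroup size are assumed values for the example.

#include <cstdint>
#include <cstdio>

int main() {
  uint64_t LoopTripcount = 1000; // assumed trip count
  int ThreadsPerGroup = 256;     // assumed workgroup size
  // Same rounding as the SPMD branch above: one work-item per iteration,
  // rounded up to whole workgroups.
  int NumGroups = static_cast<int>(((LoopTripcount - 1) / ThreadsPerGroup) + 1);
  // As in LaunchVals, the grid size is expressed in work-items, not groups.
  int GridSize = ThreadsPerGroup * NumGroups;
  printf("num_groups=%d grid_size=%d\n", NumGroups, GridSize); // 4 and 1024
  return 0;
}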
1353
1354static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) {
1355 uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);
1356 bool Full = true;
1357 while (Full) {
1358 Full =
1359 PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue));
1360 }
1361 return PacketId;
1362}
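The busy-wait above enforces the HSA user-mode queue invariant that at most Queue->size packets may be outstanding. A minimal host-side sketch of that bound, with all names and values assumed for illustration:

#include <cstdint>
#include <cstdio>

// A write index (PacketId) is usable once fewer than QueueSize packets
// separate it from the read index; this mirrors the loop condition above.
static bool packetSlotAvailable(uint64_t PacketId, uint64_t ReadIndex,
                                uint64_t QueueSize) {
  return PacketId < ReadIndex + QueueSize;
}

int main() {
  printf("%d\n", packetSlotAvailable(1023, 0, 1024)); // 1: slot free
  printf("%d\n", packetSlotAvailable(1024, 0, 1024)); // 0: queue still full
  return 0;
}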
1363
1364int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
1365 ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
1366 int32_t ThreadLimit, uint64_t LoopTripcount) {
1367 // Set the context we are using.
1368 // Update the thread limit content in gpu memory if it is uninitialized or
1369 // was specified from the host.
1370
1371 DP("Run target team region thread_limit %d\n", ThreadLimit){};
1372
1373 // All args are references.
1374 std::vector<void *> Args(ArgNum);
1375 std::vector<void *> Ptrs(ArgNum);
1376
1377 DP("Arg_num: %d\n", ArgNum){};
1378 for (int32_t I = 0; I < ArgNum; ++I) {
1379 Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
1380 Args[I] = &Ptrs[I];
1381 DP("Offseted base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I])){};
1382 }
1383
1384 KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr;
1385
1386 std::string KernelName = std::string(KernelInfo->Name);
1387 auto &KernelInfoTable = DeviceInfo().KernelInfoTable;
1388 if (KernelInfoTable[DeviceId].find(KernelName) ==
1389 KernelInfoTable[DeviceId].end()) {
1390 DP("Kernel %s not found\n", KernelName.c_str()){};
1391 return OFFLOAD_FAIL(~0);
1392 }
1393
1394 const atl_kernel_info_t KernelInfoEntry =
1395 KernelInfoTable[DeviceId][KernelName];
1396 const uint32_t GroupSegmentSize =
1397 KernelInfoEntry.group_segment_size + DeviceInfo().Env.DynamicMemSize;
1398 const uint32_t SgprCount = KernelInfoEntry.sgpr_count;
1399 const uint32_t VgprCount = KernelInfoEntry.vgpr_count;
1400 const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count;
1401 const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count;
1402
1403 assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count);
1404
1405 /*
1406 * Set limit based on ThreadsPerGroup and GroupsPerDevice
1407 */
1408 LaunchVals LV =
1409 getLaunchVals(DeviceInfo().WarpSize[DeviceId], DeviceInfo().Env,
1410 KernelInfo->ConstWGSize, KernelInfo->ExecutionMode,
1411 NumTeams, // From run_region arg
1412 ThreadLimit, // From run_region arg
1413 LoopTripcount, // From run_region arg
1414 DeviceInfo().NumTeams[KernelInfo->DeviceId]);
1415 const int GridSize = LV.GridSize;
1416 const int WorkgroupSize = LV.WorkgroupSize;
1417
1418 if (print_kernel_trace >= LAUNCH) {
1419 int NumGroups = GridSize / WorkgroupSize;
1420 // enum modes are SPMD, GENERIC, NONE 0,1,2
1421 // if doing rtl timing, print to stderr, unless stdout requested.
1422 bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING);
1423 fprintf(TraceToStdout ? stdout : stderr,
1424 "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
1425 "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
1426 "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n",
1427 DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
1428 ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit,
1429 GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount,
1430 VgprSpillCount, LoopTripcount, KernelInfo->Name);
1431 }
1432
1433 // Run on the device.
1434 {
1435 hsa_queue_t *Queue = DeviceInfo().HSAQueueSchedulers[DeviceId].next();
1436 if (!Queue) {
1437 return OFFLOAD_FAIL(~0);
1438 }
1439 uint64_t PacketId = acquireAvailablePacketId(Queue);
1440
1441 const uint32_t Mask = Queue->size - 1; // size is a power of 2
1442 hsa_kernel_dispatch_packet_t *Packet =
1443 (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);
1444
1445 // packet->header is written last
1446 Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
1447 Packet->workgroup_size_x = WorkgroupSize;
1448 Packet->workgroup_size_y = 1;
1449 Packet->workgroup_size_z = 1;
1450 Packet->reserved0 = 0;
1451 Packet->grid_size_x = GridSize;
1452 Packet->grid_size_y = 1;
1453 Packet->grid_size_z = 1;
1454 Packet->private_segment_size = KernelInfoEntry.private_segment_size;
1455 Packet->group_segment_size = GroupSegmentSize;
1456 Packet->kernel_object = KernelInfoEntry.kernel_object;
1457 Packet->kernarg_address = 0; // use the block allocator
1458 Packet->reserved2 = 0; // impl writes id_ here
1459 Packet->completion_signal = {0}; // may want a pool of signals
1460
1461 KernelArgPool *ArgPool = nullptr;
1462 void *KernArg = nullptr;
1463 {
1464 auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name));
1465 if (It != KernelArgPoolMap.end()) {
1466 ArgPool = (It->second).get();
1467 }
1468 }
1469 if (!ArgPool) {
1470 DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name,{}
1471 DeviceId){};
1472 }
1473 {
1474 if (ArgPool) {
1475 assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *)));
1476 KernArg = ArgPool->allocate(ArgNum);
1477 }
1478 if (!KernArg) {
1479 DP("Allocate kernarg failed\n"){};
1480 return OFFLOAD_FAIL(~0);
1481 }
1482
1483 // Copy explicit arguments
1484 for (int I = 0; I < ArgNum; I++) {
1485 memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *));
1486 }
1487
1488 // Initialize implicit arguments. TODO: Which of these can be dropped
1489 impl_implicit_args_t *ImplArgs = reinterpret_cast<impl_implicit_args_t *>(
1490 static_cast<char *>(KernArg) + ArgPool->KernargSegmentSize);
1491 memset(ImplArgs, 0,
1492 sizeof(impl_implicit_args_t)); // may not be necessary
1493 ImplArgs->offset_x = 0;
1494 ImplArgs->offset_y = 0;
1495 ImplArgs->offset_z = 0;
1496
1497 // assign a hostcall buffer for the selected Q
1498 if (__atomic_load_n(&DeviceInfo().HostcallRequired, __ATOMIC_ACQUIRE)) {
1499 // hostrpc_assign_buffer is not thread safe, and this function is
1500 // under a multiple reader lock, not a writer lock.
1501 static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER;
1502 pthread_mutex_lock(&HostcallInitLock);
1503 uint64_t Buffer = hostrpc_assign_buffer(
1504 DeviceInfo().HSAAgents[DeviceId], Queue, DeviceId);
1505 pthread_mutex_unlock(&HostcallInitLock);
1506 if (!Buffer) {
1507 DP("hostrpc_assign_buffer failed, gpu would dereference null and "{}
1508 "error\n"){};
1509 return OFFLOAD_FAIL(~0);
1510 }
1511
1512 DP("Implicit argument count: %d\n",{}
1513 KernelInfoEntry.implicit_argument_count){};
1514 if (KernelInfoEntry.implicit_argument_count >= 4) {
1515 // Initialise the pointer for the implicit_argument_count != 0 ABI.
1516 // Guess that the right implicit argument is at offset 24 after
1517 // the explicit arguments. In the future, we should be able to read
1518 // the offset from msgpack; Clang is not annotating it at present.
1519 uint64_t Offset =
1520 sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3);
1521 if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) {
1522 DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit "{}
1523 "args: %d\n",{}
1524 Offset + 8, ArgPool->kernargSizeIncludingImplicit()){};
1525 } else {
1526 memcpy(static_cast<char *>(KernArg) + Offset, &Buffer, 8);
1527 }
1528 }
1529
1530 // initialise pointer for implicit_argument_count == 0 ABI
1531 ImplArgs->hostcall_ptr = Buffer;
1532 }
1533
1534 Packet->kernarg_address = KernArg;
1535 }
1536
1537 hsa_signal_t S = DeviceInfo().FreeSignalPool.pop();
1538 if (S.handle == 0) {
1539 DP("Failed to get signal instance\n"){};
1540 return OFFLOAD_FAIL(~0);
1541 }
1542 Packet->completion_signal = S;
1543 hsa_signal_store_relaxed(Packet->completion_signal, 1);
1544
1545 // Publish the packet indicating it is ready to be processed
1546 core::packetStoreRelease(reinterpret_cast<uint32_t *>(Packet),
1547 core::createHeader(), Packet->setup);
1548
1549 // Since the packet is already published, its contents must not be
1550 // accessed any more
1551 hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
1552
1553 while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
1554 HSA_WAIT_STATE_BLOCKED) != 0)
1555 ;
1556
1557 assert(ArgPool);
1558 ArgPool->deallocate(KernArg);
1559 DeviceInfo().FreeSignalPool.push(S);
1560 }
1561
1562 DP("Kernel completed\n"){};
1563 return OFFLOAD_SUCCESS(0);
1564}
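A worked example, with assumed values and not part of rtl.cpp, of the hostcall-pointer offset guessed in runRegionLocked above: since the argument copy loop stores each explicit argument in one pointer-sized slot, the guess lands 24 bytes past the explicit arguments on an LP64 host.

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t ExplicitArgumentCount = 5; // assumed kernel with 5 explicit args
  // Same formula as above: sizeof(void *) * (explicit_argument_count + 3).
  uint64_t Offset = sizeof(void *) * (ExplicitArgumentCount + 3);
  // 5 * 8 bytes of explicit args + 24 bytes = 64 on an LP64 host.
  printf("hostcall pointer offset = %llu\n", (unsigned long long)Offset);
  return 0;
}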
1565
1566bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) {
1567 const uint16_t AmdgcnMachineID = EM_AMDGPU;
1568 const int32_t R = elf_check_machine(Image, AmdgcnMachineID);
1569 if (!R) {
1570 DP("Supported machine ID not found\n"){};
1571 }
1572 return R;
1573}
1574
1575uint32_t elfEFlags(__tgt_device_image *Image) {
1576 const char *ImgBegin = (char *)Image->ImageStart;
1577 size_t ImgSize = (char *)Image->ImageEnd - ImgBegin;
1578
1579 StringRef Buffer = StringRef(ImgBegin, ImgSize);
1580 auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
1581 /*InitContent=*/false);
1582 if (!ElfOrErr) {
1583 consumeError(ElfOrErr.takeError());
1584 return 0;
1585 }
1586
1587 if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get()))
1588 return ELFObj->getPlatformFlags();
1589 return 0;
1590}
1591
1592template <typename T> bool enforceUpperBound(T *Value, T Upper) {
1593 bool Changed = *Value > Upper;
1594 if (Changed) {
1595 *Value = Upper;
1596 }
1597 return Changed;
1598}
1599
1600struct SymbolInfo {
1601 const void *Addr = nullptr;
1602 uint32_t Size = UINT32_MAX(4294967295U);
1603 uint32_t ShType = SHT_NULL;
1604};
1605
1606int getSymbolInfoWithoutLoading(const ELFObjectFile<ELF64LE> &ELFObj,
1607 StringRef SymName, SymbolInfo *Res) {
1608 auto SymOrErr = getELFSymbol(ELFObj, SymName);
1609 if (!SymOrErr) {
1610 std::string ErrorString = toString(SymOrErr.takeError());
1611 DP("Failed ELF lookup: %s\n", ErrorString.c_str()){};
1612 return 1;
1613 }
1614 if (!*SymOrErr)
1615 return 1;
1616
1617 auto SymSecOrErr = ELFObj.getELFFile().getSection((*SymOrErr)->st_shndx);
1618 if (!SymSecOrErr) {
1619 std::string ErrorString = toString(SymSecOrErr.takeError());
1620 DP("Failed ELF lookup: %s\n", ErrorString.c_str()){};
1621 return 1;
1622 }
1623
1624 Res->Addr = (*SymOrErr)->st_value + ELFObj.getELFFile().base();
1625 Res->Size = static_cast<uint32_t>((*SymOrErr)->st_size);
1626 Res->ShType = static_cast<uint32_t>((*SymSecOrErr)->sh_type);
1627 return 0;
1628}
1629
1630int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *SymName,
1631 SymbolInfo *Res) {
1632 StringRef Buffer = StringRef(Base, ImgSize);
1633 auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
1634 /*InitContent=*/false);
1635 if (!ElfOrErr) {
1636 REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str())do { fprintf(stderr, "AMDGPU" " error: "); fprintf(stderr, "Failed to load ELF: %s\n"
, toString(ElfOrErr.takeError()).c_str()); } while (0);
;
1637 return 1;
1638 }
1639
1640 if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get()))
1641 return getSymbolInfoWithoutLoading(*ELFObj, SymName, Res);
1642 return 1;
1643}
1644
1645hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize,
1646 const char *SymName, const void **VarAddr,
1647 uint32_t *VarSize) {
1648 SymbolInfo SI;
1649 int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI);
1650 if (Rc == 0) {
1651 *VarAddr = SI.Addr;
1652 *VarSize = SI.Size;
1653 return HSA_STATUS_SUCCESS;
1654 }
1655 return HSA_STATUS_ERROR;
1656}
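A hedged usage sketch of interopGetSymbolInfo; the helper below is hypothetical and not part of rtl.cpp, and assumes ImageStart/ImgSize describe an ELF image already resident in host memory.

// Returns true when the device image contains "omptarget_device_environment".
static bool imageHasDeviceEnvironment(char *ImageStart, size_t ImgSize) {
  const void *VarAddr = nullptr;
  uint32_t VarSize = 0;
  return interopGetSymbolInfo(ImageStart, ImgSize,
                              "omptarget_device_environment", &VarAddr,
                              &VarSize) == HSA_STATUS_SUCCESS;
}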
1657
1658template <typename C>
1659hsa_status_t moduleRegisterFromMemoryToPlace(
1660 std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
1661 std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
1662 void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb,
1663 std::vector<hsa_executable_t> &HSAExecutables) {
1664 auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t {
1665 C *Unwrapped = static_cast<C *>(CbState);
1666 return (*Unwrapped)(Data, Size);
1667 };
1668 return core::RegisterModuleFromMemory(
1669 KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize,
1670 DeviceInfo().HSAAgents[DeviceId], L, static_cast<void *>(&Cb),
1671 HSAExecutables);
1672}
1673
1674uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) {
1675 uint64_t DeviceStateBytes = 0;
1676 {
1677 // If this is the deviceRTL, get the state variable size
1678 SymbolInfo SizeSi;
1679 int Rc = getSymbolInfoWithoutLoading(
1680 ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi);
1681
1682 if (Rc == 0) {
1683 if (SizeSi.Size != sizeof(uint64_t)) {
1684 DP("Found device_State_size variable with wrong size\n"){};
1685 return 0;
1686 }
1687
1688 // Read number of bytes directly from the elf
1689 memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t));
1690 }
1691 }
1692 return DeviceStateBytes;
1693}
1694
1695struct DeviceEnvironment {
1696 // Initialises a DeviceEnvironmentTy in the deviceRTL. It patches around
1697 // differences in the deviceRTL between trunk, aomp and rocmcc. Over time
1698 // these differences will tend to zero and this class can be
1699 // simplified.
1700 // The symbol may be in .data or .bss, and may be missing fields. TODO:
1701 // review aomp/trunk/rocm and simplify the following.
1702
1703 // The symbol may also have been deadstripped because the device side
1704 // accessors were unused.
1705
1706 // If the symbol is in .data (aomp, rocm) it can be written directly.
1707 // If it is in .bss, we must wait for it to be allocated space on the
1708 // gpu (trunk) and initialize after loading.
1709 const char *sym() { return "omptarget_device_environment"; }
1710
1711 DeviceEnvironmentTy HostDeviceEnv;
1712 SymbolInfo SI;
1713 bool Valid = false;
1714
1715 __tgt_device_image *Image;
1716 const size_t ImgSize;
1717
1718 DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize,
1719 __tgt_device_image *Image, const size_t ImgSize)
1720 : Image(Image), ImgSize(ImgSize) {
1721
1722 HostDeviceEnv.NumDevices = NumberDevices;
1723 HostDeviceEnv.DeviceNum = DeviceId;
1724 HostDeviceEnv.DebugKind = 0;
1725 HostDeviceEnv.DynamicMemSize = DynamicMemSize;
1726 if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
1727 HostDeviceEnv.DebugKind = std::stoi(EnvStr);
1728
1729 int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize,
1730 sym(), &SI);
1731 if (Rc != 0) {
1732 DP("Finding global device environment '%s' - symbol missing.\n", sym()){};
1733 return;
1734 }
1735
1736 if (SI.Size > sizeof(HostDeviceEnv)) {
1737 DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size,{}
1738 sizeof(HostDeviceEnv)){};
1739 return;
1740 }
1741
1742 Valid = true;
1743 }
1744
1745 bool inImage() { return SI.ShType != SHT_NOBITS; }
1746
1747 hsa_status_t beforeLoading(void *Data, size_t Size) {
1748 if (Valid) {
1749 if (inImage()) {
1750 DP("Setting global device environment before load (%u bytes)\n",{}
1751 SI.Size){};
1752 uint64_t Offset = (char *)SI.Addr - (char *)Image->ImageStart;
1753 void *Pos = (char *)Data + Offset;
1754 memcpy(Pos, &HostDeviceEnv, SI.Size);
1755 }
1756 }
1757 return HSA_STATUS_SUCCESS;
1758 }
1759
1760 hsa_status_t afterLoading() {
1761 if (Valid) {
1762 if (!inImage()) {
1763 DP("Setting global device environment after load (%u bytes)\n",{}
1764 SI.Size){};
1765 int DeviceId = HostDeviceEnv.DeviceNum;
1766 auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
1767 void *StatePtr;
1768 uint32_t StatePtrSize;
1769 hsa_status_t Err = interop_hsa_get_symbol_info(
1770 SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize);
1771 if (Err != HSA_STATUS_SUCCESS) {
1772 DP("failed to find %s in loaded image\n", sym()){};
1773 return Err;
1774 }
1775
1776 if (StatePtrSize != SI.Size) {
1777 DP("Symbol had size %u before loading, %u after\n", StatePtrSize,{}
1778 SI.Size){};
1779 return HSA_STATUS_ERROR;
1780 }
1781
1782 return DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv,
1783 StatePtrSize, DeviceId);
1784 }
1785 }
1786 return HSA_STATUS_SUCCESS;
1787 }
1788};
1789
1790hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) {
1791 uint64_t Rounded = 4 * ((Size + 3) / 4);
1792 void *Ptr;
1793 hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId);
1794 hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr);
1795 if (Err != HSA_STATUS_SUCCESS) {
1796 return Err;
1797 }
1798
1799 hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4);
1800 if (Rc != HSA_STATUS_SUCCESS) {
1801 DP("zero fill device_state failed with %u\n", Rc){};
1802 core::Runtime::Memfree(Ptr);
1803 return HSA_STATUS_ERROR;
1804 }
1805
1806 *RetPtr = Ptr;
1807 return HSA_STATUS_SUCCESS;
1808}
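A standalone sketch, with an assumed request size and not part of rtl.cpp, of the arithmetic implCalloc uses: the allocation is rounded up to a multiple of four bytes because hsa_amd_memory_fill counts 32-bit words.

#include <cstdint>
#include <cstdio>

int main() {
  size_t Size = 10;                        // assumed requested size in bytes
  uint64_t Rounded = 4 * ((Size + 3) / 4); // 12: next multiple of 4
  uint64_t FillWords = Rounded / 4;        // 3: value passed as the fill count
  printf("rounded=%llu fill_words=%llu\n", (unsigned long long)Rounded,
         (unsigned long long)FillWords);
  return 0;
}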
1809
1810bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) {
1811 SymbolInfo SI;
1812 int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI);
1813 return (Rc == 0) && (SI.Addr != nullptr);
1814}
1815
1816} // namespace
1817
1818namespace core {
1819hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) {
1820 return hsa_amd_agents_allow_access(DeviceInfo().HSAAgents.size(),
1821 &DeviceInfo().HSAAgents[0], NULL__null, Ptr);
1822}
1823} // namespace core
1824
1825static hsa_status_t GetIsaInfo(hsa_isa_t isa, void *data) {
1826 hsa_status_t err;
1827 uint32_t name_len;
1828 err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME_LENGTH, &name_len);
1829 if (err != HSA_STATUS_SUCCESS) {
1830 DP("Error getting ISA info length\n"){};
1831 return err;
1832 }
1833
1834 char TargetID[name_len];
1835 err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, TargetID);
1836 if (err != HSA_STATUS_SUCCESS) {
1837 DP("Error getting ISA info name\n"){};
1838 return err;
1839 }
1840
1841 auto TripleTargetID = llvm::StringRef(TargetID);
1842 if (TripleTargetID.consume_front("amdgcn-amd-amdhsa")) {
1843 DeviceInfo().TargetID.push_back(TripleTargetID.ltrim('-').str());
1844 }
1845 return HSA_STATUS_SUCCESS;
1846}
1847
1848 /// Parse a TargetID to get the processor arch and feature map.
1849 /// Returns the processor subarch.
1850 /// Returns the TargetID features in the \p FeatureMap argument.
1851 /// If the \p TargetID contains feature+, map it to true in \p FeatureMap.
1852 /// If the \p TargetID contains feature-, map it to false in \p FeatureMap.
1853 /// If the \p TargetID does not contain a feature (default), do not map it.
1854StringRef parseTargetID(StringRef TargetID, StringMap<bool> &FeatureMap) {
1855 if (TargetID.empty())
1856 return llvm::StringRef();
1857
1858 auto ArchFeature = TargetID.split(":");
1859 auto Arch = ArchFeature.first;
1860 auto Features = ArchFeature.second;
1861 if (Features.empty())
1862 return Arch;
1863
1864 if (Features.contains("sramecc+")) {
1865 FeatureMap.insert(std::pair<std::string, bool>("sramecc", true));
1866 } else if (Features.contains("sramecc-")) {
1867 FeatureMap.insert(std::pair<std::string, bool>("sramecc", false));
1868 }
1869 if (Features.contains("xnack+")) {
1870 FeatureMap.insert(std::pair<std::string, bool>("xnack", true));
1871 } else if (Features.contains("xnack-")) {
1872 FeatureMap.insert(std::pair<std::string, bool>("xnack", false));
1873 }
1874
1875 return Arch;
1876}
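A small sketch of what parseTargetID returns for a fully specified TargetID string; the "gfx90a" TargetID and the helper below are assumptions for illustration, not part of rtl.cpp.

static void parseTargetIDExample() {
  StringMap<bool> Features;
  StringRef Arch = parseTargetID("gfx90a:sramecc+:xnack-", Features);
  // Arch == "gfx90a"; Features["sramecc"] == true; Features["xnack"] == false.
  (void)Arch;
}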
1877
1878 /// Checks if an image \p ImgInfo is compatible with the current
1879 /// system's environment \p EnvInfo.
1880bool IsImageCompatibleWithEnv(const char *ImgInfo, std::string EnvInfo) {
1881 llvm::StringRef ImgTID(ImgInfo), EnvTID(EnvInfo);
1882
1883 // Compatible in case of exact match
1884 if (ImgTID == EnvTID) {
1885 DP("Compatible: Exact match \t[Image: %s]\t:\t[Environment: %s]\n",{}
1886 ImgTID.data(), EnvTID.data()){};
1887 return true;
1888 }
1889
1890 // Incompatible if Archs mismatch.
1891 StringMap<bool> ImgMap, EnvMap;
1892 StringRef ImgArch = parseTargetID(ImgTID, ImgMap);
1893 StringRef EnvArch = parseTargetID(EnvTID, EnvMap);
1894
1895 // Both EnvArch and ImgArch can't be empty here.
1896 if (EnvArch.empty() || ImgArch.empty() || !ImgArch.contains(EnvArch)) {
1897 DP("Incompatible: Processor mismatch \t[Image: %s]\t:\t[Environment: %s]\n",{}
1898 ImgTID.data(), EnvTID.data()){};
1899 return false;
1900 }
1901
1902 // Incompatible if image has more features than the environment, irrespective
1903 // of type or sign of features.
1904 if (ImgMap.size() > EnvMap.size()) {
1905 DP("Incompatible: Image has more features than the environment \t[Image: "{}
1906 "%s]\t:\t[Environment: %s]\n",{}
1907 ImgTID.data(), EnvTID.data()){};
1908 return false;
1909 }
1910
1911 // Compatible if each target feature specified by the environment is
1912 // compatible with the corresponding target feature of the image. A target
1913 // feature is compatible if the image does not specify it (meaning Any), or
1914 // if it specifies it with the same value (meaning On or Off).
1915 for (const auto &ImgFeature : ImgMap) {
1916 auto EnvFeature = EnvMap.find(ImgFeature.first());
1917 if (EnvFeature == EnvMap.end()) {
1918 DP("Incompatible: Value of Image's non-ANY feature is not matching with "{}
1919 "the Environment feature's ANY value \t[Image: %s]\t:\t[Environment: "{}
1920 "%s]\n",{}
1921 ImgTID.data(), EnvTID.data()){};
1922 return false;
1923 } else if (EnvFeature->first() == ImgFeature.first() &&
1924 EnvFeature->second != ImgFeature.second) {
1925 DP("Incompatible: Value of Image's non-ANY feature is not matching with "{}
1926 "the Environment feature's non-ANY value \t[Image: "{}
1927 "%s]\t:\t[Environment: %s]\n",{}
1928 ImgTID.data(), EnvTID.data()){};
1929 return false;
1930 }
1931 }
1932
1933 // Image is compatible if all features of Environment are:
1934 // - either, present in the Image's features map with the same sign,
1935 // - or, the feature is missing from Image's features map i.e. it is
1936 // set to ANY
1937 DP("Compatible: Target IDs are compatible \t[Image: %s]\t:\t[Environment: "{}
1938 "%s]\n",{}
1939 ImgTID.data(), EnvTID.data()){};
1940 return true;
1941}
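A few expected outcomes of IsImageCompatibleWithEnv for assumed TargetID pairs, following the rules implemented above; the helper is illustrative only and not part of rtl.cpp.

static void imageCompatibilityExamples() {
  // Exact match: compatible.
  bool A = IsImageCompatibleWithEnv("gfx90a:xnack-", "gfx90a:xnack-"); // true
  // Image leaves xnack as ANY: compatible with either environment setting.
  bool B = IsImageCompatibleWithEnv("gfx90a", "gfx90a:xnack+");        // true
  // Same feature with opposite signs: incompatible.
  bool C = IsImageCompatibleWithEnv("gfx90a:xnack+", "gfx90a:xnack-"); // false
  (void)A; (void)B; (void)C;
}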
1942
1943extern "C" {
1944int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
1945 return elfMachineIdIsAmdgcn(Image);
1946}
1947
1948int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image,
1949 __tgt_image_info *info) {
1950 if (!__tgt_rtl_is_valid_binary(image))
1951 return false;
1952
1953 // A subarchitecture was not specified. Assume it is compatible.
1954 if (!info->Arch)
1955 return true;
1956
1957 int32_t NumberOfDevices = __tgt_rtl_number_of_devices();
1958
1959 for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
1960 __tgt_rtl_init_device(DeviceId);
1961 hsa_agent_t agent = DeviceInfo().HSAAgents[DeviceId];
1962 hsa_status_t err = hsa_agent_iterate_isas(agent, GetIsaInfo, &DeviceId);
1963 if (err != HSA_STATUS_SUCCESS) {
1964 DP("Error iterating ISAs\n"){};
1965 return false;
1966 }
1967 if (!IsImageCompatibleWithEnv(info->Arch, DeviceInfo().TargetID[DeviceId]))
1968 return false;
1969 }
1970 DP("Image has Target ID compatible with the current environment: %s\n",{}
1971 info->Arch){};
1972 return true;
1973}
1974
1975int32_t __tgt_rtl_init_plugin() { return OFFLOAD_SUCCESS(0); }
1976int32_t __tgt_rtl_deinit_plugin() { return OFFLOAD_SUCCESS(0); }
1977
1978int __tgt_rtl_number_of_devices() {
1979 // If the construction failed, no methods are safe to call
1980 if (DeviceInfo().ConstructionSucceeded) {
1981 return DeviceInfo().NumberOfDevices;
1982 }
1983 DP("AMDGPU plugin construction failed. Zero devices available\n"){};
1984 return 0;
1985}
1986
1987int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
1988 DP("Init requires flags to %ld\n", RequiresFlags){};
1989 DeviceInfo().RequiresFlags = RequiresFlags;
1990 return RequiresFlags;
1991}
1992
1993int32_t __tgt_rtl_init_device(int DeviceId) {
1994 hsa_status_t Err = hsa_init();
1995 if (Err != HSA_STATUS_SUCCESS) {
1996 DP("HSA Initialization Failed.\n"){};
1997 return HSA_STATUS_ERROR;
1998 }
1999 // this is per device id init
2000 DP("Initialize the device id: %d\n", DeviceId){};
2001
2002 hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId];
2003
2004 // Get number of Compute Unit
2005 uint32_t ComputeUnits = 0;
2006 Err = hsa_agent_get_info(
2007 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
2008 &ComputeUnits);
2009 if (Err != HSA_STATUS_SUCCESS) {
2010 DeviceInfo().ComputeUnits[DeviceId] = 1;
2011 DP("Error getting compute units : settiing to 1\n"){};
2012 } else {
2013 DeviceInfo().ComputeUnits[DeviceId] = ComputeUnits;
2014 DP("Using %d compute unis per grid\n", DeviceInfo().ComputeUnits[DeviceId]){};
2015 }
2016
2017 char GetInfoName[64]; // 64 max size returned by get info
2018 Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
2019 (void *)GetInfoName);
2020 if (Err)
2021 DeviceInfo().GPUName[DeviceId] = "--unknown gpu--";
2022 else {
2023 DeviceInfo().GPUName[DeviceId] = GetInfoName;
2024 }
2025
2026 if (print_kernel_trace & STARTUP_DETAILS)
2027 DP("Device#%-2d CU's: %2d %s\n", DeviceId,{}
2028 DeviceInfo().ComputeUnits[DeviceId],{}
2029 DeviceInfo().GPUName[DeviceId].c_str()){};
2030
2031 // Query attributes to determine number of threads/block and blocks/grid.
2032 uint16_t WorkgroupMaxDim[3];
2033 Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
2034 &WorkgroupMaxDim);
2035 if (Err != HSA_STATUS_SUCCESS) {
2036 DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams;
2037 DP("Error getting grid dims: num groups : %d\n",{}
2038 RTLDeviceInfoTy::DefaultNumTeams){};
2039 } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
2040 DeviceInfo().GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0];
2041 DP("Using %d ROCm blocks per grid\n",{}
2042 DeviceInfo().GroupsPerDevice[DeviceId]){};
2043 } else {
2044 DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit;
2045 DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "{}
2046 "at the hard limit\n",{}
2047 WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit){};
2048 }
2049
2050 // Get thread limit
2051 hsa_dim3_t GridMaxDim;
2052 Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim);
2053 if (Err == HSA_STATUS_SUCCESS) {
2054 DeviceInfo().ThreadsPerGroup[DeviceId] =
2055 reinterpret_cast<uint32_t *>(&GridMaxDim)[0] /
2056 DeviceInfo().GroupsPerDevice[DeviceId];
2057
2058 if (DeviceInfo().ThreadsPerGroup[DeviceId] == 0) {
2059 DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
2060 DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize){};
2061 } else if (enforceUpperBound(&DeviceInfo().ThreadsPerGroup[DeviceId],
2062 RTLDeviceInfoTy::MaxWgSize)) {
2063 DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize){};
2064 } else {
2065 DP("Using ROCm Queried thread limit: %d\n",{}
2066 DeviceInfo().ThreadsPerGroup[DeviceId]){};
2067 }
2068 } else {
2069 DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
2070 DP("Error getting max block dimension, use default:%d \n",{}
2071 RTLDeviceInfoTy::MaxWgSize){};
2072 }
2073
2074 // Get wavefront size
2075 uint32_t WavefrontSize = 0;
2076 Err =
2077 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize);
2078 if (Err == HSA_STATUS_SUCCESS) {
2079 DP("Queried wavefront size: %d\n", WavefrontSize){};
2080 DeviceInfo().WarpSize[DeviceId] = WavefrontSize;
2081 } else {
2082 // TODO: Burn the wavefront size into the code object
2083 DP("Warning: Unknown wavefront size, assuming 64\n"){};
2084 DeviceInfo().WarpSize[DeviceId] = 64;
2085 }
2086
2087 // Adjust teams to the env variables
2088
2089 if (DeviceInfo().Env.TeamLimit > 0 &&
2090 (enforceUpperBound(&DeviceInfo().GroupsPerDevice[DeviceId],
2091 DeviceInfo().Env.TeamLimit))) {
2092 DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",{}
2093 DeviceInfo().Env.TeamLimit){};
2094 }
2095
2096 // Set default number of teams
2097 if (DeviceInfo().Env.NumTeams > 0) {
2098 DeviceInfo().NumTeams[DeviceId] = DeviceInfo().Env.NumTeams;
2099 DP("Default number of teams set according to environment %d\n",{}
2100 DeviceInfo().Env.NumTeams){};
2101 } else {
2102 char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC");
2103 int TeamsPerCU = DefaultTeamsPerCU;
2104 if (TeamsPerCUEnvStr) {
2105 TeamsPerCU = std::stoi(TeamsPerCUEnvStr);
2106 }
2107
2108 DeviceInfo().NumTeams[DeviceId] =
2109 TeamsPerCU * DeviceInfo().ComputeUnits[DeviceId];
2110 DP("Default number of teams = %d * number of compute units %d\n",{}
2111 TeamsPerCU, DeviceInfo().ComputeUnits[DeviceId]){};
2112 }
2113
2114 if (enforceUpperBound(&DeviceInfo().NumTeams[DeviceId],
2115 DeviceInfo().GroupsPerDevice[DeviceId])) {
2116 DP("Default number of teams exceeds device limit, capping at %d\n",{}
2117 DeviceInfo().GroupsPerDevice[DeviceId]){};
2118 }
2119
2120 // Adjust threads to the env variables
2121 if (DeviceInfo().Env.TeamThreadLimit > 0 &&
2122 (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId],
2123 DeviceInfo().Env.TeamThreadLimit))) {
2124 DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n",{}
2125 DeviceInfo().Env.TeamThreadLimit){};
2126 }
2127
2128 // Set default number of threads
2129 DeviceInfo().NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize;
2130 DP("Default number of threads set according to library's default %d\n",{}
2131 RTLDeviceInfoTy::DefaultWgSize){};
2132 if (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId],
2133 DeviceInfo().ThreadsPerGroup[DeviceId])) {
2134 DP("Default number of threads exceeds device limit, capping at %d\n",{}
2135 DeviceInfo().ThreadsPerGroup[DeviceId]){};
2136 }
2137
2138 DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",{}
2139 DeviceId, DeviceInfo().GroupsPerDevice[DeviceId],{}
2140 DeviceInfo().ThreadsPerGroup[DeviceId]){};
2141
2142 DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId,{}
2143 DeviceInfo().WarpSize[DeviceId], DeviceInfo().ThreadsPerGroup[DeviceId],{}
2144 DeviceInfo().GroupsPerDevice[DeviceId],{}
2145 DeviceInfo().GroupsPerDevice[DeviceId] *{}
2146 DeviceInfo().ThreadsPerGroup[DeviceId]){};
2147
2148 return OFFLOAD_SUCCESS(0);
2149}
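A worked example, with assumed values and not part of rtl.cpp, of the default team count computed above when Env.NumTeams is not set: teams-per-CU multiplied by the agent's compute-unit count.

#include <cstdio>

int main() {
  int TeamsPerCU = 4;     // assumed default, overridable via OMP_TARGET_TEAMS_PER_PROC
  int ComputeUnits = 104; // assumed CU count reported by the agent
  printf("default teams = %d\n", TeamsPerCU * ComputeUnits); // 416
  return 0;
}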
2150
2151static __tgt_target_table *
2152__tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image);
2153
2154__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
2155 __tgt_device_image *Image) {
2156 DeviceInfo().LoadRunLock.lock();
2157 __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image);
2158 DeviceInfo().LoadRunLock.unlock();
2159 return Res;
2160}
2161
2162__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
2163 __tgt_device_image *Image) {
2164 // This function loads the device image onto gpu[DeviceId] and does other
2165 // per-image initialization work. Specifically:
2166 //
2167 // - Initialize a DeviceEnvironmentTy instance embedded in the
2168 // image at the symbol "omptarget_device_environment"
2169 // Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL.
2170 //
2171 // - Allocate a large array per-gpu (could be moved to init_device)
2172 // - Read a uint64_t at symbol omptarget_nvptx_device_State_size
2173 // - Allocate at least that many bytes of gpu memory
2174 // - Zero initialize it
2175 // - Write the pointer to the symbol omptarget_nvptx_device_State
2176 //
2177 // - Pulls some per-kernel information together from various sources and
2178 // records it in the KernelsList for quicker access later
2179 //
2180 // The initialization can be done before or after loading the image onto the
2181 // gpu. This function presently does a mixture. Using the hsa api to get/set
2182 // the information is simpler to implement, in exchange for more complicated
2183 // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes
2184 // back from the gpu vs a hashtable lookup on the host.
2185
2186 const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart;
2187
2188 DeviceInfo().clearOffloadEntriesTable(DeviceId);
2189
2190 // We do not need to set the ELF version because the caller of this function
2191 // had to do that to decide the right runtime to use
2192
2193 if (!elfMachineIdIsAmdgcn(Image))
2194 return NULL__null;
2195
2196 {
2197 auto Env =
2198 DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
2199 DeviceInfo().Env.DynamicMemSize, Image, ImgSize);
2200
2201 auto &KernelInfo = DeviceInfo().KernelInfoTable[DeviceId];
2202 auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
2203 hsa_status_t Err = moduleRegisterFromMemoryToPlace(
2204 KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId,
2205 [&](void *Data, size_t Size) {
2206 if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) {
2207 __atomic_store_n(&DeviceInfo().HostcallRequired, true,
2208 __ATOMIC_RELEASE);
2209 }
2210 return Env.beforeLoading(Data, Size);
2211 },
2212 DeviceInfo().HSAExecutables);
2213
2214 check("Module registering", Err){};
2215 if (Err != HSA_STATUS_SUCCESS) {
2216 const char *DeviceName = DeviceInfo().GPUName[DeviceId].c_str();
2217 const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image));
2218
2219 if (strcmp(DeviceName, ElfName) != 0) {
2220 DP("Possible gpu arch mismatch: device:%s, image:%s please check"{}
2221 " compiler flag: -march=<gpu>\n",{}
2222 DeviceName, ElfName){};
2223 } else {
2224 DP("Error loading image onto GPU: %s\n", get_error_string(Err)){};
2225 }
2226
2227 return NULL__null;
2228 }
2229
2230 Err = Env.afterLoading();
2231 if (Err != HSA_STATUS_SUCCESS) {
2232 return NULL__null;
2233 }
2234 }
2235
2236 DP("AMDGPU module successfully loaded!\n"){};
2237
2238 {
2239 // The device_State array is either a large value in .bss or a void* that
2240 // needs to be assigned a pointer to an array of device_state_bytes bytes.
2241 // If absent, it has been deadstripped and needs no setup.
2242
2243 void *StatePtr;
2244 uint32_t StatePtrSize;
2245 auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
2246 hsa_status_t Err = interop_hsa_get_symbol_info(
2247 SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr,
2248 &StatePtrSize);
2249
2250 if (Err != HSA_STATUS_SUCCESS) {
2251 DP("No device_state symbol found, skipping initialization\n"){};
2252 } else {
2253 if (StatePtrSize < sizeof(void *)) {
2254 DP("unexpected size of state_ptr %u != %zu\n", StatePtrSize,{}
2255 sizeof(void *)){};
2256 return NULL__null;
2257 }
2258
2259 // If it's larger than a void*, assume it's a .bss array and no further
2260 // initialization is required. Only try to set up a pointer when the size
2261 // equals sizeof(void*).
2262 if (StatePtrSize == sizeof(void *)) {
2263 uint64_t DeviceStateBytes =
2264 getDeviceStateBytes((char *)Image->ImageStart, ImgSize);
2265 if (DeviceStateBytes == 0) {
2266 DP("Can't initialize device_State, missing size information\n"){};
2267 return NULL__null;
2268 }
2269
2270 auto &DSS = DeviceInfo().DeviceStateStore[DeviceId];
2271 if (DSS.first.get() == nullptr) {
2272 assert(DSS.second == 0);
2273 void *Ptr = NULL__null;
2274 hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId);
2275 if (Err != HSA_STATUS_SUCCESS) {
2276 DP("Failed to allocate device_state array\n"){};
2277 return NULL__null;
2278 }
2279 DSS = {
2280 std::unique_ptr<void, RTLDeviceInfoTy::ImplFreePtrDeletor>{Ptr},
2281 DeviceStateBytes,
2282 };
2283 }
2284
2285 void *Ptr = DSS.first.get();
2286 if (DeviceStateBytes != DSS.second) {
2287 DP("Inconsistent sizes of device_State unsupported\n"){};
2288 return NULL__null;
2289 }
2290
2291 // write ptr to device memory so it can be used by later kernels
2292 Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr,
2293 sizeof(void *), DeviceId);
2294 if (Err != HSA_STATUS_SUCCESS) {
2295 DP("memcpy install of state_ptr failed\n"){};
2296 return NULL__null;
2297 }
2298 }
2299 }
2300 }
2301
2302 // Here, we take advantage of the data that is appended after img_end to get
2303 // the names of the symbols we need to load. This data consists of the host
2304 // entries begin and end as well as the target name (see the offloading
2305 // linker script creation in the clang compiler).
2306
2307 // Find the symbols in the module by name. The name can be obtained by
2308 // concatenating the host entry name with the target name.
2309
2310 __tgt_offload_entry *HostBegin = Image->EntriesBegin;
2311 __tgt_offload_entry *HostEnd = Image->EntriesEnd;
2312
2313 for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
2314
2315 if (!E->addr) {
2316 // The host should always have something in the address to
2317 // uniquely identify the target region.
2318 DP("Analyzing host entry '<null>' (size = %lld)...\n",{}
2319 (unsigned long long)E->size){};
2320 return NULL__null;
2321 }
2322
2323 if (E->size) {
2324 __tgt_offload_entry Entry = *E;
2325
2326 void *Varptr;
2327 uint32_t Varsize;
2328
2329 auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
2330 hsa_status_t Err = interop_hsa_get_symbol_info(
2331 SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize);
2332
2333 if (Err != HSA_STATUS_SUCCESS) {
2334 // Inform the user what symbol prevented offloading
2335 DP("Loading global '%s' (Failed)\n", E->name){};
2336 return NULL__null;
2337 }
2338
2339 if (Varsize != E->size) {
2340 DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name,{}
2341 Varsize, E->size){};
2342 return NULL__null;
2343 }
2344
2345 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",{}
2346 DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr)){};
2347 Entry.addr = (void *)Varptr;
2348
2349 DeviceInfo().addOffloadEntry(DeviceId, Entry);
2350
2351 if (DeviceInfo().RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
2352 E->flags & OMP_DECLARE_TARGET_LINK) {
2353 // If unified memory is present any target link variables
2354 // can access host addresses directly. There is no longer a
2355 // need for device copies.
2356 Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr,
2357 sizeof(void *), DeviceId);
2358 if (Err != HSA_STATUS_SUCCESS)
2359 DP("Error when copying USM\n"){};
2360 DP("Copy linked variable host address (" DPxMOD ")"{}
2361 "to device address (" DPxMOD ")\n",{}
2362 DPxPTR(*((void **)E->addr)), DPxPTR(Varptr)){};
2363 }
2364
2365 continue;
2366 }
2367
2368 DP("to find the kernel name: %s size: %lu\n", E->name, strlen(E->name)){};
2369
2370 // Errors in kernarg_segment_size were previously treated as 0 (or as undef).
2371 uint32_t KernargSegmentSize = 0;
2372 auto &KernelInfoMap = DeviceInfo().KernelInfoTable[DeviceId];
2373 hsa_status_t Err = HSA_STATUS_SUCCESS;
2374 if (!E->name) {
2375 Err = HSA_STATUS_ERROR;
2376 } else {
2377 std::string KernelStr = std::string(E->name);
2378 auto It = KernelInfoMap.find(KernelStr);
2379 if (It != KernelInfoMap.end()) {
2380 atl_kernel_info_t Info = It->second;
2381 KernargSegmentSize = Info.kernel_segment_size;
2382 } else {
2383 Err = HSA_STATUS_ERROR;
Value stored to 'Err' is never read
2384 }
2385 }
2386
2387 // Default value GENERIC (in case the symbol is missing from the code object)
2388 llvm::omp::OMPTgtExecModeFlags ExecModeVal =
2389 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
2390
2391 // get flat group size if present, else Default_WG_Size
2392 int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
2393
2394 // get Kernel Descriptor if present.
2395 // Keep struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp
2396 struct KernDescValType {
2397 uint16_t Version;
2398 uint16_t TSize;
2399 uint16_t WGSize;
2400 };
2401 struct KernDescValType KernDescVal;
2402 std::string KernDescNameStr(E->name);
2403 KernDescNameStr += "_kern_desc";
2404 const char *KernDescName = KernDescNameStr.c_str();
2405
2406 const void *KernDescPtr;
2407 uint32_t KernDescSize;
2408 void *CallStackAddr = nullptr;
2409 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, KernDescName,
2410 &KernDescPtr, &KernDescSize);
2411
2412 if (Err == HSA_STATUS_SUCCESS) {
2413 if ((size_t)KernDescSize != sizeof(KernDescVal))
2414 DP("Loading global computation properties '%s' - size mismatch (%u != "{}
2415 "%lu)\n",{}
2416 KernDescName, KernDescSize, sizeof(KernDescVal)){};
2417
2418 memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);
2419
2420 // Check structure size against recorded size.
2421 if ((size_t)KernDescSize != KernDescVal.TSize)
2422 DP("KernDescVal size %lu does not match advertized size %d for '%s'\n",{}
2423 sizeof(KernDescVal), KernDescVal.TSize, KernDescName){};
2424
2425 DP("After loading global for %s KernDesc \n", KernDescName){};
2426 DP("KernDesc: Version: %d\n", KernDescVal.Version){};
2427 DP("KernDesc: TSize: %d\n", KernDescVal.TSize){};
2428 DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize){};
2429
2430 if (KernDescVal.WGSize == 0) {
2431 KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize;
2432 DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WGSize){};
2433 }
2434 WGSizeVal = KernDescVal.WGSize;
2435 DP("WGSizeVal %d\n", WGSizeVal){};
2436 check("Loading KernDesc computation property", Err){};
2437 } else {
2438 DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName){};
2439
2440 // Flat group size
2441 std::string WGSizeNameStr(E->name);
2442 WGSizeNameStr += "_wg_size";
2443 const char *WGSizeName = WGSizeNameStr.c_str();
2444
2445 const void *WGSizePtr;
2446 uint32_t WGSize;
2447 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, WGSizeName,
2448 &WGSizePtr, &WGSize);
2449
2450 if (Err == HSA_STATUS_SUCCESS) {
2451 if ((size_t)WGSize != sizeof(int16_t)) {
2452 DP("Loading global computation properties '%s' - size mismatch (%u "{}
2453 "!= "{}
2454 "%lu)\n",{}
2455 WGSizeName, WGSize, sizeof(int16_t)){};
2456 return NULL__null;
2457 }
2458
2459 memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize);
2460
2461 DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal){};
2462
2463 if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize ||
2464 WGSizeVal > RTLDeviceInfoTy::MaxWgSize) {
2465 DP("Error wrong WGSize value specified in HSA code object file: "{}
2466 "%d\n",{}
2467 WGSizeVal){};
2468 WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
2469 }
2470 } else {
2471 DP("Warning: Loading WGSize '%s' - symbol not found, "{}
2472 "using default value %d\n",{}
2473 WGSizeName, WGSizeVal){};
2474 }
2475
2476 check("Loading WGSize computation property", Err){};
2477 }
2478
2479 // Read execution mode from global in binary
2480 std::string ExecModeNameStr(E->name);
2481 ExecModeNameStr += "_exec_mode";
2482 const char *ExecModeName = ExecModeNameStr.c_str();
2483
2484 const void *ExecModePtr;
2485 uint32_t VarSize;
2486 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, ExecModeName,
2487 &ExecModePtr, &VarSize);
2488
2489 if (Err == HSA_STATUS_SUCCESS) {
2490 if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
2491 DP("Loading global computation properties '%s' - size mismatch(%u != "{}
2492 "%lu)\n",{}
2493 ExecModeName, VarSize, sizeof(llvm::omp::OMPTgtExecModeFlags)){};
2494 return NULL__null;
2495 }
2496
2497 memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize);
2498
2499 DP("After loading global for %s ExecMode = %d\n", ExecModeName,{}
2500 ExecModeVal){};
2501
2502 if (ExecModeVal < 0 ||
2503 ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) {
2504 DP("Error wrong exec_mode value specified in HSA code object file: "{}
2505 "%d\n",{}
2506 ExecModeVal){};
2507 return NULL__null;
2508 }
2509 } else {
2510 DP("Loading global exec_mode '%s' - symbol missing, using default "{}
2511 "value "{}
2512 "GENERIC (1)\n",{}
2513 ExecModeName){};
2514 }
2515 check("Loading computation property", Err){};
2516
2517 KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
2518 CallStackAddr, E->name, KernargSegmentSize,
2519 DeviceInfo().KernArgPool));
2520 __tgt_offload_entry Entry = *E;
2521 Entry.addr = (void *)&KernelsList.back();
2522 DeviceInfo().addOffloadEntry(DeviceId, Entry);
2523 DP("Entry point %ld maps to %s\n", E - HostBegin, E->name){};
2524 }
2525
2526 return DeviceInfo().getOffloadEntriesTable(DeviceId);
2527}
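A sketch of the per-kernel metadata symbols the loader above looks up by appending suffixes to the host entry name; the entry name and helper are hypothetical, not part of rtl.cpp.

#include <string>

static void kernelMetadataNamesExample() {
  // Assumed offload entry name; real names are generated by the compiler.
  std::string Name = "__omp_offloading_fd00_1_foo_l10";
  std::string KernDesc = Name + "_kern_desc"; // {Version, TSize, WGSize} record
  std::string WGSize = Name + "_wg_size";     // int16_t flat workgroup size
  std::string ExecMode = Name + "_exec_mode"; // OMPTgtExecModeFlags value
  (void)KernDesc; (void)WGSize; (void)ExecMode;
}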
2528
2529void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) {
2530 void *Ptr = NULL__null;
2531 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2532
2533 hsa_amd_memory_pool_t MemoryPool;
2534 switch (Kind) {
2535 case TARGET_ALLOC_DEFAULT:
2536 case TARGET_ALLOC_DEVICE:
2537 // GPU memory
2538 MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId);
2539 break;
2540 case TARGET_ALLOC_HOST:
2541 // non-migratable memory accessible by host and device(s)
2542 MemoryPool = DeviceInfo().getHostMemoryPool();
2543 break;
2544 default:
2545 REPORT("Invalid target data allocation kind or requested allocator not "do { fprintf(stderr, "AMDGPU" " error: "); fprintf(stderr, "Invalid target data allocation kind or requested allocator not "
"implemented yet\n"); } while (0);
2546 "implemented yet\n")do { fprintf(stderr, "AMDGPU" " error: "); fprintf(stderr, "Invalid target data allocation kind or requested allocator not "
"implemented yet\n"); } while (0);
;
2547 return NULL__null;
2548 }
2549
2550 hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr);
2551 DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size,{}
2552 (long long unsigned)(Elf64_Addr)Ptr){};
2553 Ptr = (Err == HSA_STATUS_SUCCESS) ? Ptr : NULL__null;
2554 return Ptr;
2555}
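A hedged usage sketch of the allocation kinds handled above; the device id, size, and wrapper function are assumptions for illustration and not part of rtl.cpp. Any other kind falls through to the error path.

static void dataAllocExample() {
  void *DevPtr = __tgt_rtl_data_alloc(/*DeviceId=*/0, /*Size=*/1024, nullptr,
                                      TARGET_ALLOC_DEVICE); // GPU memory pool
  void *HostPtr = __tgt_rtl_data_alloc(0, 1024, nullptr,
                                       TARGET_ALLOC_HOST);  // host-accessible pool
  (void)DevPtr; (void)HostPtr;
}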
2556
2557int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr,
2558 int64_t Size) {
2559 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2560 __tgt_async_info AsyncInfo;
2561 int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
2562 if (Rc != OFFLOAD_SUCCESS(0))
2563 return OFFLOAD_FAIL(~0);
2564
2565 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
2566}
2567
2568int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr,
2569 int64_t Size, __tgt_async_info *AsyncInfo) {
2570 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2571 if (AsyncInfo) {
2572 initAsyncInfo(AsyncInfo);
2573 return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo);
2574 }
2575 return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size);
2576}
2577
2578int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr,
2579 int64_t Size) {
2580 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2581 __tgt_async_info AsyncInfo;
2582 int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
2583 if (Rc != OFFLOAD_SUCCESS(0))
2584 return OFFLOAD_FAIL(~0);
2585
2586 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
2587}
2588
2589int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr,
2590 int64_t Size,
2591 __tgt_async_info *AsyncInfo) {
2592 assert(AsyncInfo && "AsyncInfo is nullptr");
2593 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2594 initAsyncInfo(AsyncInfo);
2595 return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo);
2596}
2597
2598int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr, int32_t) {
2599 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2600 // HSA can free pointers allocated from different types of memory pool.
2601 hsa_status_t Err;
2602 DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr){};
2603 Err = core::Runtime::Memfree(TgtPtr);
2604 if (Err != HSA_STATUS_SUCCESS) {
2605 DP("Error when freeing CUDA memory\n"){};
2606 return OFFLOAD_FAIL(~0);
2607 }
2608 return OFFLOAD_SUCCESS(0);
2609}
2610
2611int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
2612 void **TgtArgs, ptrdiff_t *TgtOffsets,
2613 int32_t ArgNum, int32_t NumTeams,
2614 int32_t ThreadLimit,
2615 uint64_t LoopTripcount) {
2616
2617 DeviceInfo().LoadRunLock.lock_shared();
2618 int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
2619 ArgNum, NumTeams, ThreadLimit, LoopTripcount);
2620
2621 DeviceInfo().LoadRunLock.unlock_shared();
2622 return Res;
2623}
2624
2625int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr,
2626 void **TgtArgs, ptrdiff_t *TgtOffsets,
2627 int32_t ArgNum) {
2628 // use one team and one thread
2629 // fix thread num
2630 int32_t TeamNum = 1;
2631 int32_t ThreadLimit = 0; // use default
2632 return __tgt_rtl_run_target_team_region(DeviceId, TgtEntryPtr, TgtArgs,
2633 TgtOffsets, ArgNum, TeamNum,
2634 ThreadLimit, 0);
2635}
2636
2637int32_t __tgt_rtl_run_target_team_region_async(
2638 int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
2639 int32_t ArgNum, int32_t NumTeams, int32_t ThreadLimit,
2640 uint64_t LoopTripcount, __tgt_async_info *AsyncInfo) {
2641 assert(AsyncInfo && "AsyncInfo is nullptr");
2642 initAsyncInfo(AsyncInfo);
2643
2644 DeviceInfo().LoadRunLock.lock_shared();
2645 int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
2646 ArgNum, NumTeams, ThreadLimit, LoopTripcount);
2647
2648 DeviceInfo().LoadRunLock.unlock_shared();
2649 return Res;
2650}
2651
2652int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr,
2653 void **TgtArgs, ptrdiff_t *TgtOffsets,
2654 int32_t ArgNum,
2655 __tgt_async_info *AsyncInfo) {
2656 // use one team and one thread
2657 // fix thread num
2658 int32_t TeamNum = 1;
2659 int32_t ThreadLimit = 0; // use default
2660 return __tgt_rtl_run_target_team_region_async(DeviceId, TgtEntryPtr, TgtArgs,
2661 TgtOffsets, ArgNum, TeamNum,
2662 ThreadLimit, 0, AsyncInfo);
2663}
2664
2665int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) {
2666 assert(AsyncInfo && "AsyncInfo is nullptr");
2667
2668 // The CUDA plugin asserts that AsyncInfo->Queue is non-null, but this
2669 // invariant is not ensured by devices.cpp for amdgcn.
2670 // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr");
2671 if (AsyncInfo->Queue) {
2672 finiAsyncInfo(AsyncInfo);
2673 }
2674 return OFFLOAD_SUCCESS(0);
2675}
2676
2677void __tgt_rtl_print_device_info(int32_t DeviceId) {
2678 // TODO: Assertion to see if DeviceId is correct
2679 // NOTE: We don't need to set context for print device info.
2680
2681 DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]);
2682}
2683
2684} // extern "C"