Bug Summary

File: /build/source/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
Warning: line 2376, column 7
Value stored to 'Err' is never read
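
The flagged statement (rtl.cpp:2376) lies outside the excerpt reproduced below. For context, the deadcode.DeadStores checker emits this message whenever a value is assigned to a variable that is never read again before being overwritten or going out of scope. A minimal sketch of the pattern follows, using hsa_executable_destroy purely as an illustrative stand-in for whatever call appears on the flagged line:

    // Hypothetical reduction of the diagnostic; not the code at rtl.cpp:2376.
    #include <hsa/hsa.h> // HSA runtime header (install path may differ)

    static void destroyExecutable(hsa_executable_t Exec) {
      // 'Err' receives the return status but nothing reads it afterwards,
      // so the analyzer reports "Value stored to 'Err' is never read".
      hsa_status_t Err = hsa_executable_destroy(Exec);
    }

The usual remedies are to check the status (and log or propagate a failure) or to drop the dead variable when the result is intentionally ignored.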

Annotated Source Code


clang -cc1 -cc1 -triple x86_64-pc-linux-gnu -analyze -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name rtl.cpp -analyzer-checker=core -analyzer-checker=apiModeling -analyzer-checker=unix -analyzer-checker=deadcode -analyzer-checker=cplusplus -analyzer-checker=security.insecureAPI.UncheckedReturn -analyzer-checker=security.insecureAPI.getpw -analyzer-checker=security.insecureAPI.gets -analyzer-checker=security.insecureAPI.mktemp -analyzer-checker=security.insecureAPI.mkstemp -analyzer-checker=security.insecureAPI.vfork -analyzer-checker=nullability.NullPassedToNonnull -analyzer-checker=nullability.NullReturnedFromNonnull -analyzer-output plist -w -setup-static-analyzer -analyzer-config-compatibility-mode=true -mrelocation-model pic -pic-level 2 -mframe-pointer=none -fmath-errno -ffp-contract=on -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -ffunction-sections -fdata-sections -fcoverage-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -resource-dir /usr/lib/llvm-16/lib/clang/16.0.0 -I projects/openmp/libomptarget/plugins/amdgpu -I /build/source/openmp/libomptarget/plugins/amdgpu -I include -I /build/source/llvm/include -I projects/openmp/runtime/src -I /build/source/openmp/libomptarget/plugins/amdgpu/dynamic_hsa -I /build/source/openmp/libomptarget/include -I /build/source/openmp/libomptarget/plugins/amdgpu/impl -I /build/source/openmp/libomptarget/plugins/common/elf_common -D OMPT_SUPPORT=1 -D TARGET_NAME=AMDGPU -D _DEBUG -D _GNU_SOURCE -D __STDC_CONSTANT_MACROS -D __STDC_FORMAT_MACROS -D __STDC_LIMIT_MACROS -D _FORTIFY_SOURCE=2 -D NDEBUG -U NDEBUG -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/x86_64-linux-gnu/c++/10 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../include/c++/10/backward -internal-isystem /usr/lib/llvm-16/lib/clang/16.0.0/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/10/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -fmacro-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fmacro-prefix-map=/build/source/= -fcoverage-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fcoverage-prefix-map=/build/source/= -source-date-epoch 1670066131 -O2 -Wno-unused-command-line-argument -Wno-unused-parameter -Wwrite-strings -Wno-missing-field-initializers -Wno-long-long -Wno-maybe-uninitialized -Wno-class-memaccess -Wno-redundant-move -Wno-pessimizing-move -Wno-noexcept-type -Wno-comment -Wno-misleading-indentation -Wno-enum-constexpr-conversion -Wno-extra -Wno-pedantic -Wno-maybe-uninitialized -std=c++17 -fdeprecated-macro -fdebug-compilation-dir=/build/source/build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/build-llvm/tools/clang/stage2-bins=build-llvm/tools/clang/stage2-bins -fdebug-prefix-map=/build/source/= -ferror-limit 19 -fvisibility=protected -fvisibility-inlines-hidden -stack-protector 2 -fgnuc-version=4.2.1 -fcolor-diagnostics -vectorize-loops -vectorize-slp -analyzer-output=html -analyzer-config stable-report-filename=true -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o /tmp/scan-build-2022-12-03-132955-15984-1 -x c++ 
/build/source/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
1//===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// RTL for AMD hsa machine
10//
11//===----------------------------------------------------------------------===//
12
13#include "llvm/ADT/StringMap.h"
14#include "llvm/ADT/StringRef.h"
15#include "llvm/Frontend/OpenMP/OMPConstants.h"
16#include "llvm/Frontend/OpenMP/OMPGridValues.h"
17#include "llvm/Object/ELF.h"
18#include "llvm/Object/ELFObjectFile.h"
19
20#include <algorithm>
21#include <assert.h>
22#include <cstdio>
23#include <cstdlib>
24#include <cstring>
25#include <functional>
26#include <list>
27#include <memory>
28#include <mutex>
29#include <shared_mutex>
30#include <unordered_map>
31#include <vector>
32
33#include "ELFSymbols.h"
34#include "impl_runtime.h"
35#include "interop_hsa.h"
36
37#include "internal.h"
38#include "rt.h"
39
40#include "DeviceEnvironment.h"
41#include "get_elf_mach_gfx_name.h"
42#include "omptargetplugin.h"
43#include "print_tracing.h"
44
45using namespace llvm;
46using namespace llvm::object;
47using namespace llvm::ELF;
48
49// hostrpc interface, FIXME: consider moving to its own include these are
50// statically linked into amdgpu/plugin if present from hostrpc_services.a,
51// linked as --whole-archive to override the weak symbols that are used to
52// implement a fallback for toolchains that do not yet have a hostrpc library.
53extern "C" {
54uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ,
55 uint32_t DeviceId);
56hsa_status_t hostrpc_init();
57hsa_status_t hostrpc_terminate();
58
59__attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; }
60__attribute__((weak)) hsa_status_t hostrpc_terminate() {
61 return HSA_STATUS_SUCCESS;
62}
63__attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *,
64 uint32_t DeviceId) {
65 DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "
66 "missing\n",
67 DeviceId);
68 return 0;
69}
70}
71
72// Heuristic parameters used for kernel launch
73// Number of teams per CU to allow scheduling flexibility
74static const unsigned DefaultTeamsPerCU = 4;
75
76int print_kernel_trace;
77
78#ifdef OMPTARGET_DEBUG
79#define check(msg, status) \
80 if (status != HSA_STATUS_SUCCESS) { \
81 DP(#msg " failed\n"); \
82 } else { \
83 DP(#msg " succeeded\n"); \
84 }
85#else
86#define check(msg, status) \
87 {}
88#endif
89
90#include "elf_common.h"
91
92namespace hsa {
93template <typename C> hsa_status_t iterate_agents(C Cb) {
94 auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t {
95 C *Unwrapped = static_cast<C *>(Data);
96 return (*Unwrapped)(Agent);
97 };
98 return hsa_iterate_agents(L, static_cast<void *>(&Cb));
99}
100
101template <typename C>
102hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) {
103 auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t {
104 C *Unwrapped = static_cast<C *>(Data);
105 return (*Unwrapped)(MemoryPool);
106 };
107
108 return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&Cb));
109}
110
111} // namespace hsa
112
113/// Keep entries table per device
114struct FuncOrGblEntryTy {
115 __tgt_target_table Table;
116 std::vector<__tgt_offload_entry> Entries;
117};
118
119struct KernelArgPool {
120private:
121 static pthread_mutex_t Mutex;
122
123public:
124 uint32_t KernargSegmentSize;
125 void *KernargRegion = nullptr;
126 std::queue<int> FreeKernargSegments;
127
128 uint32_t kernargSizeIncludingImplicit() {
129 return KernargSegmentSize + sizeof(impl_implicit_args_t);
130 }
131
132 ~KernelArgPool() {
133 if (KernargRegion) {
134 auto R = hsa_amd_memory_pool_free(KernargRegion);
135 if (R != HSA_STATUS_SUCCESS) {
136 DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R));
137 }
138 }
139 }
140
141 // Can't really copy or move a mutex
142 KernelArgPool() = default;
143 KernelArgPool(const KernelArgPool &) = delete;
144 KernelArgPool(KernelArgPool &&) = delete;
145
146 KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool)
147 : KernargSegmentSize(KernargSegmentSize) {
148
149 // impl uses one pool per kernel for all gpus, with a fixed upper size
150 // preserving that exact scheme here, including the queue<int>
151
152 hsa_status_t Err = hsa_amd_memory_pool_allocate(
153 MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS, 0,
154 &KernargRegion);
155
156 if (Err != HSA_STATUS_SUCCESS) {
157 DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err));
158 KernargRegion = nullptr; // paranoid
159 return;
160 }
161
162 Err = core::allow_access_to_all_gpu_agents(KernargRegion);
163 if (Err != HSA_STATUS_SUCCESS) {
164 DP("hsa allow_access_to_all_gpu_agents failed: %s\n",
165 get_error_string(Err));
166 auto R = hsa_amd_memory_pool_free(KernargRegion);
167 if (R != HSA_STATUS_SUCCESS) {
168 // if free failed, can't do anything more to resolve it
169 DP("hsa memory pool free failed: %s\n", get_error_string(Err));
170 }
171 KernargRegion = nullptr;
172 return;
173 }
174
175 for (int I = 0; I < MAX_NUM_KERNELS; I++) {
176 FreeKernargSegments.push(I);
177 }
178 }
179
180 void *allocate(uint64_t ArgNum) {
181 assert((ArgNum * sizeof(void *)) == KernargSegmentSize);
182 Lock L(&Mutex);
183 void *Res = nullptr;
184 if (!FreeKernargSegments.empty()) {
185
186 int FreeIdx = FreeKernargSegments.front();
187 Res = static_cast<void *>(static_cast<char *>(KernargRegion) +
188 (FreeIdx * kernargSizeIncludingImplicit()));
189 assert(FreeIdx == pointerToIndex(Res));
190 FreeKernargSegments.pop();
191 }
192 return Res;
193 }
194
195 void deallocate(void *Ptr) {
196 Lock L(&Mutex);
197 int Idx = pointerToIndex(Ptr);
198 FreeKernargSegments.push(Idx);
199 }
200
201private:
202 int pointerToIndex(void *Ptr) {
203 ptrdiff_t Bytes =
204 static_cast<char *>(Ptr) - static_cast<char *>(KernargRegion);
205 assert(Bytes >= 0);
206 assert(Bytes % kernargSizeIncludingImplicit() == 0);
207 return Bytes / kernargSizeIncludingImplicit();
208 }
209 struct Lock {
210 Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); }
211 ~Lock() { pthread_mutex_unlock(M); }
212 pthread_mutex_t *M;
213 };
214};
215pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER;
216
217std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>>
218 KernelArgPoolMap;
219
220/// Use a single entity to encode a kernel and a set of flags
221struct KernelTy {
222 llvm::omp::OMPTgtExecModeFlags ExecutionMode;
223 int16_t ConstWGSize;
224 int32_t DeviceId;
225 void *CallStackAddr = nullptr;
226 const char *Name;
227
228 KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize,
229 int32_t DeviceId, void *CallStackAddr, const char *Name,
230 uint32_t KernargSegmentSize,
231 hsa_amd_memory_pool_t &KernArgMemoryPool)
232 : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize),
233 DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) {
234 DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode);
235
236 std::string N(Name);
237 if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) {
238 KernelArgPoolMap.insert(
239 std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool(
240 KernargSegmentSize, KernArgMemoryPool))));
241 }
242 }
243};
244
245/// List that contains all the kernels.
246/// FIXME: we may need this to be per device and per library.
247std::list<KernelTy> KernelsList;
248
249template <typename Callback> static hsa_status_t findAgents(Callback CB) {
250
251 hsa_status_t Err =
252 hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t {
253 hsa_device_type_t DeviceType;
254 // get_info fails iff HSA runtime not yet initialized
255 hsa_status_t Err =
256 hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType);
257
258 if (Err != HSA_STATUS_SUCCESS) {
259 if (print_kernel_trace > 0)
260 DP("rtl.cpp: err %s\n", get_error_string(Err));
261
262 return Err;
263 }
264
265 CB(DeviceType, Agent);
266 return HSA_STATUS_SUCCESS;
267 });
268
269 // iterate_agents fails iff HSA runtime not yet initialized
270 if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) {
271 DP("rtl.cpp: err %s\n", get_error_string(Err));
272 }
273
274 return Err;
275}
276
277static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source,
278 void *Data) {
279 if (Status != HSA_STATUS_SUCCESS) {
280 const char *StatusString;
281 if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) {
282 StatusString = "unavailable";
283 }
284 DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source,
285 Status, StatusString);
286 abort();
287 }
288}
289
290namespace core {
291namespace {
292
293bool checkResult(hsa_status_t Err, const char *ErrMsg) {
294 if (Err == HSA_STATUS_SUCCESS)
295 return true;
296
297 REPORT("%s", ErrMsg);
298 REPORT("%s", get_error_string(Err));
299 return false;
300}
301
302void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) {
303 __atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE);
304}
305
306uint16_t createHeader() {
307 uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE;
308 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE;
309 Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE;
310 return Header;
311}
312
313hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) {
314 bool AllocAllowed = false;
315 hsa_status_t Err = hsa_amd_memory_pool_get_info(
316 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
317 &AllocAllowed);
318 if (Err != HSA_STATUS_SUCCESS) {
319 DP("Alloc allowed in memory pool check failed: %s\n",
320 get_error_string(Err));
321 return Err;
322 }
323
324 size_t Size = 0;
325 Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
326 &Size);
327 if (Err != HSA_STATUS_SUCCESS) {
328 DP("Get memory pool size failed: %s\n", get_error_string(Err));
329 return Err;
330 }
331
332 return (AllocAllowed && Size > 0) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR;
333}
334
335hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
336 std::vector<hsa_amd_memory_pool_t> *Result =
337 static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data);
338
339 hsa_status_t Err;
340 if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) {
341 return Err;
342 }
343
344 Result->push_back(MemoryPool);
345 return HSA_STATUS_SUCCESS;
346}
347
348} // namespace
349} // namespace core
350
351struct EnvironmentVariables {
352 int NumTeams;
353 int TeamLimit;
354 int TeamThreadLimit;
355 int MaxTeamsDefault;
356 int DynamicMemSize;
357};
358
359template <uint32_t wavesize>
360static constexpr const llvm::omp::GV &getGridValue() {
361 return llvm::omp::getAMDGPUGridValues<wavesize>();
362}
363
364struct HSALifetime {
365 // Wrapper around HSA used to ensure it is constructed before other types
366 // and destructed after, which means said other types can use raii for
367 // cleanup without risking running outside of the lifetime of HSA
368 const hsa_status_t S;
369
370 bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; }
371 HSALifetime() : S(hsa_init()) {}
372
373 ~HSALifetime() {
374 if (S == HSA_STATUS_SUCCESS) {
375 hsa_status_t Err = hsa_shut_down();
376 if (Err != HSA_STATUS_SUCCESS) {
377 // Can't call into HSA to get a string from the integer
378 DP("Shutting down HSA failed: %d\n", Err);
379 }
380 }
381 }
382};
383
384// Handle scheduling of multiple hsa_queue's per device to
385// multiple threads (one scheduler per device)
386class HSAQueueScheduler {
387public:
388 HSAQueueScheduler() : Current(0) {}
389
390 HSAQueueScheduler(const HSAQueueScheduler &) = delete;
391
392 HSAQueueScheduler(HSAQueueScheduler &&Q) {
393 Current = Q.Current.load();
394 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
395 HSAQueues[I] = Q.HSAQueues[I];
396 Q.HSAQueues[I] = nullptr;
397 }
398 }
399
400 // \return false if any HSA queue creation fails
401 bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) {
402 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
403 hsa_queue_t *Q = nullptr;
404 hsa_status_t Rc =
405 hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI,
406 callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
407 if (Rc != HSA_STATUS_SUCCESS) {
408 DP("Failed to create HSA queue %d\n", I);
409 return false;
410 }
411 HSAQueues[I] = Q;
412 }
413 return true;
414 }
415
416 ~HSAQueueScheduler() {
417 for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) {
418 if (HSAQueues[I]) {
419 hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]);
420 if (Err != HSA_STATUS_SUCCESS)
421 DP("Error destroying HSA queue");
422 }
423 }
424 }
425
426 // \return next queue to use for device
427 hsa_queue_t *next() {
428 return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) %
429 NUM_QUEUES_PER_DEVICE];
430 }
431
432private:
433 // Number of queues per device
434 enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 };
435 hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {};
436 std::atomic<uint8_t> Current;
437};
438
439/// Class containing all the device information
440class RTLDeviceInfoTy : HSALifetime {
441 std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries;
442
443 struct QueueDeleter {
444 void operator()(hsa_queue_t *Q) {
445 if (Q) {
446 hsa_status_t Err = hsa_queue_destroy(Q);
447 if (Err != HSA_STATUS_SUCCESS) {
448 DP("Error destroying hsa queue: %s\n", get_error_string(Err));
449 }
450 }
451 }
452 };
453
454public:
455 bool ConstructionSucceeded = false;
456
457 // load binary populates symbol tables and mutates various global state
458 // run uses those symbol tables
459 std::shared_timed_mutex LoadRunLock;
460
461 int NumberOfDevices = 0;
462
463 // GPU devices
464 std::vector<hsa_agent_t> HSAAgents;
465 std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu
466
467 // CPUs
468 std::vector<hsa_agent_t> CPUAgents;
469
470 // Device properties
471 std::vector<int> ComputeUnits;
472 std::vector<int> GroupsPerDevice;
473 std::vector<int> ThreadsPerGroup;
474 std::vector<int> WarpSize;
475 std::vector<std::string> GPUName;
476 std::vector<std::string> TargetID;
477
478 // OpenMP properties
479 std::vector<int> NumTeams;
480 std::vector<int> NumThreads;
481
482 // OpenMP Environment properties
483 EnvironmentVariables Env;
484
485 // OpenMP Requires Flags
486 int64_t RequiresFlags;
487
488 // Resource pools
489 SignalPoolT FreeSignalPool;
490
491 bool HostcallRequired = false;
492
493 std::vector<hsa_executable_t> HSAExecutables;
494
495 std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable;
496 std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable;
497
498 hsa_amd_memory_pool_t KernArgPool;
499
500 // fine grained memory pool for host allocations
501 hsa_amd_memory_pool_t HostFineGrainedMemoryPool;
502
503 // fine and coarse-grained memory pools per offloading device
504 std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools;
505 std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools;
506
507 struct ImplFreePtrDeletor {
508 void operator()(void *P) {
509 core::Runtime::Memfree(P); // ignore failure to free
510 }
511 };
512
513 // device_State shared across loaded binaries, error if inconsistent size
514 std::vector<std::pair<std::unique_ptr<void, ImplFreePtrDeletor>, uint64_t>>
515 DeviceStateStore;
516
517 static const unsigned HardTeamLimit =
518 (1 << 16) - 1; // 64K needed to fit in uint16
519 static const int DefaultNumTeams = 128;
520
521 // These need to be per-device since different devices can have different
522 // wave sizes, but are currently the same number for each so that refactor
523 // can be postponed.
524 static_assert(getGridValue<32>().GV_Max_Teams ==
525 getGridValue<64>().GV_Max_Teams,
526 "");
527 static const int MaxTeams = getGridValue<64>().GV_Max_Teams;
528
529 static_assert(getGridValue<32>().GV_Max_WG_Size ==
530 getGridValue<64>().GV_Max_WG_Size,
531 "");
532 static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size;
533
534 static_assert(getGridValue<32>().GV_Default_WG_Size ==
535 getGridValue<64>().GV_Default_WG_Size,
536 "");
537 static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size;
538
539 using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size,
540 hsa_agent_t, hsa_amd_memory_pool_t);
541 hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size,
542 MemcpyFunc Func, int32_t DeviceId) {
543 hsa_agent_t Agent = HSAAgents[DeviceId];
544 hsa_signal_t S = FreeSignalPool.pop();
545 if (S.handle == 0) {
546 return HSA_STATUS_ERROR;
547 }
548 hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool);
549 FreeSignalPool.push(S);
550 return R;
551 }
552
553 hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size,
554 int32_t DeviceId) {
555 return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId);
556 }
557
558 hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size,
559 int32_t DeviceId) {
560 return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId);
561 }
562
563 static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) {
564 char TmpChar[1000];
565 uint16_t Major, Minor;
566 uint32_t TmpUInt;
567 uint32_t TmpUInt2;
568 uint32_t CacheSize[4];
569 bool TmpBool;
570 uint16_t WorkgroupMaxDim[3];
571 hsa_dim3_t GridMaxDim;
572
573 // Getting basic information about HSA and Device
574 core::checkResult(
575 hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major),
576 "Error from hsa_system_get_info when obtaining "
577 "HSA_SYSTEM_INFO_VERSION_MAJOR\n");
578 core::checkResult(
579 hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor),
580 "Error from hsa_system_get_info when obtaining "
581 "HSA_SYSTEM_INFO_VERSION_MINOR\n");
582 printf(" HSA Runtime Version: \t\t%u.%u \n", Major, Minor);
583 printf(" HSA OpenMP Device Number: \t\t%d \n", DeviceId);
584 core::checkResult(
585 hsa_agent_get_info(
586 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar),
587 "Error returned from hsa_agent_get_info when obtaining "
588 "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n");
589 printf(" Product Name: \t\t\t%s \n", TmpChar);
590 core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar),
591 "Error returned from hsa_agent_get_info when obtaining "
592 "HSA_AGENT_INFO_NAME\n");
593 printf(" Device Name: \t\t\t%s \n", TmpChar);
594 core::checkResult(
595 hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar),
596 "Error returned from hsa_agent_get_info when obtaining "
597 "HSA_AGENT_INFO_NAME\n");
598 printf(" Vendor Name: \t\t\t%s \n", TmpChar);
599 hsa_device_type_t DevType;
600 core::checkResult(
601 hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType),
602 "Error returned from hsa_agent_get_info when obtaining "
603 "HSA_AGENT_INFO_DEVICE\n");
604 printf(" Device Type: \t\t\t%s \n",
605 DevType == HSA_DEVICE_TYPE_CPU
606 ? "CPU"
607 : (DevType == HSA_DEVICE_TYPE_GPU
608 ? "GPU"
609 : (DevType == HSA_DEVICE_TYPE_DSP ? "DSP" : "UNKNOWN")));
610 core::checkResult(
611 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt),
612 "Error returned from hsa_agent_get_info when obtaining "
613 "HSA_AGENT_INFO_QUEUES_MAX\n");
614 printf(" Max Queues: \t\t\t%u \n", TmpUInt);
615 core::checkResult(
616 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt),
617 "Error returned from hsa_agent_get_info when obtaining "
618 "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n");
619 printf(" Queue Min Size: \t\t\t%u \n", TmpUInt);
620 core::checkResult(
621 hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt),
622 "Error returned from hsa_agent_get_info when obtaining "
623 "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n");
624 printf(" Queue Max Size: \t\t\t%u \n", TmpUInt);
625
626 // Getting cache information
627 printf(" Cache:\n");
628
629 // FIXME: This is deprecated according to HSA documentation. But using
630 // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during
631 // runtime.
632 core::checkResult(
633 hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize),
634 "Error returned from hsa_agent_get_info when obtaining "
635 "HSA_AGENT_INFO_CACHE_SIZE\n");
636
637 for (int I = 0; I < 4; I++) {
638 if (CacheSize[I]) {
639 printf(" L%u: \t\t\t\t%u bytes\n", I, CacheSize[I]);
640 }
641 }
642
643 core::checkResult(
644 hsa_agent_get_info(Agent,
645 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE,
646 &TmpUInt),
647 "Error returned from hsa_agent_get_info when obtaining "
648 "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n");
649 printf(" Cacheline Size: \t\t\t%u \n", TmpUInt);
650 core::checkResult(
651 hsa_agent_get_info(
652 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY,
653 &TmpUInt),
654 "Error returned from hsa_agent_get_info when obtaining "
655 "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n");
656 printf(" Max Clock Freq(MHz): \t\t%u \n", TmpUInt);
657 core::checkResult(
658 hsa_agent_get_info(
659 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
660 &TmpUInt),
661 "Error returned from hsa_agent_get_info when obtaining "
662 "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n");
663 printf(" Compute Units: \t\t\t%u \n", TmpUInt);
664 core::checkResult(hsa_agent_get_info(
665 Agent,
666 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU,
667 &TmpUInt),
668 "Error returned from hsa_agent_get_info when obtaining "
669 "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n");
670 printf(" SIMD per CU: \t\t\t%u \n", TmpUInt);
671 core::checkResult(
672 hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool),
673 "Error returned from hsa_agent_get_info when obtaining "
674 "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n");
675 printf(" Fast F16 Operation: \t\t%s \n", (TmpBool ? "TRUE" : "FALSE"));
676 core::checkResult(
677 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2),
678 "Error returned from hsa_agent_get_info when obtaining "
679 "HSA_AGENT_INFO_WAVEFRONT_SIZE\n");
680 printf(" Wavefront Size: \t\t\t%u \n", TmpUInt2);
681 core::checkResult(
682 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt),
683 "Error returned from hsa_agent_get_info when obtaining "
684 "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n");
685 printf(" Workgroup Max Size: \t\t%u \n", TmpUInt);
686 core::checkResult(hsa_agent_get_info(Agent,
687 HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
688 WorkgroupMaxDim),
689 "Error returned from hsa_agent_get_info when obtaining "
690 "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n");
691 printf(" Workgroup Max Size per Dimension:\n");
692 printf(" x: \t\t\t\t%u\n", WorkgroupMaxDim[0]);
693 printf(" y: \t\t\t\t%u\n", WorkgroupMaxDim[1]);
694 printf(" z: \t\t\t\t%u\n", WorkgroupMaxDim[2]);
695 core::checkResult(hsa_agent_get_info(
696 Agent,
697 (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU,
698 &TmpUInt),
699 "Error returned from hsa_agent_get_info when obtaining "
700 "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n");
701 printf(" Max Waves Per CU: \t\t\t%u \n", TmpUInt);
702 printf(" Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2);
703 core::checkResult(
704 hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt),
705 "Error returned from hsa_agent_get_info when obtaining "
706 "HSA_AGENT_INFO_GRID_MAX_SIZE\n");
707 printf(" Grid Max Size: \t\t\t%u \n", TmpUInt);
708 core::checkResult(
709 hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim),
710 "Error returned from hsa_agent_get_info when obtaining "
711 "HSA_AGENT_INFO_GRID_MAX_DIM\n");
712 printf(" Grid Max Size per Dimension: \t\t\n");
713 printf(" x: \t\t\t\t%u\n", GridMaxDim.x);
714 printf(" y: \t\t\t\t%u\n", GridMaxDim.y);
715 printf(" z: \t\t\t\t%u\n", GridMaxDim.z);
716 core::checkResult(
717 hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt),
718 "Error returned from hsa_agent_get_info when obtaining "
719 "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n");
720 printf(" Max fbarriers/Workgrp: \t\t%u\n", TmpUInt);
721
722 printf(" Memory Pools:\n");
723 auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t {
724 std::string TmpStr;
725 size_t Size;
726 bool Alloc, Access;
727 hsa_amd_segment_t Segment;
728 hsa_amd_memory_pool_global_flag_t GlobalFlags;
729 core::checkResult(
730 hsa_amd_memory_pool_get_info(
731 Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags),
732 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
733 "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n");
734 core::checkResult(hsa_amd_memory_pool_get_info(
735 Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment),
736 "Error returned from hsa_amd_memory_pool_get_info when "
737 "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n");
738
739 switch (Segment) {
740 case HSA_AMD_SEGMENT_GLOBAL:
741 TmpStr = "GLOBAL; FLAGS: ";
742 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags)
743 TmpStr += "KERNARG, ";
744 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags)
745 TmpStr += "FINE GRAINED, ";
746 if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags)
747 TmpStr += "COARSE GRAINED, ";
748 break;
749 case HSA_AMD_SEGMENT_READONLY:
750 TmpStr = "READONLY";
751 break;
752 case HSA_AMD_SEGMENT_PRIVATE:
753 TmpStr = "PRIVATE";
754 break;
755 case HSA_AMD_SEGMENT_GROUP:
756 TmpStr = "GROUP";
757 break;
758 }
759 printf(" Pool %s: \n", TmpStr.c_str());
760
761 core::checkResult(hsa_amd_memory_pool_get_info(
762 Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size),
763 "Error returned from hsa_amd_memory_pool_get_info when "
764 "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n");
765 printf(" Size: \t\t\t\t %zu bytes\n", Size);
766 core::checkResult(
767 hsa_amd_memory_pool_get_info(
768 Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc),
769 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
770 "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n");
771 printf(" Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE"));
772 core::checkResult(
773 hsa_amd_memory_pool_get_info(
774 Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size),
775 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
776 "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n");
777 printf(" Runtime Alloc Granule: \t\t %zu bytes\n", Size);
778 core::checkResult(
779 hsa_amd_memory_pool_get_info(
780 Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size),
781 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
782 "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n");
783 printf(" Runtime Alloc alignment: \t %zu bytes\n", Size);
784 core::checkResult(
785 hsa_amd_memory_pool_get_info(
786 Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access),
787 "Error returned from hsa_amd_memory_pool_get_info when obtaining "
788 "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n");
789 printf(" Accessable by all: \t\t %s\n",
790 (Access ? "TRUE" : "FALSE"));
791
792 return HSA_STATUS_SUCCESS;
793 };
794 // Iterate over all the memory regions for this agent. Get the memory region
795 // type and size
796 hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr);
797
798 printf(" ISAs:\n");
799 auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t {
800 char TmpChar[1000];
801 core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar),
802 "Error returned from hsa_isa_get_info_alt when "
803 "obtaining HSA_ISA_INFO_NAME\n");
804 printf(" Name: \t\t\t\t %s\n", TmpChar);
805
806 return HSA_STATUS_SUCCESS;
807 };
808 // Iterate over all the memory regions for this agent. Get the memory region
809 // type and size
810 hsa_agent_iterate_isas(Agent, CBIsas, nullptr);
811 }
812
813 // Record entry point associated with device
814 void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) {
815 assert(DeviceId < (int32_t)FuncGblEntries.size() &&
816 "Unexpected device id!");
817 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
818
819 E.Entries.push_back(Entry);
820 }
821
822 // Return true if the entry is associated with device
823 bool findOffloadEntry(int32_t DeviceId, void *Addr) {
824 assert(DeviceId < (int32_t)FuncGblEntries.size() &&
825 "Unexpected device id!");
826 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
827
828 for (auto &It : E.Entries) {
829 if (It.addr == Addr)
830 return true;
831 }
832
833 return false;
834 }
835
836 // Return the pointer to the target entries table
837 __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) {
838 assert(DeviceId < (int32_t)FuncGblEntries.size() &&
839 "Unexpected device id!");
840 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
841
842 int32_t Size = E.Entries.size();
843
844 // Table is empty
845 if (!Size)
846 return 0;
847
848 __tgt_offload_entry *Begin = &E.Entries[0];
849 __tgt_offload_entry *End = &E.Entries[Size - 1];
850
851 // Update table info according to the entries and return the pointer
852 E.Table.EntriesBegin = Begin;
853 E.Table.EntriesEnd = ++End;
854
855 return &E.Table;
856 }
857
858 // Clear entries table for a device
859 void clearOffloadEntriesTable(int DeviceId) {
860 assert(DeviceId < (int32_t)FuncGblEntries.size() &&
861 "Unexpected device id!");
862 FuncGblEntries[DeviceId].emplace_back();
863 FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back();
864 // KernelArgPoolMap.clear();
865 E.Entries.clear();
866 E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
867 }
868
869 hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
870 unsigned int DeviceId) {
871 assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here.");
872 uint32_t GlobalFlags = 0;
873 hsa_status_t Err = hsa_amd_memory_pool_get_info(
874 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
875
876 if (Err != HSA_STATUS_SUCCESS) {
877 return Err;
878 }
879
880 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
881 DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool;
882 } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
883 DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
884 }
885
886 return HSA_STATUS_SUCCESS;
887 }
888
889 hsa_status_t setupDevicePools(const std::vector<hsa_agent_t> &Agents) {
890 for (unsigned int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
891 hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
892 Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
893 hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool);
894 if (ValidStatus != HSA_STATUS_SUCCESS) {
895 DP("Alloc allowed in memory pool check failed: %s\n",
896 get_error_string(ValidStatus));
897 return HSA_STATUS_SUCCESS;
898 }
899 return addDeviceMemoryPool(MemoryPool, DeviceId);
900 });
901
902 if (Err != HSA_STATUS_SUCCESS) {
903 DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
904 "Iterate all memory pools", get_error_string(Err));
905 return Err;
906 }
907 }
908 return HSA_STATUS_SUCCESS;
909 }
910
911 hsa_status_t setupHostMemoryPools(std::vector<hsa_agent_t> &Agents) {
912 std::vector<hsa_amd_memory_pool_t> HostPools;
913
914 // collect all the "valid" pools for all the given agents.
915 for (const auto &Agent : Agents) {
916 hsa_status_t Err = hsa_amd_agent_iterate_memory_pools(
917 Agent, core::addMemoryPool, static_cast<void *>(&HostPools));
918 if (Err != HSA_STATUS_SUCCESS) {
919 DP("addMemoryPool returned %s, continuing\n", get_error_string(Err));
920 }
921 }
922
923 // We need two fine-grained pools.
924 // 1. One with kernarg flag set for storing kernel arguments
925 // 2. Second for host allocations
926 bool FineGrainedMemoryPoolSet = false;
927 bool KernArgPoolSet = false;
928 for (const auto &MemoryPool : HostPools) {
929 hsa_status_t Err = HSA_STATUS_SUCCESS;
930 uint32_t GlobalFlags = 0;
931 Err = hsa_amd_memory_pool_get_info(
932 MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
933 if (Err != HSA_STATUS_SUCCESS) {
934 DP("Get memory pool info failed: %s\n", get_error_string(Err));
935 return Err;
936 }
937
938 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
939 if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
940 KernArgPool = MemoryPool;
941 KernArgPoolSet = true;
942 }
943 HostFineGrainedMemoryPool = MemoryPool;
944 FineGrainedMemoryPoolSet = true;
945 }
946 }
947
948 if (FineGrainedMemoryPoolSet && KernArgPoolSet)
949 return HSA_STATUS_SUCCESS;
950
951 return HSA_STATUS_ERROR;
952 }
953
954 hsa_amd_memory_pool_t getDeviceMemoryPool(unsigned int DeviceId) {
955 assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() &&
956 "Invalid device Id");
957 return DeviceCoarseGrainedMemoryPools[DeviceId];
958 }
959
960 hsa_amd_memory_pool_t getHostMemoryPool() {
961 return HostFineGrainedMemoryPool;
962 }
963
964 static int readEnv(const char *Env, int Default = -1) {
965 const char *EnvStr = getenv(Env);
966 int Res = Default;
967 if (EnvStr) {
968 Res = std::stoi(EnvStr);
969 DP("Parsed %s=%d\n", Env, Res);
970 }
971 return Res;
972 }
973
974 RTLDeviceInfoTy() {
975 DP("Start initializing " GETNAME(TARGET_NAME) "\n");
976
977 // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr
978 // anytime. You do not need a debug library build.
979 // 0 => no tracing
980 // 1 => tracing dispatch only
981 // >1 => verbosity increase
982
983 if (!HSAInitSuccess()) {
984 DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n");
985 return;
986 }
987
988 if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE"))
989 print_kernel_trace = atoi(EnvStr);
990 else
991 print_kernel_trace = 0;
992
993 hsa_status_t Err = core::atl_init_gpu_context();
994 if (Err != HSA_STATUS_SUCCESS) {
995 DP("Error when initializing " GETNAME(TARGET_NAME) "\n");
996 return;
997 }
998
999 // Init hostcall soon after initializing hsa
1000 hostrpc_init();
1001
1002 Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) {
1003 if (DeviceType == HSA_DEVICE_TYPE_CPU) {
1004 CPUAgents.push_back(Agent);
1005 } else {
1006 HSAAgents.push_back(Agent);
1007 }
1008 });
1009 if (Err != HSA_STATUS_SUCCESS)
1010 return;
1011
1012 NumberOfDevices = (int)HSAAgents.size();
1013
1014 if (NumberOfDevices == 0) {
1015 DP("There are no devices supporting HSA.\n");
1016 return;
1017 }
1018 DP("There are %d devices supporting HSA.\n", NumberOfDevices);
1019
1020 // Init the device info
1021 HSAQueueSchedulers.reserve(NumberOfDevices);
1022 FuncGblEntries.resize(NumberOfDevices);
1023 ThreadsPerGroup.resize(NumberOfDevices);
1024 ComputeUnits.resize(NumberOfDevices);
1025 GPUName.resize(NumberOfDevices);
1026 GroupsPerDevice.resize(NumberOfDevices);
1027 WarpSize.resize(NumberOfDevices);
1028 NumTeams.resize(NumberOfDevices);
1029 NumThreads.resize(NumberOfDevices);
1030 DeviceStateStore.resize(NumberOfDevices);
1031 KernelInfoTable.resize(NumberOfDevices);
1032 SymbolInfoTable.resize(NumberOfDevices);
1033 DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
1034 DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
1035
1036 Err = setupDevicePools(HSAAgents);
1037 if (Err != HSA_STATUS_SUCCESS) {
1038 DP("Setup for Device Memory Pools failed\n");
1039 return;
1040 }
1041
1042 Err = setupHostMemoryPools(CPUAgents);
1043 if (Err != HSA_STATUS_SUCCESS) {
1044 DP("Setup for Host Memory Pools failed\n");
1045 return;
1046 }
1047
1048 for (int I = 0; I < NumberOfDevices; I++) {
1049 uint32_t QueueSize = 0;
1050 {
1051 hsa_status_t Err = hsa_agent_get_info(
1052 HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize);
1053 if (Err != HSA_STATUS_SUCCESS) {
1054 DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I);
1055 return;
1056 }
1057 enum { MaxQueueSize = 4096 };
1058 if (QueueSize > MaxQueueSize) {
1059 QueueSize = MaxQueueSize;
1060 }
1061 }
1062
1063 {
1064 HSAQueueScheduler QSched;
1065 if (!QSched.createQueues(HSAAgents[I], QueueSize))
1066 return;
1067 HSAQueueSchedulers.emplace_back(std::move(QSched));
1068 }
1069
1070 DeviceStateStore[I] = {nullptr, 0};
1071 }
1072
1073 for (int I = 0; I < NumberOfDevices; I++) {
1074 ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize;
1075 GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams;
1076 ComputeUnits[I] = 1;
1077 DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I,
1078 GroupsPerDevice[I], ThreadsPerGroup[I]);
1079 }
1080
1081 // Get environment variables regarding teams
1082 Env.TeamLimit = readEnv("OMP_TEAM_LIMIT");
1083 Env.NumTeams = readEnv("OMP_NUM_TEAMS");
1084 Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT");
1085 Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT");
1086 Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0);
1087
1088 // Default state.
1089 RequiresFlags = OMP_REQ_UNDEFINED;
1090
1091 ConstructionSucceeded = true;
1092 }
1093
1094 ~RTLDeviceInfoTy() {
1095 DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n");
1096 if (!HSAInitSuccess()) {
1097 // Then none of these can have been set up and they can't be torn down
1098 return;
1099 }
1100 // Run destructors on types that use HSA before
1101 // impl_finalize removes access to it
1102 DeviceStateStore.clear();
1103 KernelArgPoolMap.clear();
1104 // Terminate hostrpc before finalizing hsa
1105 hostrpc_terminate();
1106
1107 hsa_status_t Err;
1108 for (uint32_t I = 0; I < HSAExecutables.size(); I++) {
1109 Err = hsa_executable_destroy(HSAExecutables[I]);
1110 if (Err != HSA_STATUS_SUCCESS) {
1111 DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
1112 "Destroying executable", get_error_string(Err));
1113 }
1114 }
1115 }
1116};
1117
1118pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
1119
1120// Putting accesses to DeviceInfo global behind a function call prior
1121// to changing to use init_plugin/deinit_plugin calls
1122static RTLDeviceInfoTy DeviceInfoState;
1123static RTLDeviceInfoTy &DeviceInfo() { return DeviceInfoState; }
1124
1125namespace {
1126
1127int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size,
1128 __tgt_async_info *AsyncInfo) {
1129 assert(AsyncInfo && "AsyncInfo is nullptr");
1130 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
1131 // Return success if we are not copying back to host from target.
1132 if (!HstPtr)
1133 return OFFLOAD_SUCCESS;
1134 hsa_status_t Err;
1135 DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
1136 (long long unsigned)(Elf64_Addr)TgtPtr,
1137 (long long unsigned)(Elf64_Addr)HstPtr);
1138
1139 Err = DeviceInfo().freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size,
1140 DeviceId);
1141
1142 if (Err != HSA_STATUS_SUCCESS) {
1143 DP("Error when copying data from device to host. Pointers: "
1144 "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
1145 (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
1146 return OFFLOAD_FAIL;
1147 }
1148 DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,
1149 (long long unsigned)(Elf64_Addr)TgtPtr,
1150 (long long unsigned)(Elf64_Addr)HstPtr);
1151 return OFFLOAD_SUCCESS;
1152}
1153
1154int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size,
1155 __tgt_async_info *AsyncInfo) {
1156 assert(AsyncInfo && "AsyncInfo is nullptr");
1157 hsa_status_t Err;
1158 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
1159 // Return success if we are not doing host to target.
1160 if (!HstPtr)
1161 return OFFLOAD_SUCCESS;
1162
1163 DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,
1164 (long long unsigned)(Elf64_Addr)HstPtr,
1165 (long long unsigned)(Elf64_Addr)TgtPtr);
1166 Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size,
1167 DeviceId);
1168 if (Err != HSA_STATUS_SUCCESS) {
1169 DP("Error when copying data from host to device. Pointers: "
1170 "host = 0x%016lx, device = 0x%016lx, size = %lld\n",
1171 (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size);
1172 return OFFLOAD_FAIL;
1173 }
1174 return OFFLOAD_SUCCESS;
1175}
1176
1177// Async.
1178// The implementation was written with cuda streams in mind. The semantics of
1179// that are to execute kernels on a queue in order of insertion. A synchronise
1180// call then makes writes visible between host and device. This means a series
1181// of N data_submit_async calls are expected to execute serially. HSA offers
1182// various options to run the data copies concurrently. This may require changes
1183// to libomptarget.
1184
1185// __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that
1186// there are no outstanding kernels that need to be synchronized. Any async call
1187// may be passed a Queue==0, at which point the cuda implementation will set it
1188// to non-null (see getStream). The cuda streams are per-device. Upstream may
1189// change this interface to explicitly initialize the AsyncInfo_pointer, but
1190// until then hsa lazily initializes it as well.
1191
1192void initAsyncInfo(__tgt_async_info *AsyncInfo) {
1193 // set non-null while using async calls, return to null to indicate completion
1194 assert(AsyncInfo);
1195 if (!AsyncInfo->Queue) {
1196 AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX);
1197 }
1198}
1199void finiAsyncInfo(__tgt_async_info *AsyncInfo) {
1200 assert(AsyncInfo);
1201 assert(AsyncInfo->Queue);
1202 AsyncInfo->Queue = 0;
1203}
1204
1205// Determine launch values for kernel.
1206struct LaunchVals {
1207 int WorkgroupSize;
1208 int GridSize;
1209};
1210LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env,
1211 int ConstWGSize,
1212 llvm::omp::OMPTgtExecModeFlags ExecutionMode,
1213 int NumTeams, int ThreadLimit, uint64_t LoopTripcount,
1214 int DeviceNumTeams) {
1215
1216 int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize;
1217 int NumGroups = 0;
1218
1219 int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams;
1220 if (MaxTeams > static_cast<int>(RTLDeviceInfoTy::HardTeamLimit))
1221 MaxTeams = RTLDeviceInfoTy::HardTeamLimit;
1222
1223 if (print_kernel_trace & STARTUP_DETAILS) {
1224 DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams);
1225 DP("Max_Teams: %d\n", MaxTeams);
1226 DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize);
1227 DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize);
1228 DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",
1229 RTLDeviceInfoTy::DefaultWgSize);
1230 DP("thread_limit: %d\n", ThreadLimit);
1231 DP("threadsPerGroup: %d\n", ThreadsPerGroup);
1232 DP("ConstWGSize: %d\n", ConstWGSize);
1233 }
1234 // check for thread_limit() clause
1235 if (ThreadLimit > 0) {
1236 ThreadsPerGroup = ThreadLimit;
1237 DP("Setting threads per block to requested %d\n", ThreadLimit);
1238 // Add master warp for GENERIC
1239 if (ExecutionMode ==
1240 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
1241 ThreadsPerGroup += WarpSize;
1242 DP("Adding master wavefront: +%d threads\n", WarpSize);
1243 }
1244 if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max
1245 ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize;
1246 DP("Setting threads per block to maximum %d\n", ThreadsPerGroup);
1247 }
1248 }
1249 // check flat_max_work_group_size attr here
1250 if (ThreadsPerGroup > ConstWGSize) {
1251 ThreadsPerGroup = ConstWGSize;
1252 DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",
1253 ThreadsPerGroup);
1254 }
1255 if (print_kernel_trace & STARTUP_DETAILS)
1256 DP("threadsPerGroup: %d\n", ThreadsPerGroup);
1257 DP("Preparing %d threads\n", ThreadsPerGroup);
1258
1259 // Set default num_groups (teams)
1260 if (Env.TeamLimit > 0)
1261 NumGroups = (MaxTeams < Env.TeamLimit) ? MaxTeams : Env.TeamLimit;
1262 else
1263 NumGroups = MaxTeams;
1264 DP("Set default num of groups %d\n", NumGroups);
1265
1266 if (print_kernel_trace & STARTUP_DETAILS) {
1267 DP("num_groups: %d\n", NumGroups);
1268 DP("num_teams: %d\n", NumTeams);
1269 }
1270
1271 // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size
1272 // This reduction is typical for default case (no thread_limit clause).
1273 // or when user goes crazy with num_teams clause.
1274 // FIXME: We cant distinguish between a constant or variable thread limit.
1275 // So we only handle constant thread_limits.
1276 if (ThreadsPerGroup >
1277 RTLDeviceInfoTy::DefaultWgSize) // 256 < threadsPerGroup <= 1024
1278 // Should we round threadsPerGroup up to nearest WarpSize
1279 // here?
1280 NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup;
1281
1282 // check for num_teams() clause
1283 if (NumTeams > 0) {
1284 NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups;
1285 }
1286 if (print_kernel_trace & STARTUP_DETAILS) {
1287 DP("num_groups: %d\n", NumGroups);
1288 DP("Env.NumTeams %d\n", Env.NumTeams);
1289 DP("Env.TeamLimit %d\n", Env.TeamLimit);
1290 }
1291
1292 if (Env.NumTeams > 0) {
1293 NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups;
1294 DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams);
1295 } else if (Env.TeamLimit > 0) {
1296 NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups;
1297 DP("Modifying teams based on Env.TeamLimit %d\n", Env.TeamLimit);
1298 } else {
1299 if (NumTeams <= 0) {
1300 if (LoopTripcount > 0) {
1301 if (ExecutionMode ==
1302 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) {
1303 // round up to the nearest integer
1304 NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1;
1305 } else if (ExecutionMode ==
1306 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) {
1307 NumGroups = LoopTripcount;
1308 } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ {
1309 // This is a generic kernel that was transformed to use SPMD-mode
1310 // execution but uses Generic-mode semantics for scheduling.
1311 NumGroups = LoopTripcount;
1312 }
1313 DP("Using %d teams due to loop trip count %" PRIu64 " and number of "
1314 "threads per block %d\n",
1315 NumGroups, LoopTripcount, ThreadsPerGroup);
1316 }
1317 } else {
1318 NumGroups = NumTeams;
1319 }
1320 if (NumGroups > MaxTeams) {
1321 NumGroups = MaxTeams;
1322 if (print_kernel_trace & STARTUP_DETAILS)
1323 DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams);
1324 }
1325 if (NumGroups > NumTeams && NumTeams > 0) {
1326 NumGroups = NumTeams;
1327 if (print_kernel_trace & STARTUP_DETAILS)
1328 DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups,
1329 NumTeams);
1330 }
1331 }
1332
1333 // num_teams clause always honored, no matter what, unless DEFAULT is active.
1334 if (NumTeams > 0) {
1335 NumGroups = NumTeams;
1336 // Cap num_groups to EnvMaxTeamsDefault if set.
1337 if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault)
1338 NumGroups = Env.MaxTeamsDefault;
1339 }
1340 if (print_kernel_trace & STARTUP_DETAILS) {
1341 DP("threadsPerGroup: %d\n", ThreadsPerGroup){};
1342 DP("num_groups: %d\n", NumGroups){};
1343 DP("loop_tripcount: %ld\n", LoopTripcount){};
1344 }
1345 DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups,{}
1346 ThreadsPerGroup){};
1347
1348 LaunchVals Res;
1349 Res.WorkgroupSize = ThreadsPerGroup;
1350 Res.GridSize = ThreadsPerGroup * NumGroups;
1351 return Res;
1352}
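Illustrative sketch (not part of rtl.cpp): assuming no num_teams/thread_limit clause and no environment overrides, the SPMD branch above reduces to rounding the loop trip count up to whole workgroups and capping at MaxTeams. A minimal, self-contained mirror of that arithmetic:

#include <cstdint>
#include <cstdio>

// Hypothetical helper mirroring the SPMD branch of getLaunchVals() above.
static int spmdNumGroups(uint64_t LoopTripcount, int ThreadsPerGroup,
                         int MaxTeams) {
  // Round the trip count up to whole workgroups, then cap at MaxTeams.
  int NumGroups = (int)((LoopTripcount - 1) / ThreadsPerGroup) + 1;
  return NumGroups > MaxTeams ? MaxTeams : NumGroups;
}

int main() {
  // e.g. 10000 iterations at 256 threads per group -> 40 groups, and the
  // dispatched GridSize would be 40 * 256 = 10240 work-items.
  int NumGroups = spmdNumGroups(10000, 256, 1024);
  printf("num_groups=%d grid_size=%d\n", NumGroups, NumGroups * 256);
  return 0;
}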
1353
1354static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) {
1355 uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1);
1356 bool Full = true;
1357 while (Full) {
1358 Full =
1359 PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue));
1360 }
1361 return PacketId;
1362}
1363
1364int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
1365 ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams,
1366 int32_t ThreadLimit, uint64_t LoopTripcount) {
1367 // Set the context we are using
1368 // update thread limit content in gpu memory if un-initialized or specified
1369 // from host
1370
1371 DP("Run target team region thread_limit %d\n", ThreadLimit){};
1372
1373 // All args are references.
1374 std::vector<void *> Args(ArgNum);
1375 std::vector<void *> Ptrs(ArgNum);
1376
1377 DP("Arg_num: %d\n", ArgNum){};
1378 for (int32_t I = 0; I < ArgNum; ++I) {
1379 Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
1380 Args[I] = &Ptrs[I];
1381 DP("Offseted base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I])){};
1382 }
1383
1384 KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr;
1385
1386 std::string KernelName = std::string(KernelInfo->Name);
1387 auto &KernelInfoTable = DeviceInfo().KernelInfoTable;
1388 if (KernelInfoTable[DeviceId].find(KernelName) ==
1389 KernelInfoTable[DeviceId].end()) {
1390 DP("Kernel %s not found\n", KernelName.c_str()){};
1391 return OFFLOAD_FAIL(~0);
1392 }
1393
1394 const atl_kernel_info_t KernelInfoEntry =
1395 KernelInfoTable[DeviceId][KernelName];
1396 const uint32_t GroupSegmentSize =
1397 KernelInfoEntry.group_segment_size + DeviceInfo().Env.DynamicMemSize;
1398 const uint32_t SgprCount = KernelInfoEntry.sgpr_count;
1399 const uint32_t VgprCount = KernelInfoEntry.vgpr_count;
1400 const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count;
1401 const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count;
1402
1403 assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count);
1404
1405 /*
1406 * Set limit based on ThreadsPerGroup and GroupsPerDevice
1407 */
1408 LaunchVals LV =
1409 getLaunchVals(DeviceInfo().WarpSize[DeviceId], DeviceInfo().Env,
1410 KernelInfo->ConstWGSize, KernelInfo->ExecutionMode,
1411 NumTeams, // From run_region arg
1412 ThreadLimit, // From run_region arg
1413 LoopTripcount, // From run_region arg
1414 DeviceInfo().NumTeams[KernelInfo->DeviceId]);
1415 const int GridSize = LV.GridSize;
1416 const int WorkgroupSize = LV.WorkgroupSize;
1417
1418 if (print_kernel_trace >= LAUNCH) {
1419 int NumGroups = GridSize / WorkgroupSize;
1420 // enum modes are SPMD, GENERIC, NONE 0,1,2
1421 // if doing rtl timing, print to stderr, unless stdout requested.
1422 bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING);
1423 fprintf(TraceToStdout ? stdout : stderr,
1424 "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) "
1425 "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u "
1426 "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n",
1427 DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize,
1428 ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit,
1429 GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount,
1430 VgprSpillCount, LoopTripcount, KernelInfo->Name);
1431 }
1432
1433 // Run on the device.
1434 {
1435 hsa_queue_t *Queue = DeviceInfo().HSAQueueSchedulers[DeviceId].next();
1436 if (!Queue) {
1437 return OFFLOAD_FAIL(~0);
1438 }
1439 uint64_t PacketId = acquireAvailablePacketId(Queue);
1440
1441 const uint32_t Mask = Queue->size - 1; // size is a power of 2
1442 hsa_kernel_dispatch_packet_t *Packet =
1443 (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask);
1444
1445 // packet->header is written last
1446 Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
1447 Packet->workgroup_size_x = WorkgroupSize;
1448 Packet->workgroup_size_y = 1;
1449 Packet->workgroup_size_z = 1;
1450 Packet->reserved0 = 0;
1451 Packet->grid_size_x = GridSize;
1452 Packet->grid_size_y = 1;
1453 Packet->grid_size_z = 1;
1454 Packet->private_segment_size = KernelInfoEntry.private_segment_size;
1455 Packet->group_segment_size = GroupSegmentSize;
1456 Packet->kernel_object = KernelInfoEntry.kernel_object;
1457 Packet->kernarg_address = 0; // use the block allocator
1458 Packet->reserved2 = 0; // impl writes id_ here
1459 Packet->completion_signal = {0}; // may want a pool of signals
1460
1461 KernelArgPool *ArgPool = nullptr;
1462 void *KernArg = nullptr;
1463 {
1464 auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name));
1465 if (It != KernelArgPoolMap.end()) {
1466 ArgPool = (It->second).get();
1467 }
1468 }
1469 if (!ArgPool) {
1470 DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name,{}
1471 DeviceId){};
1472 }
1473 {
1474 if (ArgPool) {
1475 assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *)));
1476 KernArg = ArgPool->allocate(ArgNum);
1477 }
1478 if (!KernArg) {
1479 DP("Allocate kernarg failed\n"){};
1480 return OFFLOAD_FAIL(~0);
1481 }
1482
1483 // Copy explicit arguments
1484 for (int I = 0; I < ArgNum; I++) {
1485 memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *));
1486 }
1487
1488 // Initialize implicit arguments. TODO: Which of these can be dropped
1489 impl_implicit_args_t *ImplArgs = reinterpret_cast<impl_implicit_args_t *>(
1490 static_cast<char *>(KernArg) + ArgPool->KernargSegmentSize);
1491 memset(ImplArgs, 0,
1492 sizeof(impl_implicit_args_t)); // may not be necessary
1493 ImplArgs->offset_x = 0;
1494 ImplArgs->offset_y = 0;
1495 ImplArgs->offset_z = 0;
1496
1497 // assign a hostcall buffer for the selected Q
1498 if (__atomic_load_n(&DeviceInfo().HostcallRequired, __ATOMIC_ACQUIRE)) {
1499 // hostrpc_assign_buffer is not thread safe, and this function is
1500 // under a multiple reader lock, not a writer lock.
1501 static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER;
1502 pthread_mutex_lock(&HostcallInitLock);
1503 uint64_t Buffer = hostrpc_assign_buffer(
1504 DeviceInfo().HSAAgents[DeviceId], Queue, DeviceId);
1505 pthread_mutex_unlock(&HostcallInitLock);
1506 if (!Buffer) {
1507 DP("hostrpc_assign_buffer failed, gpu would dereference null and "{}
1508 "error\n"){};
1509 return OFFLOAD_FAIL(~0);
1510 }
1511
1512 DP("Implicit argument count: %d\n",{}
1513 KernelInfoEntry.implicit_argument_count){};
1514 if (KernelInfoEntry.implicit_argument_count >= 4) {
1515 // Initialise pointer for implicit_argument_count != 0 ABI
1516 // Guess that the right implicit argument is at offset 24 after
1517 // the explicit arguments. In the future, should be able to read
1518 // the offset from msgpack. Clang is not annotating it at present.
1519 uint64_t Offset =
1520 sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3);
1521 if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) {
1522 DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit "{}
1523 "args: %d\n",{}
1524 Offset + 8, ArgPool->kernargSizeIncludingImplicit()){};
1525 } else {
1526 memcpy(static_cast<char *>(KernArg) + Offset, &Buffer, 8);
1527 }
1528 }
1529
1530 // initialise pointer for implicit_argument_count == 0 ABI
1531 ImplArgs->hostcall_ptr = Buffer;
1532 }
1533
1534 Packet->kernarg_address = KernArg;
1535 }
1536
1537 hsa_signal_t S = DeviceInfo().FreeSignalPool.pop();
1538 if (S.handle == 0) {
1539 DP("Failed to get signal instance\n"){};
1540 return OFFLOAD_FAIL(~0);
1541 }
1542 Packet->completion_signal = S;
1543 hsa_signal_store_relaxed(Packet->completion_signal, 1);
1544
1545 // Publish the packet indicating it is ready to be processed
1546 core::packetStoreRelease(reinterpret_cast<uint32_t *>(Packet),
1547 core::createHeader(), Packet->setup);
1548
1549 // Since the packet is already published, its contents must not be
1550 // accessed any more
1551 hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId);
1552
1553 while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX,
1554 HSA_WAIT_STATE_BLOCKED) != 0)
1555 ;
1556
1557 assert(ArgPool);
1558 ArgPool->deallocate(KernArg);
1559 DeviceInfo().FreeSignalPool.push(S);
1560 }
1561
1562 DP("Kernel completed\n"){};
1563 return OFFLOAD_SUCCESS(0);
1564}
1565
1566bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) {
1567 const uint16_t AmdgcnMachineID = EM_AMDGPU;
1568 const int32_t R = elf_check_machine(Image, AmdgcnMachineID);
1569 if (!R) {
1570 DP("Supported machine ID not found\n"){};
1571 }
1572 return R;
1573}
1574
1575uint32_t elfEFlags(__tgt_device_image *Image) {
1576 const char *ImgBegin = (char *)Image->ImageStart;
1577 size_t ImgSize = (char *)Image->ImageEnd - ImgBegin;
1578
1579 StringRef Buffer = StringRef(ImgBegin, ImgSize);
1580 auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
1581 /*InitContent=*/false);
1582 if (!ElfOrErr) {
1583 consumeError(ElfOrErr.takeError());
1584 return 0;
1585 }
1586
1587 if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get()))
1588 return ELFObj->getPlatformFlags();
1589 return 0;
1590}
1591
1592template <typename T> bool enforceUpperBound(T *Value, T Upper) {
1593 bool Changed = *Value > Upper;
1594 if (Changed) {
1595 *Value = Upper;
1596 }
1597 return Changed;
1598}
1599
1600struct SymbolInfo {
1601 const void *Addr = nullptr;
1602 uint32_t Size = UINT32_MAX(4294967295U);
1603 uint32_t ShType = SHT_NULL;
1604};
1605
1606int getSymbolInfoWithoutLoading(const ELFObjectFile<ELF64LE> &ELFObj,
1607 StringRef SymName, SymbolInfo *Res) {
1608 auto SymOrErr = getELFSymbol(ELFObj, SymName);
1609 if (!SymOrErr) {
1610 std::string ErrorString = toString(SymOrErr.takeError());
1611 DP("Failed ELF lookup: %s\n", ErrorString.c_str()){};
1612 return 1;
1613 }
1614 if (!*SymOrErr)
1615 return 1;
1616
1617 auto SymSecOrErr = ELFObj.getELFFile().getSection((*SymOrErr)->st_shndx);
1618 if (!SymSecOrErr) {
1619 std::string ErrorString = toString(SymOrErr.takeError());
1620 DP("Failed ELF lookup: %s\n", ErrorString.c_str()){};
1621 return 1;
1622 }
1623
1624 Res->Addr = (*SymOrErr)->st_value + ELFObj.getELFFile().base();
1625 Res->Size = static_cast<uint32_t>((*SymOrErr)->st_size);
1626 Res->ShType = static_cast<uint32_t>((*SymSecOrErr)->sh_type);
1627 return 0;
1628}
1629
1630int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *SymName,
1631 SymbolInfo *Res) {
1632 StringRef Buffer = StringRef(Base, ImgSize);
1633 auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""),
1634 /*InitContent=*/false);
1635 if (!ElfOrErr) {
1636 REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str())do { fprintf(stderr, "AMDGPU" " error: "); fprintf(stderr, "Failed to load ELF: %s\n"
, toString(ElfOrErr.takeError()).c_str()); } while (0);
;
1637 return 1;
1638 }
1639
1640 if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get()))
1641 return getSymbolInfoWithoutLoading(*ELFObj, SymName, Res);
1642 return 1;
1643}
1644
1645hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize,
1646 const char *SymName, const void **VarAddr,
1647 uint32_t *VarSize) {
1648 SymbolInfo SI;
1649 int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI);
1650 if (Rc == 0) {
1651 *VarAddr = SI.Addr;
1652 *VarSize = SI.Size;
1653 return HSA_STATUS_SUCCESS;
1654 }
1655 return HSA_STATUS_ERROR;
1656}
1657
1658template <typename C>
1659hsa_status_t moduleRegisterFromMemoryToPlace(
1660 std::map<std::string, atl_kernel_info_t> &KernelInfoTable,
1661 std::map<std::string, atl_symbol_info_t> &SymbolInfoTable,
1662 void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb,
1663 std::vector<hsa_executable_t> &HSAExecutables) {
1664 auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t {
1665 C *Unwrapped = static_cast<C *>(CbState);
1666 return (*Unwrapped)(Data, Size);
1667 };
1668 return core::RegisterModuleFromMemory(
1669 KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize,
1670 DeviceInfo().HSAAgents[DeviceId], L, static_cast<void *>(&Cb),
1671 HSAExecutables);
1672}
1673
1674uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) {
1675 uint64_t DeviceStateBytes = 0;
1676 {
1677 // If this is the deviceRTL, get the state variable size
1678 SymbolInfo SizeSi;
1679 int Rc = getSymbolInfoWithoutLoading(
1680 ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi);
1681
1682 if (Rc == 0) {
1683 if (SizeSi.Size != sizeof(uint64_t)) {
1684 DP("Found device_State_size variable with wrong size\n"){};
1685 return 0;
1686 }
1687
1688 // Read number of bytes directly from the elf
1689 memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t));
1690 }
1691 }
1692 return DeviceStateBytes;
1693}
1694
1695struct DeviceEnvironment {
1696 // Initialises a DeviceEnvironmentTy in the deviceRTL.
1697 // Patches around differences in the deviceRTL between trunk, aomp, and
1698 // rocmcc. Over time these differences will tend to zero and this class
1699 // can be simplified.
1700 // The symbol may be in .data or .bss, and may be missing fields. TODO:
1701 // review aomp/trunk/rocm and simplify the following.
1702
1703 // The symbol may also have been deadstripped because the device side
1704 // accessors were unused.
1705
1706 // If the symbol is in .data (aomp, rocm) it can be written directly.
1707 // If it is in .bss, we must wait for it to be allocated space on the
1708 // gpu (trunk) and initialize it after loading.
1709 const char *sym() { return "omptarget_device_environment"; }
1710
1711 DeviceEnvironmentTy HostDeviceEnv;
1712 SymbolInfo SI;
1713 bool Valid = false;
1714
1715 __tgt_device_image *Image;
1716 const size_t ImgSize;
1717
1718 DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize,
1719 __tgt_device_image *Image, const size_t ImgSize)
1720 : Image(Image), ImgSize(ImgSize) {
1721
1722 HostDeviceEnv.NumDevices = NumberDevices;
1723 HostDeviceEnv.DeviceNum = DeviceId;
1724 HostDeviceEnv.DebugKind = 0;
1725 HostDeviceEnv.DynamicMemSize = DynamicMemSize;
1726 if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG"))
1727 HostDeviceEnv.DebugKind = std::stoi(EnvStr);
1728
1729 int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize,
1730 sym(), &SI);
1731 if (Rc != 0) {
1732 DP("Finding global device environment '%s' - symbol missing.\n", sym()){};
1733 return;
1734 }
1735
1736 if (SI.Size > sizeof(HostDeviceEnv)) {
1737 DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size,{}
1738 sizeof(HostDeviceEnv)){};
1739 return;
1740 }
1741
1742 Valid = true;
1743 }
1744
1745 bool inImage() { return SI.ShType != SHT_NOBITS; }
1746
1747 hsa_status_t beforeLoading(void *Data, size_t Size) {
1748 if (Valid) {
1749 if (inImage()) {
1750 DP("Setting global device environment before load (%u bytes)\n",{}
1751 SI.Size){};
1752 uint64_t Offset = reinterpret_cast<const char *>(SI.Addr) -
1753 reinterpret_cast<const char *>(Image->ImageStart);
1754 void *Pos = reinterpret_cast<char *>(Data) + Offset;
1755 memcpy(Pos, &HostDeviceEnv, SI.Size);
1756 }
1757 }
1758 return HSA_STATUS_SUCCESS;
1759 }
1760
1761 hsa_status_t afterLoading() {
1762 if (Valid) {
1763 if (!inImage()) {
1764 DP("Setting global device environment after load (%u bytes)\n",{}
1765 SI.Size){};
1766 int DeviceId = HostDeviceEnv.DeviceNum;
1767 auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
1768 void *StatePtr;
1769 uint32_t StatePtrSize;
1770 hsa_status_t Err = interop_hsa_get_symbol_info(
1771 SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize);
1772 if (Err != HSA_STATUS_SUCCESS) {
1773 DP("failed to find %s in loaded image\n", sym()){};
1774 return Err;
1775 }
1776
1777 if (StatePtrSize != SI.Size) {
1778 DP("Symbol had size %u before loading, %u after\n", StatePtrSize,{}
1779 SI.Size){};
1780 return HSA_STATUS_ERROR;
1781 }
1782
1783 return DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv,
1784 StatePtrSize, DeviceId);
1785 }
1786 }
1787 return HSA_STATUS_SUCCESS;
1788 }
1789};
1790
1791hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) {
1792 uint64_t Rounded = 4 * ((Size + 3) / 4);
1793 void *Ptr;
1794 hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId);
1795 hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr);
1796 if (Err != HSA_STATUS_SUCCESS) {
1797 return Err;
1798 }
1799
1800 hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4);
1801 if (Rc != HSA_STATUS_SUCCESS) {
1802 DP("zero fill device_state failed with %u\n", Rc){};
1803 core::Runtime::Memfree(Ptr);
1804 return HSA_STATUS_ERROR;
1805 }
1806
1807 *RetPtr = Ptr;
1808 return HSA_STATUS_SUCCESS;
1809}
1810
1811bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) {
1812 SymbolInfo SI;
1813 int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI);
1814 return (Rc == 0) && (SI.Addr != nullptr);
1815}
1816
1817} // namespace
1818
1819namespace core {
1820hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) {
1821 return hsa_amd_agents_allow_access(DeviceInfo().HSAAgents.size(),
1822 &DeviceInfo().HSAAgents[0], NULL, Ptr);
1823}
1824} // namespace core
1825
1826static hsa_status_t GetIsaInfo(hsa_isa_t isa, void *data) {
1827 hsa_status_t err;
1828 uint32_t name_len;
1829 err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME_LENGTH, &name_len);
1830 if (err != HSA_STATUS_SUCCESS) {
1831 DP("Error getting ISA info length\n"){};
1832 return err;
1833 }
1834
1835 char TargetID[name_len];
1836 err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, TargetID);
1837 if (err != HSA_STATUS_SUCCESS) {
1838 DP("Error getting ISA info name\n"){};
1839 return err;
1840 }
1841
1842 auto TripleTargetID = llvm::StringRef(TargetID);
1843 if (TripleTargetID.consume_front("amdgcn-amd-amdhsa")) {
1844 DeviceInfo().TargetID.push_back(TripleTargetID.ltrim('-').str());
1845 }
1846 return HSA_STATUS_SUCCESS;
1847}
1848
1849/// Parse a TargetID to get processor arch and feature map.
1850/// Returns processor subarch.
1851/// Returns TargetID features in \p FeatureMap argument.
1852 /// If the \p TargetID contains feature+, map it to true in \p FeatureMap.
1853 /// If the \p TargetID contains feature-, map it to false in \p FeatureMap.
1854/// If the \p TargetID does not contain a feature (default), do not map it.
1855StringRef parseTargetID(StringRef TargetID, StringMap<bool> &FeatureMap) {
1856 if (TargetID.empty())
1857 return llvm::StringRef();
1858
1859 auto ArchFeature = TargetID.split(":");
1860 auto Arch = ArchFeature.first;
1861 auto Features = ArchFeature.second;
1862 if (Features.empty())
1863 return Arch;
1864
1865 if (Features.contains("sramecc+")) {
1866 FeatureMap.insert(std::pair<std::string, bool>("sramecc", true));
1867 } else if (Features.contains("sramecc-")) {
1868 FeatureMap.insert(std::pair<std::string, bool>("sramecc", false));
1869 }
1870 if (Features.contains("xnack+")) {
1871 FeatureMap.insert(std::pair<std::string, bool>("xnack", true));
1872 } else if (Features.contains("xnack-")) {
1873 FeatureMap.insert(std::pair<std::string, bool>("xnack", false));
1874 }
1875
1876 return Arch;
1877}
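Illustrative sketch (not part of rtl.cpp): a plain-std mirror of the parseTargetID() logic above, showing what a typical ROCm target ID decodes to; the helper name and the example string are hypothetical.

#include <cstdio>
#include <map>
#include <string>

static std::string parseTargetIDSketch(const std::string &TargetID,
                                       std::map<std::string, bool> &FeatureMap) {
  // Split "arch:feature..." and record only features with an explicit sign.
  size_t Colon = TargetID.find(':');
  std::string Arch = TargetID.substr(0, Colon);
  std::string Features =
      Colon == std::string::npos ? "" : TargetID.substr(Colon);
  for (const char *F : {"sramecc", "xnack"}) {
    if (Features.find(std::string(F) + "+") != std::string::npos)
      FeatureMap[F] = true;
    else if (Features.find(std::string(F) + "-") != std::string::npos)
      FeatureMap[F] = false;
  }
  return Arch;
}

int main() {
  std::map<std::string, bool> FeatureMap;
  std::string Arch = parseTargetIDSketch("gfx90a:sramecc+:xnack-", FeatureMap);
  // Prints: gfx90a sramecc=1 xnack=0
  printf("%s sramecc=%d xnack=%d\n", Arch.c_str(), (int)FeatureMap["sramecc"],
         (int)FeatureMap["xnack"]);
  return 0;
}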
1878
1879/// Checks if an image \p ImgInfo is compatible with current
1880/// system's environment \p EnvInfo
1881bool IsImageCompatibleWithEnv(const char *ImgInfo, std::string EnvInfo) {
1882 llvm::StringRef ImgTID(ImgInfo), EnvTID(EnvInfo);
1883
1884 // Compatible in case of exact match
1885 if (ImgTID == EnvTID) {
1886 DP("Compatible: Exact match \t[Image: %s]\t:\t[Environment: %s]\n",{}
1887 ImgTID.data(), EnvTID.data()){};
1888 return true;
1889 }
1890
1891 // Incompatible if Archs mismatch.
1892 StringMap<bool> ImgMap, EnvMap;
1893 StringRef ImgArch = parseTargetID(ImgTID, ImgMap);
1894 StringRef EnvArch = parseTargetID(EnvTID, EnvMap);
1895
1896 // Both EnvArch and ImgArch can't be empty here.
1897 if (EnvArch.empty() || ImgArch.empty() || !ImgArch.contains(EnvArch)) {
1898 DP("Incompatible: Processor mismatch \t[Image: %s]\t:\t[Environment: %s]\n",{}
1899 ImgTID.data(), EnvTID.data()){};
1900 return false;
1901 }
1902
1903 // Incompatible if image has more features than the environment, irrespective
1904 // of type or sign of features.
1905 if (ImgMap.size() > EnvMap.size()) {
1906 DP("Incompatible: Image has more features than the environment \t[Image: "{}
1907 "%s]\t:\t[Environment: %s]\n",{}
1908 ImgTID.data(), EnvTID.data()){};
1909 return false;
1910 }
1911
1912 // Compatible if each target feature specified by the environment is
1913 // compatible with the target feature of the image. The target feature is
1914 // compatible if the image does not specify it (meaning Any), or if it
1915 // specifies it with the same value (meaning On or Off).
1916 for (const auto &ImgFeature : ImgMap) {
1917 auto EnvFeature = EnvMap.find(ImgFeature.first());
1918 if (EnvFeature == EnvMap.end()) {
1919 DP("Incompatible: Value of Image's non-ANY feature is not matching with "{}
1920 "the Environment feature's ANY value \t[Image: %s]\t:\t[Environment: "{}
1921 "%s]\n",{}
1922 ImgTID.data(), EnvTID.data()){};
1923 return false;
1924 } else if (EnvFeature->first() == ImgFeature.first() &&
1925 EnvFeature->second != ImgFeature.second) {
1926 DP("Incompatible: Value of Image's non-ANY feature is not matching with "{}
1927 "the Environment feature's non-ANY value \t[Image: "{}
1928 "%s]\t:\t[Environment: %s]\n",{}
1929 ImgTID.data(), EnvTID.data()){};
1930 return false;
1931 }
1932 }
1933
1934 // Image is compatible if all features of Environment are:
1935 // - either, present in the Image's features map with the same sign,
1936 // - or, the feature is missing from Image's features map i.e. it is
1937 // set to ANY
1938 DP("Compatible: Target IDs are compatible \t[Image: %s]\t:\t[Environment: "{}
1939 "%s]\n",{}
1940 ImgTID.data(), EnvTID.data()){};
1941 return true;
1942}
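Worked example (not part of rtl.cpp): assuming a device whose environment target ID is "gfx90a:sramecc+:xnack-", the check above would resolve a few representative image target IDs as follows.

// IsImageCompatibleWithEnv("gfx90a:sramecc+:xnack-", Env) -> true  (exact match)
// IsImageCompatibleWithEnv("gfx90a",                 Env) -> true  (image features are ANY)
// IsImageCompatibleWithEnv("gfx90a:xnack+",          Env) -> false (sign mismatch on xnack)
// IsImageCompatibleWithEnv("gfx908",                 Env) -> false (processor mismatch)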
1943
1944extern "C" {
1945int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) {
1946 return elfMachineIdIsAmdgcn(Image);
1947}
1948
1949int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image,
1950 __tgt_image_info *info) {
1951 if (!__tgt_rtl_is_valid_binary(image))
1952 return false;
1953
1954 // A subarchitecture was not specified. Assume it is compatible.
1955 if (!info->Arch)
1956 return true;
1957
1958 int32_t NumberOfDevices = __tgt_rtl_number_of_devices();
1959
1960 for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) {
1961 __tgt_rtl_init_device(DeviceId);
1962 hsa_agent_t agent = DeviceInfo().HSAAgents[DeviceId];
1963 hsa_status_t err = hsa_agent_iterate_isas(agent, GetIsaInfo, &DeviceId);
1964 if (err != HSA_STATUS_SUCCESS) {
1965 DP("Error iterating ISAs\n"){};
1966 return false;
1967 }
1968 if (!IsImageCompatibleWithEnv(info->Arch, DeviceInfo().TargetID[DeviceId]))
1969 return false;
1970 }
1971 DP("Image has Target ID compatible with the current environment: %s\n",{}
1972 info->Arch){};
1973 return true;
1974}
1975
1976int32_t __tgt_rtl_init_plugin() { return OFFLOAD_SUCCESS(0); }
1977int32_t __tgt_rtl_deinit_plugin() { return OFFLOAD_SUCCESS(0); }
1978
1979int __tgt_rtl_number_of_devices() {
1980 // If the construction failed, no methods are safe to call
1981 if (DeviceInfo().ConstructionSucceeded) {
1982 return DeviceInfo().NumberOfDevices;
1983 }
1984 DP("AMDGPU plugin construction failed. Zero devices available\n"){};
1985 return 0;
1986}
1987
1988int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) {
1989 DP("Init requires flags to %ld\n", RequiresFlags){};
1990 DeviceInfo().RequiresFlags = RequiresFlags;
1991 return RequiresFlags;
1992}
1993
1994int32_t __tgt_rtl_init_device(int DeviceId) {
1995 hsa_status_t Err = hsa_init();
1996 if (Err != HSA_STATUS_SUCCESS) {
1997 DP("HSA Initialization Failed.\n"){};
1998 return HSA_STATUS_ERROR;
1999 }
2000 // this is per device id init
2001 DP("Initialize the device id: %d\n", DeviceId){};
2002
2003 hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId];
2004
2005 // Get number of Compute Unit
2006 uint32_t ComputeUnits = 0;
2007 Err = hsa_agent_get_info(
2008 Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT,
2009 &ComputeUnits);
2010 if (Err != HSA_STATUS_SUCCESS) {
2011 DeviceInfo().ComputeUnits[DeviceId] = 1;
2012 DP("Error getting compute units : settiing to 1\n"){};
2013 } else {
2014 DeviceInfo().ComputeUnits[DeviceId] = ComputeUnits;
2015 DP("Using %d compute unis per grid\n", DeviceInfo().ComputeUnits[DeviceId]){};
2016 }
2017
2018 char GetInfoName[64]; // 64 max size returned by get info
2019 Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME,
2020 (void *)GetInfoName);
2021 if (Err)
2022 DeviceInfo().GPUName[DeviceId] = "--unknown gpu--";
2023 else {
2024 DeviceInfo().GPUName[DeviceId] = GetInfoName;
2025 }
2026
2027 if (print_kernel_trace & STARTUP_DETAILS)
2028 DP("Device#%-2d CU's: %2d %s\n", DeviceId,{}
2029 DeviceInfo().ComputeUnits[DeviceId],{}
2030 DeviceInfo().GPUName[DeviceId].c_str()){};
2031
2032 // Query attributes to determine number of threads/block and blocks/grid.
2033 uint16_t WorkgroupMaxDim[3];
2034 Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM,
2035 &WorkgroupMaxDim);
2036 if (Err != HSA_STATUS_SUCCESS) {
2037 DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams;
2038 DP("Error getting grid dims: num groups : %d\n",{}
2039 RTLDeviceInfoTy::DefaultNumTeams){};
2040 } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) {
2041 DeviceInfo().GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0];
2042 DP("Using %d ROCm blocks per grid\n",{}
2043 DeviceInfo().GroupsPerDevice[DeviceId]){};
2044 } else {
2045 DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit;
2046 DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "{}
2047 "at the hard limit\n",{}
2048 WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit){};
2049 }
2050
2051 // Get thread limit
2052 hsa_dim3_t GridMaxDim;
2053 Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim);
2054 if (Err == HSA_STATUS_SUCCESS) {
2055 DeviceInfo().ThreadsPerGroup[DeviceId] =
2056 reinterpret_cast<uint32_t *>(&GridMaxDim)[0] /
2057 DeviceInfo().GroupsPerDevice[DeviceId];
2058
2059 if (DeviceInfo().ThreadsPerGroup[DeviceId] == 0) {
2060 DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
2061 DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize){};
2062 } else if (enforceUpperBound(&DeviceInfo().ThreadsPerGroup[DeviceId],
2063 RTLDeviceInfoTy::MaxWgSize)) {
2064 DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize){};
2065 } else {
2066 DP("Using ROCm Queried thread limit: %d\n",{}
2067 DeviceInfo().ThreadsPerGroup[DeviceId]){};
2068 }
2069 } else {
2070 DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize;
2071 DP("Error getting max block dimension, use default:%d \n",{}
2072 RTLDeviceInfoTy::MaxWgSize){};
2073 }
2074
2075 // Get wavefront size
2076 uint32_t WavefrontSize = 0;
2077 Err =
2078 hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize);
2079 if (Err == HSA_STATUS_SUCCESS) {
2080 DP("Queried wavefront size: %d\n", WavefrontSize){};
2081 DeviceInfo().WarpSize[DeviceId] = WavefrontSize;
2082 } else {
2083 // TODO: Burn the wavefront size into the code object
2084 DP("Warning: Unknown wavefront size, assuming 64\n"){};
2085 DeviceInfo().WarpSize[DeviceId] = 64;
2086 }
2087
2088 // Adjust teams to the env variables
2089
2090 if (DeviceInfo().Env.TeamLimit > 0 &&
2091 (enforceUpperBound(&DeviceInfo().GroupsPerDevice[DeviceId],
2092 DeviceInfo().Env.TeamLimit))) {
2093 DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",{}
2094 DeviceInfo().Env.TeamLimit){};
2095 }
2096
2097 // Set default number of teams
2098 if (DeviceInfo().Env.NumTeams > 0) {
2099 DeviceInfo().NumTeams[DeviceId] = DeviceInfo().Env.NumTeams;
2100 DP("Default number of teams set according to environment %d\n",{}
2101 DeviceInfo().Env.NumTeams){};
2102 } else {
2103 char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC");
2104 int TeamsPerCU = DefaultTeamsPerCU;
2105 if (TeamsPerCUEnvStr) {
2106 TeamsPerCU = std::stoi(TeamsPerCUEnvStr);
2107 }
2108
2109 DeviceInfo().NumTeams[DeviceId] =
2110 TeamsPerCU * DeviceInfo().ComputeUnits[DeviceId];
2111 DP("Default number of teams = %d * number of compute units %d\n",{}
2112 TeamsPerCU, DeviceInfo().ComputeUnits[DeviceId]){};
2113 }
2114
2115 if (enforceUpperBound(&DeviceInfo().NumTeams[DeviceId],
2116 DeviceInfo().GroupsPerDevice[DeviceId])) {
2117 DP("Default number of teams exceeds device limit, capping at %d\n",{}
2118 DeviceInfo().GroupsPerDevice[DeviceId]){};
2119 }
2120
2121 // Adjust threads to the env variables
2122 if (DeviceInfo().Env.TeamThreadLimit > 0 &&
2123 (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId],
2124 DeviceInfo().Env.TeamThreadLimit))) {
2125 DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n",{}
2126 DeviceInfo().Env.TeamThreadLimit){};
2127 }
2128
2129 // Set default number of threads
2130 DeviceInfo().NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize;
2131 DP("Default number of threads set according to library's default %d\n",{}
2132 RTLDeviceInfoTy::DefaultWgSize){};
2133 if (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId],
2134 DeviceInfo().ThreadsPerGroup[DeviceId])) {
2135 DP("Default number of threads exceeds device limit, capping at %d\n",{}
2136 DeviceInfo().ThreadsPerGroup[DeviceId]){};
2137 }
2138
2139 DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",{}
2140 DeviceId, DeviceInfo().GroupsPerDevice[DeviceId],{}
2141 DeviceInfo().ThreadsPerGroup[DeviceId]){};
2142
2143 DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId,{}
2144 DeviceInfo().WarpSize[DeviceId], DeviceInfo().ThreadsPerGroup[DeviceId],{}
2145 DeviceInfo().GroupsPerDevice[DeviceId],{}
2146 DeviceInfo().GroupsPerDevice[DeviceId] *{}
2147 DeviceInfo().ThreadsPerGroup[DeviceId]){};
2148
2149 return OFFLOAD_SUCCESS(0);
2150}
2151
2152static __tgt_target_table *
2153__tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image);
2154
2155__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
2156 __tgt_device_image *Image) {
2157 DeviceInfo().LoadRunLock.lock();
2158 __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image);
2159 DeviceInfo().LoadRunLock.unlock();
2160 return Res;
2161}
2162
2163__tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId,
2164 __tgt_device_image *Image) {
2165 // This function loads the device image onto gpu[DeviceId] and does other
2166 // per-image initialization work. Specifically:
2167 //
2168 // - Initialize a DeviceEnvironmentTy instance embedded in the
2169 // image at the symbol "omptarget_device_environment"
2170 // Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL.
2171 //
2172 // - Allocate a large array per-gpu (could be moved to init_device)
2173 // - Read a uint64_t at symbol omptarget_nvptx_device_State_size
2174 // - Allocate at least that many bytes of gpu memory
2175 // - Zero initialize it
2176 // - Write the pointer to the symbol omptarget_nvptx_device_State
2177 //
2178 // - Pulls some per-kernel information together from various sources and
2179 // records it in the KernelsList for quicker access later
2180 //
2181 // The initialization can be done before or after loading the image onto the
2182 // gpu. This function presently does a mixture. Using the hsa api to get/set
2183 // the information is simpler to implement, in exchange for more complicated
2184 // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes
2185 // back from the gpu vs a hashtable lookup on the host.
2186
2187 const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart;
2188
2189 DeviceInfo().clearOffloadEntriesTable(DeviceId);
2190
2191 // We do not need to set the ELF version because the caller of this function
2192 // had to do that to decide the right runtime to use
2193
2194 if (!elfMachineIdIsAmdgcn(Image))
2195 return NULL;
2196
2197 {
2198 auto Env =
2199 DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices,
2200 DeviceInfo().Env.DynamicMemSize, Image, ImgSize);
2201
2202 auto &KernelInfo = DeviceInfo().KernelInfoTable[DeviceId];
2203 auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId];
2204 hsa_status_t Err = moduleRegisterFromMemoryToPlace(
2205 KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId,
2206 [&](void *Data, size_t Size) {
2207 if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) {
2208 __atomic_store_n(&DeviceInfo().HostcallRequired, true,
2209 __ATOMIC_RELEASE);
2210 }
2211 return Env.beforeLoading(Data, Size);
2212 },
2213 DeviceInfo().HSAExecutables);
2214
2215 check("Module registering", Err){};
2216 if (Err != HSA_STATUS_SUCCESS) {
2217 const char *DeviceName = DeviceInfo().GPUName[DeviceId].c_str();
2218 const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image));
2219
2220 if (strcmp(DeviceName, ElfName) != 0) {
2221 DP("Possible gpu arch mismatch: device:%s, image:%s please check"{}
2222 " compiler flag: -march=<gpu>\n",{}
2223 DeviceName, ElfName){};
2224 } else {
2225 DP("Error loading image onto GPU: %s\n", get_error_string(Err)){};
2226 }
2227
2228 return NULL;
2229 }
2230
2231 Err = Env.afterLoading();
2232 if (Err != HSA_STATUS_SUCCESS) {
2233 return NULL;
2234 }
2235 }
2236
2237 DP("AMDGPU module successfully loaded!\n"){};
2238
2239 {
2240 // the device_State array is either a large value in .bss or a void* that
2241 // needs to be assigned a pointer to an array of size device_state_bytes.
2242 // If absent, it has been deadstripped and needs no setup.
2243
2244 void *StatePtr;
2245 uint32_t StatePtrSize;
2246 auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
2247 hsa_status_t Err = interop_hsa_get_symbol_info(
2248 SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr,
2249 &StatePtrSize);
2250
2251 if (Err != HSA_STATUS_SUCCESS) {
2252 DP("No device_state symbol found, skipping initialization\n"){};
2253 } else {
2254 if (StatePtrSize < sizeof(void *)) {
2255 DP("unexpected size of state_ptr %u != %zu\n", StatePtrSize,{}
2256 sizeof(void *)){};
2257 return NULL;
2258 }
2259
2260 // if it's larger than a void*, assume it's a bss array and no further
2261 // initialization is required. Only try to set up a pointer for
2262 // sizeof(void*)
2263 if (StatePtrSize == sizeof(void *)) {
2264 uint64_t DeviceStateBytes =
2265 getDeviceStateBytes((char *)Image->ImageStart, ImgSize);
2266 if (DeviceStateBytes == 0) {
2267 DP("Can't initialize device_State, missing size information\n"){};
2268 return NULL;
2269 }
2270
2271 auto &DSS = DeviceInfo().DeviceStateStore[DeviceId];
2272 if (DSS.first.get() == nullptr) {
2273 assert(DSS.second == 0);
2274 void *Ptr = NULL;
2275 hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId);
2276 if (Err != HSA_STATUS_SUCCESS) {
2277 DP("Failed to allocate device_state array\n"){};
2278 return NULL;
2279 }
2280 DSS = {
2281 std::unique_ptr<void, RTLDeviceInfoTy::ImplFreePtrDeletor>{Ptr},
2282 DeviceStateBytes,
2283 };
2284 }
2285
2286 void *Ptr = DSS.first.get();
2287 if (DeviceStateBytes != DSS.second) {
2288 DP("Inconsistent sizes of device_State unsupported\n"){};
2289 return NULL;
2290 }
2291
2292 // write ptr to device memory so it can be used by later kernels
2293 Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr,
2294 sizeof(void *), DeviceId);
2295 if (Err != HSA_STATUS_SUCCESS) {
2296 DP("memcpy install of state_ptr failed\n"){};
2297 return NULL;
2298 }
2299 }
2300 }
2301 }
2302
2303 // Here, we take advantage of the data that is appended after img_end to get
2304 // the symbols' names we need to load. This data consists of the host entries
2305 // begin and end as well as the target name (see the offloading linker script
2306 // creation in clang compiler).
2307
2308 // Find the symbols in the module by name. The name can be obtained by
2309 // concatenating the host entry name with the target name
2310
2311 __tgt_offload_entry *HostBegin = Image->EntriesBegin;
2312 __tgt_offload_entry *HostEnd = Image->EntriesEnd;
2313
2314 for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) {
2315
2316 if (!E->addr) {
2317 // The host should always have something in the address to
2318 // uniquely identify the target region.
2319 DP("Analyzing host entry '<null>' (size = %lld)...\n",{}
2320 (unsigned long long)E->size){};
2321 return NULL;
2322 }
2323
2324 if (E->size) {
2325 __tgt_offload_entry Entry = *E;
2326
2327 void *Varptr;
2328 uint32_t Varsize;
2329
2330 auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId];
2331 hsa_status_t Err = interop_hsa_get_symbol_info(
2332 SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize);
2333
2334 if (Err != HSA_STATUS_SUCCESS) {
2335 // Inform the user what symbol prevented offloading
2336 DP("Loading global '%s' (Failed)\n", E->name){};
2337 return NULL;
2338 }
2339
2340 if (Varsize != E->size) {
2341 DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name,{}
2342 Varsize, E->size){};
2343 return NULL;
2344 }
2345
2346 DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",{}
2347 DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr)){};
2348 Entry.addr = (void *)Varptr;
2349
2350 DeviceInfo().addOffloadEntry(DeviceId, Entry);
2351
2352 if (DeviceInfo().RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
2353 E->flags & OMP_DECLARE_TARGET_LINK) {
2354 // If unified memory is present any target link variables
2355 // can access host addresses directly. There is no longer a
2356 // need for device copies.
2357 Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr,
2358 sizeof(void *), DeviceId);
2359 if (Err != HSA_STATUS_SUCCESS)
2360 DP("Error when copying USM\n"){};
2361 DP("Copy linked variable host address (" DPxMOD ")"{}
2362 "to device address (" DPxMOD ")\n",{}
2363 DPxPTR(*((void **)E->addr)), DPxPTR(Varptr)){};
2364 }
2365
2366 continue;
2367 }
2368
2369 DP("to find the kernel name: %s size: %lu\n", E->name, strlen(E->name)){};
2370
2371 // errors in kernarg_segment_size previously treated as = 0 (or as undef)
2372 uint32_t KernargSegmentSize = 0;
2373 auto &KernelInfoMap = DeviceInfo().KernelInfoTable[DeviceId];
2374 hsa_status_t Err = HSA_STATUS_SUCCESS;
2375 if (!E->name) {
2376 Err = HSA_STATUS_ERROR;
Value stored to 'Err' is never read
2377 } else {
2378 std::string KernelStr = std::string(E->name);
2379 auto It = KernelInfoMap.find(KernelStr);
2380 if (It != KernelInfoMap.end()) {
2381 atl_kernel_info_t Info = It->second;
2382 KernargSegmentSize = Info.kernel_segment_size;
2383 } else {
2384 Err = HSA_STATUS_ERROR;
2385 }
2386 }
2387
2388 // default value GENERIC (in case symbol is missing from cubin file)
2389 llvm::omp::OMPTgtExecModeFlags ExecModeVal =
2390 llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC;
2391
2392 // get flat group size if present, else Default_WG_Size
2393 int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
2394
2395 // get Kernel Descriptor if present.
2396 // Keep struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp
2397 struct KernDescValType {
2398 uint16_t Version;
2399 uint16_t TSize;
2400 uint16_t WGSize;
2401 };
2402 struct KernDescValType KernDescVal;
2403 std::string KernDescNameStr(E->name);
2404 KernDescNameStr += "_kern_desc";
2405 const char *KernDescName = KernDescNameStr.c_str();
2406
2407 const void *KernDescPtr;
2408 uint32_t KernDescSize;
2409 void *CallStackAddr = nullptr;
2410 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, KernDescName,
2411 &KernDescPtr, &KernDescSize);
2412
2413 if (Err == HSA_STATUS_SUCCESS) {
2414 if ((size_t)KernDescSize != sizeof(KernDescVal))
2415 DP("Loading global computation properties '%s' - size mismatch (%u != "{}
2416 "%lu)\n",{}
2417 KernDescName, KernDescSize, sizeof(KernDescVal)){};
2418
2419 memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize);
2420
2421 // Check structure size against recorded size.
2422 if ((size_t)KernDescSize != KernDescVal.TSize)
2423 DP("KernDescVal size %lu does not match advertized size %d for '%s'\n",{}
2424 sizeof(KernDescVal), KernDescVal.TSize, KernDescName){};
2425
2426 DP("After loading global for %s KernDesc \n", KernDescName){};
2427 DP("KernDesc: Version: %d\n", KernDescVal.Version){};
2428 DP("KernDesc: TSize: %d\n", KernDescVal.TSize){};
2429 DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize){};
2430
2431 if (KernDescVal.WGSize == 0) {
2432 KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize;
2433 DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WGSize){};
2434 }
2435 WGSizeVal = KernDescVal.WGSize;
2436 DP("WGSizeVal %d\n", WGSizeVal){};
2437 check("Loading KernDesc computation property", Err){};
2438 } else {
2439 DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName){};
2440
2441 // Flat group size
2442 std::string WGSizeNameStr(E->name);
2443 WGSizeNameStr += "_wg_size";
2444 const char *WGSizeName = WGSizeNameStr.c_str();
2445
2446 const void *WGSizePtr;
2447 uint32_t WGSize;
2448 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, WGSizeName,
2449 &WGSizePtr, &WGSize);
2450
2451 if (Err == HSA_STATUS_SUCCESS) {
2452 if ((size_t)WGSize != sizeof(int16_t)) {
2453 DP("Loading global computation properties '%s' - size mismatch (%u "{}
2454 "!= "{}
2455 "%lu)\n",{}
2456 WGSizeName, WGSize, sizeof(int16_t)){};
2457 return NULL;
2458 }
2459
2460 memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize);
2461
2462 DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal){};
2463
2464 if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize ||
2465 WGSizeVal > RTLDeviceInfoTy::MaxWgSize) {
2466 DP("Error wrong WGSize value specified in HSA code object file: "{}
2467 "%d\n",{}
2468 WGSizeVal){};
2469 WGSizeVal = RTLDeviceInfoTy::DefaultWgSize;
2470 }
2471 } else {
2472 DP("Warning: Loading WGSize '%s' - symbol not found, "{}
2473 "using default value %d\n",{}
2474 WGSizeName, WGSizeVal){};
2475 }
2476
2477 check("Loading WGSize computation property", Err){};
2478 }
2479
2480 // Read execution mode from global in binary
2481 std::string ExecModeNameStr(E->name);
2482 ExecModeNameStr += "_exec_mode";
2483 const char *ExecModeName = ExecModeNameStr.c_str();
2484
2485 const void *ExecModePtr;
2486 uint32_t VarSize;
2487 Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, ExecModeName,
2488 &ExecModePtr, &VarSize);
2489
2490 if (Err == HSA_STATUS_SUCCESS) {
2491 if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) {
2492 DP("Loading global computation properties '%s' - size mismatch(%u != "{}
2493 "%lu)\n",{}
2494 ExecModeName, VarSize, sizeof(llvm::omp::OMPTgtExecModeFlags)){};
2495 return NULL;
2496 }
2497
2498 memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize);
2499
2500 DP("After loading global for %s ExecMode = %d\n", ExecModeName,{}
2501 ExecModeVal){};
2502
2503 if (ExecModeVal < 0 ||
2504 ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) {
2505 DP("Error wrong exec_mode value specified in HSA code object file: "{}
2506 "%d\n",{}
2507 ExecModeVal){};
2508 return NULL;
2509 }
2510 } else {
2511 DP("Loading global exec_mode '%s' - symbol missing, using default "{}
2512 "value "{}
2513 "GENERIC (1)\n",{}
2514 ExecModeName){};
2515 }
2516 check("Loading computation property", Err){};
2517
2518 KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId,
2519 CallStackAddr, E->name, KernargSegmentSize,
2520 DeviceInfo().KernArgPool));
2521 __tgt_offload_entry Entry = *E;
2522 Entry.addr = (void *)&KernelsList.back();
2523 DeviceInfo().addOffloadEntry(DeviceId, Entry);
2524 DP("Entry point %ld maps to %s\n", E - HostBegin, E->name){};
2525 }
2526
2527 return DeviceInfo().getOffloadEntriesTable(DeviceId);
2528}
2529
2530void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) {
2531 void *Ptr = NULL;
2532 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2533
2534 hsa_amd_memory_pool_t MemoryPool;
2535 switch (Kind) {
2536 case TARGET_ALLOC_DEFAULT:
2537 case TARGET_ALLOC_DEVICE:
2538 // GPU memory
2539 MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId);
2540 break;
2541 case TARGET_ALLOC_HOST:
2542 // non-migratable memory accessible by host and device(s)
2543 MemoryPool = DeviceInfo().getHostMemoryPool();
2544 break;
2545 default:
2546 REPORT("Invalid target data allocation kind or requested allocator not "do { fprintf(stderr, "AMDGPU" " error: "); fprintf(stderr, "Invalid target data allocation kind or requested allocator not "
"implemented yet\n"); } while (0);
2547 "implemented yet\n")do { fprintf(stderr, "AMDGPU" " error: "); fprintf(stderr, "Invalid target data allocation kind or requested allocator not "
"implemented yet\n"); } while (0);
;
2548 return NULL;
2549 }
2550
2551 hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr);
2552 DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size,{}
2553 (long long unsigned)(Elf64_Addr)Ptr){};
2554 Ptr = (Err == HSA_STATUS_SUCCESS) ? Ptr : NULL;
2555 return Ptr;
2556}
2557
2558int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr,
2559 int64_t Size) {
2560 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2561 __tgt_async_info AsyncInfo;
2562 int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo);
2563 if (Rc != OFFLOAD_SUCCESS(0))
2564 return OFFLOAD_FAIL(~0);
2565
2566 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
2567}
2568
2569int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr,
2570 int64_t Size, __tgt_async_info *AsyncInfo) {
2571 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2572 if (AsyncInfo) {
2573 initAsyncInfo(AsyncInfo);
2574 return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo);
2575 }
2576 return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size);
2577}
2578
2579int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr,
2580 int64_t Size) {
2581 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2582 __tgt_async_info AsyncInfo;
2583 int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo);
2584 if (Rc != OFFLOAD_SUCCESS(0))
2585 return OFFLOAD_FAIL(~0);
2586
2587 return __tgt_rtl_synchronize(DeviceId, &AsyncInfo);
2588}
2589
2590int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr,
2591 int64_t Size,
2592 __tgt_async_info *AsyncInfo) {
2593 assert(AsyncInfo && "AsyncInfo is nullptr");
2594 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2595 initAsyncInfo(AsyncInfo);
2596 return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo);
2597}
2598
2599int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr, int32_t) {
2600 assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
2601 // HSA can free pointers allocated from different types of memory pool.
2602 hsa_status_t Err;
2603 DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr){};
2604 Err = core::Runtime::Memfree(TgtPtr);
2605 if (Err != HSA_STATUS_SUCCESS) {
2606 DP("Error when freeing CUDA memory\n"){};
2607 return OFFLOAD_FAIL(~0);
2608 }
2609 return OFFLOAD_SUCCESS(0);
2610}
2611
2612int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *TgtEntryPtr,
2613 void **TgtArgs, ptrdiff_t *TgtOffsets,
2614 int32_t ArgNum, int32_t NumTeams,
2615 int32_t ThreadLimit,
2616 uint64_t LoopTripcount) {
2617
2618 DeviceInfo().LoadRunLock.lock_shared();
2619 int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
2620 ArgNum, NumTeams, ThreadLimit, LoopTripcount);
2621
2622 DeviceInfo().LoadRunLock.unlock_shared();
2623 return Res;
2624}
2625
2626int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *TgtEntryPtr,
2627 void **TgtArgs, ptrdiff_t *TgtOffsets,
2628 int32_t ArgNum) {
2629 // use one team and one thread
2630 // fix thread num
2631 int32_t TeamNum = 1;
2632 int32_t ThreadLimit = 0; // use default
2633 return __tgt_rtl_run_target_team_region(DeviceId, TgtEntryPtr, TgtArgs,
2634 TgtOffsets, ArgNum, TeamNum,
2635 ThreadLimit, 0);
2636}
2637
2638int32_t __tgt_rtl_run_target_team_region_async(
2639 int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, ptrdiff_t *TgtOffsets,
2640 int32_t ArgNum, int32_t NumTeams, int32_t ThreadLimit,
2641 uint64_t LoopTripcount, __tgt_async_info *AsyncInfo) {
2642 assert(AsyncInfo && "AsyncInfo is nullptr");
2643 initAsyncInfo(AsyncInfo);
2644
2645 DeviceInfo().LoadRunLock.lock_shared();
2646 int32_t Res = runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets,
2647 ArgNum, NumTeams, ThreadLimit, LoopTripcount);
2648
2649 DeviceInfo().LoadRunLock.unlock_shared();
2650 return Res;
2651}
2652
2653int32_t __tgt_rtl_run_target_region_async(int32_t DeviceId, void *TgtEntryPtr,
2654 void **TgtArgs, ptrdiff_t *TgtOffsets,
2655 int32_t ArgNum,
2656 __tgt_async_info *AsyncInfo) {
2657 // use one team and one thread
2658 // fix thread num
2659 int32_t TeamNum = 1;
2660 int32_t ThreadLimit = 0; // use default
2661 return __tgt_rtl_run_target_team_region_async(DeviceId, TgtEntryPtr, TgtArgs,
2662 TgtOffsets, ArgNum, TeamNum,
2663 ThreadLimit, 0, AsyncInfo);
2664}
2665
2666int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) {
2667 assert(AsyncInfo && "AsyncInfo is nullptr");
2668
2669 // Cuda asserts that AsyncInfo->Queue is non-null, but this invariant
2670 // is not ensured by devices.cpp for amdgcn
2671 // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr");
2672 if (AsyncInfo->Queue) {
2673 finiAsyncInfo(AsyncInfo);
2674 }
2675 return OFFLOAD_SUCCESS(0);
2676}
2677
2678void __tgt_rtl_print_device_info(int32_t DeviceId) {
2679 // TODO: Assertion to see if DeviceId is correct
2680 // NOTE: We don't need to set context for print device info.
2681
2682 DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]);
2683}
2684
2685} // extern "C"