File: | build/source/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp |
Warning: | line 2312, column 7 Value stored to 'Err' is never read |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | //===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // RTL for AMD hsa machine |
10 | // |
11 | //===----------------------------------------------------------------------===// |
12 | |
13 | #include "llvm/ADT/StringRef.h" |
14 | #include "llvm/Frontend/OpenMP/OMPConstants.h" |
15 | #include "llvm/Frontend/OpenMP/OMPGridValues.h" |
16 | #include "llvm/Object/ELF.h" |
17 | #include "llvm/Object/ELFObjectFile.h" |
18 | |
19 | #include <algorithm> |
20 | #include <assert.h> |
21 | #include <cstdio> |
22 | #include <cstdlib> |
23 | #include <cstring> |
24 | #include <functional> |
25 | #include <list> |
26 | #include <memory> |
27 | #include <mutex> |
28 | #include <shared_mutex> |
29 | #include <unordered_map> |
30 | #include <vector> |
31 | |
32 | #include "ELFSymbols.h" |
33 | #include "impl_runtime.h" |
34 | #include "interop_hsa.h" |
35 | |
36 | #include "UtilitiesRTL.h" |
37 | #include "internal.h" |
38 | #include "rt.h" |
39 | |
40 | #include "DeviceEnvironment.h" |
41 | #include "get_elf_mach_gfx_name.h" |
42 | #include "omptargetplugin.h" |
43 | #include "print_tracing.h" |
44 | |
45 | using namespace llvm; |
46 | using namespace llvm::object; |
47 | using namespace llvm::ELF; |
48 | using namespace llvm::omp::target::plugin::utils; |
49 | |
50 | // hostrpc interface, FIXME: consider moving to its own include these are |
51 | // statically linked into amdgpu/plugin if present from hostrpc_services.a, |
52 | // linked as --whole-archive to override the weak symbols that are used to |
53 | // implement a fallback for toolchains that do not yet have a hostrpc library. |
54 | extern "C" { |
55 | uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ, |
56 | uint32_t DeviceId); |
57 | hsa_status_t hostrpc_init(); |
58 | hsa_status_t hostrpc_terminate(); |
59 | |
60 | __attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; } |
61 | __attribute__((weak)) hsa_status_t hostrpc_terminate() { |
62 | return HSA_STATUS_SUCCESS; |
63 | } |
64 | __attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *, |
65 | uint32_t DeviceId) { |
66 | DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "{} |
67 | "missing\n",{} |
68 | DeviceId){}; |
69 | return 0; |
70 | } |
71 | } |
72 | |
73 | // Heuristic parameters used for kernel launch |
74 | // Number of teams per CU to allow scheduling flexibility |
75 | static const unsigned DefaultTeamsPerCU = 4; |
76 | |
77 | int print_kernel_trace; |
78 | |
79 | #ifdef OMPTARGET_DEBUG |
80 | #define check(msg, status){} \ |
81 | if (status != HSA_STATUS_SUCCESS) { \ |
82 | DP(#msg " failed\n"){}; \ |
83 | } else { \ |
84 | DP(#msg " succeeded\n"){}; \ |
85 | } |
86 | #else |
87 | #define check(msg, status){} \ |
88 | {} |
89 | #endif |
90 | |
91 | #include "elf_common.h" |
92 | |
93 | namespace hsa { |
94 | template <typename C> hsa_status_t iterate_agents(C Cb) { |
95 | auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t { |
96 | C *Unwrapped = static_cast<C *>(Data); |
97 | return (*Unwrapped)(Agent); |
98 | }; |
99 | return hsa_iterate_agents(L, static_cast<void *>(&Cb)); |
100 | } |
101 | |
102 | template <typename C> |
103 | hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) { |
104 | auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t { |
105 | C *Unwrapped = static_cast<C *>(Data); |
106 | return (*Unwrapped)(MemoryPool); |
107 | }; |
108 | |
109 | return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&Cb)); |
110 | } |
111 | |
112 | } // namespace hsa |
113 | |
114 | /// Keep entries table per device |
115 | struct FuncOrGblEntryTy { |
116 | __tgt_target_table Table; |
117 | std::vector<__tgt_offload_entry> Entries; |
118 | }; |
119 | |
120 | struct KernelArgPool { |
121 | private: |
122 | static pthread_mutex_t Mutex; |
123 | |
124 | public: |
125 | uint32_t KernargSegmentSize; |
126 | void *KernargRegion = nullptr; |
127 | std::queue<int> FreeKernargSegments; |
128 | |
129 | uint32_t kernargSizeIncludingImplicit() { |
130 | return KernargSegmentSize + sizeof(AMDGPUImplicitArgsTy); |
131 | } |
132 | |
133 | ~KernelArgPool() { |
134 | if (KernargRegion) { |
135 | auto R = hsa_amd_memory_pool_free(KernargRegion); |
136 | if (R != HSA_STATUS_SUCCESS) { |
137 | DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R)){}; |
138 | } |
139 | } |
140 | } |
141 | |
142 | // Can't really copy or move a mutex |
143 | KernelArgPool() = default; |
144 | KernelArgPool(const KernelArgPool &) = delete; |
145 | KernelArgPool(KernelArgPool &&) = delete; |
146 | |
147 | KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool) |
148 | : KernargSegmentSize(KernargSegmentSize) { |
149 | |
150 | // impl uses one pool per kernel for all gpus, with a fixed upper size |
151 | // preserving that exact scheme here, including the queue<int> |
152 | |
153 | hsa_status_t Err = hsa_amd_memory_pool_allocate( |
154 | MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS(1024 * 16), 0, |
155 | &KernargRegion); |
156 | |
157 | if (Err != HSA_STATUS_SUCCESS) { |
158 | DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err)){}; |
159 | KernargRegion = nullptr; // paranoid |
160 | return; |
161 | } |
162 | |
163 | Err = core::allow_access_to_all_gpu_agents(KernargRegion); |
164 | if (Err != HSA_STATUS_SUCCESS) { |
165 | DP("hsa allow_access_to_all_gpu_agents failed: %s\n",{} |
166 | get_error_string(Err)){}; |
167 | auto R = hsa_amd_memory_pool_free(KernargRegion); |
168 | if (R != HSA_STATUS_SUCCESS) { |
169 | // if free failed, can't do anything more to resolve it |
170 | DP("hsa memory poll free failed: %s\n", get_error_string(Err)){}; |
171 | } |
172 | KernargRegion = nullptr; |
173 | return; |
174 | } |
175 | |
176 | for (int I = 0; I < MAX_NUM_KERNELS(1024 * 16); I++) { |
177 | FreeKernargSegments.push(I); |
178 | } |
179 | } |
180 | |
181 | void *allocate(uint64_t ArgNum) { |
182 | assert((ArgNum * sizeof(void *)) == KernargSegmentSize)(static_cast <bool> ((ArgNum * sizeof(void *)) == KernargSegmentSize ) ? void (0) : __assert_fail ("(ArgNum * sizeof(void *)) == KernargSegmentSize" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 182, __extension__ __PRETTY_FUNCTION__)); |
183 | Lock L(&Mutex); |
184 | void *Res = nullptr; |
185 | if (!FreeKernargSegments.empty()) { |
186 | |
187 | int FreeIdx = FreeKernargSegments.front(); |
188 | Res = static_cast<void *>(static_cast<char *>(KernargRegion) + |
189 | (FreeIdx * kernargSizeIncludingImplicit())); |
190 | assert(FreeIdx == pointerToIndex(Res))(static_cast <bool> (FreeIdx == pointerToIndex(Res)) ? void (0) : __assert_fail ("FreeIdx == pointerToIndex(Res)", "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp" , 190, __extension__ __PRETTY_FUNCTION__)); |
191 | FreeKernargSegments.pop(); |
192 | } |
193 | return Res; |
194 | } |
195 | |
196 | void deallocate(void *Ptr) { |
197 | Lock L(&Mutex); |
198 | int Idx = pointerToIndex(Ptr); |
199 | FreeKernargSegments.push(Idx); |
200 | } |
201 | |
202 | private: |
203 | int pointerToIndex(void *Ptr) { |
204 | ptrdiff_t Bytes = |
205 | static_cast<char *>(Ptr) - static_cast<char *>(KernargRegion); |
206 | assert(Bytes >= 0)(static_cast <bool> (Bytes >= 0) ? void (0) : __assert_fail ("Bytes >= 0", "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp" , 206, __extension__ __PRETTY_FUNCTION__)); |
207 | assert(Bytes % kernargSizeIncludingImplicit() == 0)(static_cast <bool> (Bytes % kernargSizeIncludingImplicit () == 0) ? void (0) : __assert_fail ("Bytes % kernargSizeIncludingImplicit() == 0" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 207, __extension__ __PRETTY_FUNCTION__)); |
208 | return Bytes / kernargSizeIncludingImplicit(); |
209 | } |
210 | struct Lock { |
211 | Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); } |
212 | ~Lock() { pthread_mutex_unlock(M); } |
213 | pthread_mutex_t *M; |
214 | }; |
215 | }; |
216 | pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER{ { 0, 0, 0, 0, PTHREAD_MUTEX_TIMED_NP, 0, 0, { 0, 0 } } }; |
217 | |
218 | std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>> |
219 | KernelArgPoolMap; |
220 | |
221 | /// Use a single entity to encode a kernel and a set of flags |
222 | struct KernelTy { |
223 | llvm::omp::OMPTgtExecModeFlags ExecutionMode; |
224 | int16_t ConstWGSize; |
225 | int32_t DeviceId; |
226 | void *CallStackAddr = nullptr; |
227 | const char *Name; |
228 | |
229 | KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize, |
230 | int32_t DeviceId, void *CallStackAddr, const char *Name, |
231 | uint32_t KernargSegmentSize, |
232 | hsa_amd_memory_pool_t &KernArgMemoryPool) |
233 | : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize), |
234 | DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) { |
235 | DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode){}; |
236 | |
237 | std::string N(Name); |
238 | if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) { |
239 | KernelArgPoolMap.insert( |
240 | std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool( |
241 | KernargSegmentSize, KernArgMemoryPool)))); |
242 | } |
243 | } |
244 | }; |
245 | |
246 | /// List that contains all the kernels. |
247 | /// FIXME: we may need this to be per device and per library. |
248 | std::list<KernelTy> KernelsList; |
249 | |
250 | template <typename Callback> static hsa_status_t findAgents(Callback CB) { |
251 | |
252 | hsa_status_t Err = |
253 | hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t { |
254 | hsa_device_type_t DeviceType; |
255 | // get_info fails iff HSA runtime not yet initialized |
256 | hsa_status_t Err = |
257 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType); |
258 | |
259 | if (Err != HSA_STATUS_SUCCESS) { |
260 | if (print_kernel_trace > 0) |
261 | DP("rtl.cpp: err %s\n", get_error_string(Err)){}; |
262 | |
263 | return Err; |
264 | } |
265 | |
266 | CB(DeviceType, Agent); |
267 | return HSA_STATUS_SUCCESS; |
268 | }); |
269 | |
270 | // iterate_agents fails iff HSA runtime not yet initialized |
271 | if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) { |
272 | DP("rtl.cpp: err %s\n", get_error_string(Err)){}; |
273 | } |
274 | |
275 | return Err; |
276 | } |
277 | |
278 | static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source, |
279 | void *Data) { |
280 | if (Status != HSA_STATUS_SUCCESS) { |
281 | const char *StatusString; |
282 | if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) { |
283 | StatusString = "unavailable"; |
284 | } |
285 | DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source,{} |
286 | Status, StatusString){}; |
287 | abort(); |
288 | } |
289 | } |
290 | |
291 | namespace core { |
292 | namespace { |
293 | |
294 | bool checkResult(hsa_status_t Err, const char *ErrMsg) { |
295 | if (Err == HSA_STATUS_SUCCESS) |
296 | return true; |
297 | |
298 | REPORT("%s", ErrMsg)do { fprintf(stderr, "AMDGPU" " error: "); fprintf(stderr, "%s" , ErrMsg); } while (0);; |
299 | REPORT("%s", get_error_string(Err))do { fprintf(stderr, "AMDGPU" " error: "); fprintf(stderr, "%s" , get_error_string(Err)); } while (0);; |
300 | return false; |
301 | } |
302 | |
303 | void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) { |
304 | __atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE3); |
305 | } |
306 | |
307 | uint16_t createHeader() { |
308 | uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; |
309 | Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; |
310 | Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; |
311 | return Header; |
312 | } |
313 | |
314 | hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) { |
315 | bool AllocAllowed = false; |
316 | hsa_status_t Err = hsa_amd_memory_pool_get_info( |
317 | MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, |
318 | &AllocAllowed); |
319 | if (Err != HSA_STATUS_SUCCESS) { |
320 | DP("Alloc allowed in memory pool check failed: %s\n",{} |
321 | get_error_string(Err)){}; |
322 | return Err; |
323 | } |
324 | |
325 | size_t Size = 0; |
326 | Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE, |
327 | &Size); |
328 | if (Err != HSA_STATUS_SUCCESS) { |
329 | DP("Get memory pool size failed: %s\n", get_error_string(Err)){}; |
330 | return Err; |
331 | } |
332 | |
333 | return (AllocAllowed && Size > 0) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; |
334 | } |
335 | |
336 | hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) { |
337 | std::vector<hsa_amd_memory_pool_t> *Result = |
338 | static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data); |
339 | |
340 | hsa_status_t Err; |
341 | if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) { |
342 | return Err; |
343 | } |
344 | |
345 | Result->push_back(MemoryPool); |
346 | return HSA_STATUS_SUCCESS; |
347 | } |
348 | |
349 | } // namespace |
350 | } // namespace core |
351 | |
352 | struct EnvironmentVariables { |
353 | int NumTeams; |
354 | int TeamLimit; |
355 | int TeamThreadLimit; |
356 | int MaxTeamsDefault; |
357 | int DynamicMemSize; |
358 | }; |
359 | |
360 | template <uint32_t wavesize> |
361 | static constexpr const llvm::omp::GV &getGridValue() { |
362 | return llvm::omp::getAMDGPUGridValues<wavesize>(); |
363 | } |
364 | |
365 | struct HSALifetime { |
366 | // Wrapper around HSA used to ensure it is constructed before other types |
367 | // and destructed after, which means said other types can use raii for |
368 | // cleanup without risking running outside of the lifetime of HSA |
369 | const hsa_status_t S; |
370 | |
371 | bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; } |
372 | HSALifetime() : S(hsa_init()) {} |
373 | |
374 | ~HSALifetime() { |
375 | if (S == HSA_STATUS_SUCCESS) { |
376 | hsa_status_t Err = hsa_shut_down(); |
377 | if (Err != HSA_STATUS_SUCCESS) { |
378 | // Can't call into HSA to get a string from the integer |
379 | DP("Shutting down HSA failed: %d\n", Err){}; |
380 | } |
381 | } |
382 | } |
383 | }; |
384 | |
385 | // Handle scheduling of multiple hsa_queue's per device to |
386 | // multiple threads (one scheduler per device) |
387 | class HSAQueueScheduler { |
388 | public: |
389 | HSAQueueScheduler() : Current(0) {} |
390 | |
391 | HSAQueueScheduler(const HSAQueueScheduler &) = delete; |
392 | |
393 | HSAQueueScheduler(HSAQueueScheduler &&Q) { |
394 | Current = Q.Current.load(); |
395 | for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { |
396 | HSAQueues[I] = Q.HSAQueues[I]; |
397 | Q.HSAQueues[I] = nullptr; |
398 | } |
399 | } |
400 | |
401 | // \return false if any HSA queue creation fails |
402 | bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) { |
403 | for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { |
404 | hsa_queue_t *Q = nullptr; |
405 | hsa_status_t Rc = |
406 | hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI, |
407 | callbackQueue, NULL__null, UINT32_MAX(4294967295U), UINT32_MAX(4294967295U), &Q); |
408 | if (Rc != HSA_STATUS_SUCCESS) { |
409 | DP("Failed to create HSA queue %d\n", I){}; |
410 | return false; |
411 | } |
412 | HSAQueues[I] = Q; |
413 | } |
414 | return true; |
415 | } |
416 | |
417 | ~HSAQueueScheduler() { |
418 | for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { |
419 | if (HSAQueues[I]) { |
420 | hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]); |
421 | if (Err != HSA_STATUS_SUCCESS) |
422 | DP("Error destroying HSA queue"){}; |
423 | } |
424 | } |
425 | } |
426 | |
427 | // \return next queue to use for device |
428 | hsa_queue_t *next() { |
429 | return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) % |
430 | NUM_QUEUES_PER_DEVICE]; |
431 | } |
432 | |
433 | private: |
434 | // Number of queues per device |
435 | enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 }; |
436 | hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {}; |
437 | std::atomic<uint8_t> Current; |
438 | }; |
439 | |
440 | /// Class containing all the device information |
441 | class RTLDeviceInfoTy : HSALifetime { |
442 | std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries; |
443 | |
444 | struct QueueDeleter { |
445 | void operator()(hsa_queue_t *Q) { |
446 | if (Q) { |
447 | hsa_status_t Err = hsa_queue_destroy(Q); |
448 | if (Err != HSA_STATUS_SUCCESS) { |
449 | DP("Error destroying hsa queue: %s\n", get_error_string(Err)){}; |
450 | } |
451 | } |
452 | } |
453 | }; |
454 | |
455 | public: |
456 | bool ConstructionSucceeded = false; |
457 | |
458 | // load binary populates symbol tables and mutates various global state |
459 | // run uses those symbol tables |
460 | std::shared_timed_mutex LoadRunLock; |
461 | |
462 | int NumberOfDevices = 0; |
463 | |
464 | // GPU devices |
465 | std::vector<hsa_agent_t> HSAAgents; |
466 | std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu |
467 | |
468 | // CPUs |
469 | std::vector<hsa_agent_t> CPUAgents; |
470 | |
471 | // Device properties |
472 | std::vector<int> ComputeUnits; |
473 | std::vector<int> GroupsPerDevice; |
474 | std::vector<int> ThreadsPerGroup; |
475 | std::vector<int> WarpSize; |
476 | std::vector<std::string> GPUName; |
477 | std::vector<std::string> TargetID; |
478 | |
479 | // OpenMP properties |
480 | std::vector<int> NumTeams; |
481 | std::vector<int> NumThreads; |
482 | |
483 | // OpenMP Environment properties |
484 | EnvironmentVariables Env; |
485 | |
486 | // OpenMP Requires Flags |
487 | int64_t RequiresFlags; |
488 | |
489 | // Resource pools |
490 | SignalPoolT FreeSignalPool; |
491 | |
492 | bool HostcallRequired = false; |
493 | |
494 | std::vector<hsa_executable_t> HSAExecutables; |
495 | |
496 | std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable; |
497 | std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable; |
498 | |
499 | hsa_amd_memory_pool_t KernArgPool; |
500 | |
501 | // fine grained memory pool for host allocations |
502 | hsa_amd_memory_pool_t HostFineGrainedMemoryPool; |
503 | |
504 | // fine and coarse-grained memory pools per offloading device |
505 | std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools; |
506 | std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools; |
507 | |
508 | struct ImplFreePtrDeletor { |
509 | void operator()(void *P) { |
510 | core::Runtime::Memfree(P); // ignore failure to free |
511 | } |
512 | }; |
513 | |
514 | // device_State shared across loaded binaries, error if inconsistent size |
515 | std::vector<std::pair<std::unique_ptr<void, ImplFreePtrDeletor>, uint64_t>> |
516 | DeviceStateStore; |
517 | |
518 | static const unsigned HardTeamLimit = |
519 | (1 << 16) - 1; // 64K needed to fit in uint16 |
520 | static const int DefaultNumTeams = 128; |
521 | |
522 | // These need to be per-device since different devices can have different |
523 | // wave sizes, but are currently the same number for each so that refactor |
524 | // can be postponed. |
525 | static_assert(getGridValue<32>().GV_Max_Teams == |
526 | getGridValue<64>().GV_Max_Teams, |
527 | ""); |
528 | static const int MaxTeams = getGridValue<64>().GV_Max_Teams; |
529 | |
530 | static_assert(getGridValue<32>().GV_Max_WG_Size == |
531 | getGridValue<64>().GV_Max_WG_Size, |
532 | ""); |
533 | static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size; |
534 | |
535 | static_assert(getGridValue<32>().GV_Default_WG_Size == |
536 | getGridValue<64>().GV_Default_WG_Size, |
537 | ""); |
538 | static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size; |
539 | |
540 | using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size, |
541 | hsa_agent_t, hsa_amd_memory_pool_t); |
542 | hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size, |
543 | MemcpyFunc Func, int32_t DeviceId) { |
544 | hsa_agent_t Agent = HSAAgents[DeviceId]; |
545 | hsa_signal_t S = FreeSignalPool.pop(); |
546 | if (S.handle == 0) { |
547 | return HSA_STATUS_ERROR; |
548 | } |
549 | hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool); |
550 | FreeSignalPool.push(S); |
551 | return R; |
552 | } |
553 | |
554 | hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size, |
555 | int32_t DeviceId) { |
556 | return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId); |
557 | } |
558 | |
559 | hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size, |
560 | int32_t DeviceId) { |
561 | return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId); |
562 | } |
563 | |
564 | static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) { |
565 | char TmpChar[1000]; |
566 | uint16_t Major, Minor; |
567 | uint32_t TmpUInt; |
568 | uint32_t TmpUInt2; |
569 | uint32_t CacheSize[4]; |
570 | bool TmpBool; |
571 | uint16_t WorkgroupMaxDim[3]; |
572 | hsa_dim3_t GridMaxDim; |
573 | |
574 | // Getting basic information about HSA and Device |
575 | core::checkResult( |
576 | hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major), |
577 | "Error from hsa_system_get_info when obtaining " |
578 | "HSA_SYSTEM_INFO_VERSION_MAJOR\n"); |
579 | core::checkResult( |
580 | hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor), |
581 | "Error from hsa_system_get_info when obtaining " |
582 | "HSA_SYSTEM_INFO_VERSION_MINOR\n"); |
583 | printf(" HSA Runtime Version: \t\t%u.%u \n", Major, Minor); |
584 | printf(" HSA OpenMP Device Number: \t\t%d \n", DeviceId); |
585 | core::checkResult( |
586 | hsa_agent_get_info( |
587 | Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar), |
588 | "Error returned from hsa_agent_get_info when obtaining " |
589 | "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n"); |
590 | printf(" Product Name: \t\t\t%s \n", TmpChar); |
591 | core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar), |
592 | "Error returned from hsa_agent_get_info when obtaining " |
593 | "HSA_AGENT_INFO_NAME\n"); |
594 | printf(" Device Name: \t\t\t%s \n", TmpChar); |
595 | core::checkResult( |
596 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar), |
597 | "Error returned from hsa_agent_get_info when obtaining " |
598 | "HSA_AGENT_INFO_NAME\n"); |
599 | printf(" Vendor Name: \t\t\t%s \n", TmpChar); |
600 | hsa_device_type_t DevType; |
601 | core::checkResult( |
602 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType), |
603 | "Error returned from hsa_agent_get_info when obtaining " |
604 | "HSA_AGENT_INFO_DEVICE\n"); |
605 | printf(" Device Type: \t\t\t%s \n", |
606 | DevType == HSA_DEVICE_TYPE_CPU |
607 | ? "CPU" |
608 | : (DevType == HSA_DEVICE_TYPE_GPU |
609 | ? "GPU" |
610 | : (DevType == HSA_DEVICE_TYPE_DSP ? "DSP" : "UNKNOWN"))); |
611 | core::checkResult( |
612 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt), |
613 | "Error returned from hsa_agent_get_info when obtaining " |
614 | "HSA_AGENT_INFO_QUEUES_MAX\n"); |
615 | printf(" Max Queues: \t\t\t%u \n", TmpUInt); |
616 | core::checkResult( |
617 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt), |
618 | "Error returned from hsa_agent_get_info when obtaining " |
619 | "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n"); |
620 | printf(" Queue Min Size: \t\t\t%u \n", TmpUInt); |
621 | core::checkResult( |
622 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt), |
623 | "Error returned from hsa_agent_get_info when obtaining " |
624 | "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n"); |
625 | printf(" Queue Max Size: \t\t\t%u \n", TmpUInt); |
626 | |
627 | // Getting cache information |
628 | printf(" Cache:\n"); |
629 | |
630 | // FIXME: This is deprecated according to HSA documentation. But using |
631 | // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during |
632 | // runtime. |
633 | core::checkResult( |
634 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize), |
635 | "Error returned from hsa_agent_get_info when obtaining " |
636 | "HSA_AGENT_INFO_CACHE_SIZE\n"); |
637 | |
638 | for (int I = 0; I < 4; I++) { |
639 | if (CacheSize[I]) { |
640 | printf(" L%u: \t\t\t\t%u bytes\n", I, CacheSize[I]); |
641 | } |
642 | } |
643 | |
644 | core::checkResult( |
645 | hsa_agent_get_info(Agent, |
646 | (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE, |
647 | &TmpUInt), |
648 | "Error returned from hsa_agent_get_info when obtaining " |
649 | "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n"); |
650 | printf(" Cacheline Size: \t\t\t%u \n", TmpUInt); |
651 | core::checkResult( |
652 | hsa_agent_get_info( |
653 | Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, |
654 | &TmpUInt), |
655 | "Error returned from hsa_agent_get_info when obtaining " |
656 | "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n"); |
657 | printf(" Max Clock Freq(MHz): \t\t%u \n", TmpUInt); |
658 | core::checkResult( |
659 | hsa_agent_get_info( |
660 | Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, |
661 | &TmpUInt), |
662 | "Error returned from hsa_agent_get_info when obtaining " |
663 | "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n"); |
664 | printf(" Compute Units: \t\t\t%u \n", TmpUInt); |
665 | core::checkResult(hsa_agent_get_info( |
666 | Agent, |
667 | (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, |
668 | &TmpUInt), |
669 | "Error returned from hsa_agent_get_info when obtaining " |
670 | "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n"); |
671 | printf(" SIMD per CU: \t\t\t%u \n", TmpUInt); |
672 | core::checkResult( |
673 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool), |
674 | "Error returned from hsa_agent_get_info when obtaining " |
675 | "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n"); |
676 | printf(" Fast F16 Operation: \t\t%s \n", (TmpBool ? "TRUE" : "FALSE")); |
677 | core::checkResult( |
678 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2), |
679 | "Error returned from hsa_agent_get_info when obtaining " |
680 | "HSA_AGENT_INFO_WAVEFRONT_SIZE\n"); |
681 | printf(" Wavefront Size: \t\t\t%u \n", TmpUInt2); |
682 | core::checkResult( |
683 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt), |
684 | "Error returned from hsa_agent_get_info when obtaining " |
685 | "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n"); |
686 | printf(" Workgroup Max Size: \t\t%u \n", TmpUInt); |
687 | core::checkResult(hsa_agent_get_info(Agent, |
688 | HSA_AGENT_INFO_WORKGROUP_MAX_DIM, |
689 | WorkgroupMaxDim), |
690 | "Error returned from hsa_agent_get_info when obtaining " |
691 | "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n"); |
692 | printf(" Workgroup Max Size per Dimension:\n"); |
693 | printf(" x: \t\t\t\t%u\n", WorkgroupMaxDim[0]); |
694 | printf(" y: \t\t\t\t%u\n", WorkgroupMaxDim[1]); |
695 | printf(" z: \t\t\t\t%u\n", WorkgroupMaxDim[2]); |
696 | core::checkResult(hsa_agent_get_info( |
697 | Agent, |
698 | (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, |
699 | &TmpUInt), |
700 | "Error returned from hsa_agent_get_info when obtaining " |
701 | "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n"); |
702 | printf(" Max Waves Per CU: \t\t\t%u \n", TmpUInt); |
703 | printf(" Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2); |
704 | core::checkResult( |
705 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt), |
706 | "Error returned from hsa_agent_get_info when obtaining " |
707 | "HSA_AGENT_INFO_GRID_MAX_SIZE\n"); |
708 | printf(" Grid Max Size: \t\t\t%u \n", TmpUInt); |
709 | core::checkResult( |
710 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim), |
711 | "Error returned from hsa_agent_get_info when obtaining " |
712 | "HSA_AGENT_INFO_GRID_MAX_DIM\n"); |
713 | printf(" Grid Max Size per Dimension: \t\t\n"); |
714 | printf(" x: \t\t\t\t%u\n", GridMaxDim.x); |
715 | printf(" y: \t\t\t\t%u\n", GridMaxDim.y); |
716 | printf(" z: \t\t\t\t%u\n", GridMaxDim.z); |
717 | core::checkResult( |
718 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt), |
719 | "Error returned from hsa_agent_get_info when obtaining " |
720 | "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n"); |
721 | printf(" Max fbarriers/Workgrp: \t\t%u\n", TmpUInt); |
722 | |
723 | printf(" Memory Pools:\n"); |
724 | auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t { |
725 | std::string TmpStr; |
726 | size_t Size; |
727 | bool Alloc, Access; |
728 | hsa_amd_segment_t Segment; |
729 | hsa_amd_memory_pool_global_flag_t GlobalFlags; |
730 | core::checkResult( |
731 | hsa_amd_memory_pool_get_info( |
732 | Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags), |
733 | "Error returned from hsa_amd_memory_pool_get_info when obtaining " |
734 | "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n"); |
735 | core::checkResult(hsa_amd_memory_pool_get_info( |
736 | Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment), |
737 | "Error returned from hsa_amd_memory_pool_get_info when " |
738 | "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n"); |
739 | |
740 | switch (Segment) { |
741 | case HSA_AMD_SEGMENT_GLOBAL: |
742 | TmpStr = "GLOBAL; FLAGS: "; |
743 | if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags) |
744 | TmpStr += "KERNARG, "; |
745 | if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags) |
746 | TmpStr += "FINE GRAINED, "; |
747 | if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags) |
748 | TmpStr += "COARSE GRAINED, "; |
749 | break; |
750 | case HSA_AMD_SEGMENT_READONLY: |
751 | TmpStr = "READONLY"; |
752 | break; |
753 | case HSA_AMD_SEGMENT_PRIVATE: |
754 | TmpStr = "PRIVATE"; |
755 | break; |
756 | case HSA_AMD_SEGMENT_GROUP: |
757 | TmpStr = "GROUP"; |
758 | break; |
759 | } |
760 | printf(" Pool %s: \n", TmpStr.c_str()); |
761 | |
762 | core::checkResult(hsa_amd_memory_pool_get_info( |
763 | Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size), |
764 | "Error returned from hsa_amd_memory_pool_get_info when " |
765 | "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n"); |
766 | printf(" Size: \t\t\t\t %zu bytes\n", Size); |
767 | core::checkResult( |
768 | hsa_amd_memory_pool_get_info( |
769 | Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc), |
770 | "Error returned from hsa_amd_memory_pool_get_info when obtaining " |
771 | "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n"); |
772 | printf(" Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE")); |
773 | core::checkResult( |
774 | hsa_amd_memory_pool_get_info( |
775 | Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size), |
776 | "Error returned from hsa_amd_memory_pool_get_info when obtaining " |
777 | "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n"); |
778 | printf(" Runtime Alloc Granule: \t\t %zu bytes\n", Size); |
779 | core::checkResult( |
780 | hsa_amd_memory_pool_get_info( |
781 | Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size), |
782 | "Error returned from hsa_amd_memory_pool_get_info when obtaining " |
783 | "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n"); |
784 | printf(" Runtime Alloc alignment: \t %zu bytes\n", Size); |
785 | core::checkResult( |
786 | hsa_amd_memory_pool_get_info( |
787 | Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access), |
788 | "Error returned from hsa_amd_memory_pool_get_info when obtaining " |
789 | "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n"); |
790 | printf(" Accessable by all: \t\t %s\n", |
791 | (Access ? "TRUE" : "FALSE")); |
792 | |
793 | return HSA_STATUS_SUCCESS; |
794 | }; |
795 | // Iterate over all the memory regions for this agent. Get the memory region |
796 | // type and size |
797 | hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr); |
798 | |
799 | printf(" ISAs:\n"); |
800 | auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t { |
801 | char TmpChar[1000]; |
802 | core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar), |
803 | "Error returned from hsa_isa_get_info_alt when " |
804 | "obtaining HSA_ISA_INFO_NAME\n"); |
805 | printf(" Name: \t\t\t\t %s\n", TmpChar); |
806 | |
807 | return HSA_STATUS_SUCCESS; |
808 | }; |
809 | // Iterate over all the memory regions for this agent. Get the memory region |
810 | // type and size |
811 | hsa_agent_iterate_isas(Agent, CBIsas, nullptr); |
812 | } |
813 | |
814 | // Record entry point associated with device |
815 | void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) { |
816 | assert(DeviceId < (int32_t)FuncGblEntries.size() &&(static_cast <bool> (DeviceId < (int32_t)FuncGblEntries .size() && "Unexpected device id!") ? void (0) : __assert_fail ("DeviceId < (int32_t)FuncGblEntries.size() && \"Unexpected device id!\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 817, __extension__ __PRETTY_FUNCTION__)) |
817 | "Unexpected device id!")(static_cast <bool> (DeviceId < (int32_t)FuncGblEntries .size() && "Unexpected device id!") ? void (0) : __assert_fail ("DeviceId < (int32_t)FuncGblEntries.size() && \"Unexpected device id!\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 817, __extension__ __PRETTY_FUNCTION__)); |
818 | FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); |
819 | |
820 | E.Entries.push_back(Entry); |
821 | } |
822 | |
823 | // Return true if the entry is associated with device |
824 | bool findOffloadEntry(int32_t DeviceId, void *Addr) { |
825 | assert(DeviceId < (int32_t)FuncGblEntries.size() &&(static_cast <bool> (DeviceId < (int32_t)FuncGblEntries .size() && "Unexpected device id!") ? void (0) : __assert_fail ("DeviceId < (int32_t)FuncGblEntries.size() && \"Unexpected device id!\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 826, __extension__ __PRETTY_FUNCTION__)) |
826 | "Unexpected device id!")(static_cast <bool> (DeviceId < (int32_t)FuncGblEntries .size() && "Unexpected device id!") ? void (0) : __assert_fail ("DeviceId < (int32_t)FuncGblEntries.size() && \"Unexpected device id!\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 826, __extension__ __PRETTY_FUNCTION__)); |
827 | FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); |
828 | |
829 | for (auto &It : E.Entries) { |
830 | if (It.addr == Addr) |
831 | return true; |
832 | } |
833 | |
834 | return false; |
835 | } |
836 | |
837 | // Return the pointer to the target entries table |
838 | __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) { |
839 | assert(DeviceId < (int32_t)FuncGblEntries.size() &&(static_cast <bool> (DeviceId < (int32_t)FuncGblEntries .size() && "Unexpected device id!") ? void (0) : __assert_fail ("DeviceId < (int32_t)FuncGblEntries.size() && \"Unexpected device id!\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 840, __extension__ __PRETTY_FUNCTION__)) |
840 | "Unexpected device id!")(static_cast <bool> (DeviceId < (int32_t)FuncGblEntries .size() && "Unexpected device id!") ? void (0) : __assert_fail ("DeviceId < (int32_t)FuncGblEntries.size() && \"Unexpected device id!\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 840, __extension__ __PRETTY_FUNCTION__)); |
841 | FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); |
842 | |
843 | int32_t Size = E.Entries.size(); |
844 | |
845 | // Table is empty |
846 | if (!Size) |
847 | return 0; |
848 | |
849 | __tgt_offload_entry *Begin = &E.Entries[0]; |
850 | __tgt_offload_entry *End = &E.Entries[Size - 1]; |
851 | |
852 | // Update table info according to the entries and return the pointer |
853 | E.Table.EntriesBegin = Begin; |
854 | E.Table.EntriesEnd = ++End; |
855 | |
856 | return &E.Table; |
857 | } |
858 | |
859 | // Clear entries table for a device |
860 | void clearOffloadEntriesTable(int DeviceId) { |
861 | assert(DeviceId < (int32_t)FuncGblEntries.size() &&(static_cast <bool> (DeviceId < (int32_t)FuncGblEntries .size() && "Unexpected device id!") ? void (0) : __assert_fail ("DeviceId < (int32_t)FuncGblEntries.size() && \"Unexpected device id!\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 862, __extension__ __PRETTY_FUNCTION__)) |
862 | "Unexpected device id!")(static_cast <bool> (DeviceId < (int32_t)FuncGblEntries .size() && "Unexpected device id!") ? void (0) : __assert_fail ("DeviceId < (int32_t)FuncGblEntries.size() && \"Unexpected device id!\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 862, __extension__ __PRETTY_FUNCTION__)); |
863 | FuncGblEntries[DeviceId].emplace_back(); |
864 | FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); |
865 | // KernelArgPoolMap.clear(); |
866 | E.Entries.clear(); |
867 | E.Table.EntriesBegin = E.Table.EntriesEnd = 0; |
868 | } |
869 | |
870 | hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool, |
871 | unsigned int DeviceId) { |
872 | assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here.")(static_cast <bool> (DeviceId < DeviceFineGrainedMemoryPools .size() && "Error here.") ? void (0) : __assert_fail ( "DeviceId < DeviceFineGrainedMemoryPools.size() && \"Error here.\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 872, __extension__ __PRETTY_FUNCTION__)); |
873 | uint32_t GlobalFlags = 0; |
874 | hsa_status_t Err = hsa_amd_memory_pool_get_info( |
875 | MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); |
876 | |
877 | if (Err != HSA_STATUS_SUCCESS) { |
878 | return Err; |
879 | } |
880 | |
881 | if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { |
882 | DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool; |
883 | } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) { |
884 | DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool; |
885 | } |
886 | |
887 | return HSA_STATUS_SUCCESS; |
888 | } |
889 | |
890 | hsa_status_t setupDevicePools(const std::vector<hsa_agent_t> &Agents) { |
891 | for (unsigned int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) { |
892 | hsa_status_t Err = hsa::amd_agent_iterate_memory_pools( |
893 | Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) { |
894 | hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool); |
895 | if (ValidStatus != HSA_STATUS_SUCCESS) { |
896 | DP("Alloc allowed in memory pool check failed: %s\n",{} |
897 | get_error_string(ValidStatus)){}; |
898 | return HSA_STATUS_SUCCESS; |
899 | } |
900 | return addDeviceMemoryPool(MemoryPool, DeviceId); |
901 | }); |
902 | |
903 | if (Err != HSA_STATUS_SUCCESS) { |
904 | DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,{} |
905 | "Iterate all memory pools", get_error_string(Err)){}; |
906 | return Err; |
907 | } |
908 | } |
909 | return HSA_STATUS_SUCCESS; |
910 | } |
911 | |
912 | hsa_status_t setupHostMemoryPools(std::vector<hsa_agent_t> &Agents) { |
913 | std::vector<hsa_amd_memory_pool_t> HostPools; |
914 | |
915 | // collect all the "valid" pools for all the given agents. |
916 | for (const auto &Agent : Agents) { |
917 | hsa_status_t Err = hsa_amd_agent_iterate_memory_pools( |
918 | Agent, core::addMemoryPool, static_cast<void *>(&HostPools)); |
919 | if (Err != HSA_STATUS_SUCCESS) { |
920 | DP("addMemoryPool returned %s, continuing\n", get_error_string(Err)){}; |
921 | } |
922 | } |
923 | |
924 | // We need two fine-grained pools. |
925 | // 1. One with kernarg flag set for storing kernel arguments |
926 | // 2. Second for host allocations |
927 | bool FineGrainedMemoryPoolSet = false; |
928 | bool KernArgPoolSet = false; |
929 | for (const auto &MemoryPool : HostPools) { |
930 | hsa_status_t Err = HSA_STATUS_SUCCESS; |
931 | uint32_t GlobalFlags = 0; |
932 | Err = hsa_amd_memory_pool_get_info( |
933 | MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); |
934 | if (Err != HSA_STATUS_SUCCESS) { |
935 | DP("Get memory pool info failed: %s\n", get_error_string(Err)){}; |
936 | return Err; |
937 | } |
938 | |
939 | if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { |
940 | if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) { |
941 | KernArgPool = MemoryPool; |
942 | KernArgPoolSet = true; |
943 | } else { |
944 | HostFineGrainedMemoryPool = MemoryPool; |
945 | FineGrainedMemoryPoolSet = true; |
946 | } |
947 | } |
948 | } |
949 | |
950 | if (FineGrainedMemoryPoolSet && KernArgPoolSet) |
951 | return HSA_STATUS_SUCCESS; |
952 | |
953 | return HSA_STATUS_ERROR; |
954 | } |
955 | |
956 | hsa_amd_memory_pool_t getDeviceMemoryPool(unsigned int DeviceId) { |
957 | assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() &&(static_cast <bool> (DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() && "Invalid device Id" ) ? void (0) : __assert_fail ("DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() && \"Invalid device Id\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 958, __extension__ __PRETTY_FUNCTION__)) |
958 | "Invalid device Id")(static_cast <bool> (DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() && "Invalid device Id" ) ? void (0) : __assert_fail ("DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() && \"Invalid device Id\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 958, __extension__ __PRETTY_FUNCTION__)); |
959 | return DeviceCoarseGrainedMemoryPools[DeviceId]; |
960 | } |
961 | |
962 | hsa_amd_memory_pool_t getHostMemoryPool() { |
963 | return HostFineGrainedMemoryPool; |
964 | } |
965 | |
966 | static int readEnv(const char *Env, int Default = -1) { |
967 | const char *EnvStr = getenv(Env); |
968 | int Res = Default; |
969 | if (EnvStr) { |
970 | Res = std::stoi(EnvStr); |
971 | DP("Parsed %s=%d\n", Env, Res){}; |
972 | } |
973 | return Res; |
974 | } |
975 | |
976 | RTLDeviceInfoTy() { |
977 | DP("Start initializing " GETNAME(TARGET_NAME) "\n"){}; |
978 | |
979 | // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr |
980 | // anytime. You do not need a debug library build. |
981 | // 0 => no tracing |
982 | // 1 => tracing dispatch only |
983 | // >1 => verbosity increase |
984 | |
985 | if (!HSAInitSuccess()) { |
986 | DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n"){}; |
987 | return; |
988 | } |
989 | |
990 | if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE")) |
991 | print_kernel_trace = atoi(EnvStr); |
992 | else |
993 | print_kernel_trace = 0; |
994 | |
995 | hsa_status_t Err = core::atl_init_gpu_context(); |
996 | if (Err != HSA_STATUS_SUCCESS) { |
997 | DP("Error when initializing " GETNAME(TARGET_NAME) "\n"){}; |
998 | return; |
999 | } |
1000 | |
1001 | // Init hostcall soon after initializing hsa |
1002 | hostrpc_init(); |
1003 | |
1004 | Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) { |
1005 | if (DeviceType == HSA_DEVICE_TYPE_CPU) { |
1006 | CPUAgents.push_back(Agent); |
1007 | } else { |
1008 | HSAAgents.push_back(Agent); |
1009 | } |
1010 | }); |
1011 | if (Err != HSA_STATUS_SUCCESS) |
1012 | return; |
1013 | |
1014 | NumberOfDevices = (int)HSAAgents.size(); |
1015 | |
1016 | if (NumberOfDevices == 0) { |
1017 | DP("There are no devices supporting HSA.\n"){}; |
1018 | return; |
1019 | } |
1020 | DP("There are %d devices supporting HSA.\n", NumberOfDevices){}; |
1021 | |
1022 | // Init the device info |
1023 | HSAQueueSchedulers.reserve(NumberOfDevices); |
1024 | FuncGblEntries.resize(NumberOfDevices); |
1025 | ThreadsPerGroup.resize(NumberOfDevices); |
1026 | ComputeUnits.resize(NumberOfDevices); |
1027 | GPUName.resize(NumberOfDevices); |
1028 | GroupsPerDevice.resize(NumberOfDevices); |
1029 | WarpSize.resize(NumberOfDevices); |
1030 | NumTeams.resize(NumberOfDevices); |
1031 | NumThreads.resize(NumberOfDevices); |
1032 | DeviceStateStore.resize(NumberOfDevices); |
1033 | KernelInfoTable.resize(NumberOfDevices); |
1034 | SymbolInfoTable.resize(NumberOfDevices); |
1035 | DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices); |
1036 | DeviceFineGrainedMemoryPools.resize(NumberOfDevices); |
1037 | |
1038 | Err = setupDevicePools(HSAAgents); |
1039 | if (Err != HSA_STATUS_SUCCESS) { |
1040 | DP("Setup for Device Memory Pools failed\n"){}; |
1041 | return; |
1042 | } |
1043 | |
1044 | Err = setupHostMemoryPools(CPUAgents); |
1045 | if (Err != HSA_STATUS_SUCCESS) { |
1046 | DP("Setup for Host Memory Pools failed\n"){}; |
1047 | return; |
1048 | } |
1049 | |
1050 | for (int I = 0; I < NumberOfDevices; I++) { |
1051 | uint32_t QueueSize = 0; |
1052 | { |
1053 | hsa_status_t Err = hsa_agent_get_info( |
1054 | HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize); |
1055 | if (Err != HSA_STATUS_SUCCESS) { |
1056 | DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I){}; |
1057 | return; |
1058 | } |
1059 | enum { MaxQueueSize = 4096 }; |
1060 | if (QueueSize > MaxQueueSize) { |
1061 | QueueSize = MaxQueueSize; |
1062 | } |
1063 | } |
1064 | |
1065 | { |
1066 | HSAQueueScheduler QSched; |
1067 | if (!QSched.createQueues(HSAAgents[I], QueueSize)) |
1068 | return; |
1069 | HSAQueueSchedulers.emplace_back(std::move(QSched)); |
1070 | } |
1071 | |
1072 | DeviceStateStore[I] = {nullptr, 0}; |
1073 | } |
1074 | |
1075 | for (int I = 0; I < NumberOfDevices; I++) { |
1076 | ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize; |
1077 | GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams; |
1078 | ComputeUnits[I] = 1; |
1079 | DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I,{} |
1080 | GroupsPerDevice[I], ThreadsPerGroup[I]){}; |
1081 | } |
1082 | |
1083 | // Get environment variables regarding teams |
1084 | Env.TeamLimit = readEnv("OMP_TEAM_LIMIT"); |
1085 | Env.NumTeams = readEnv("OMP_NUM_TEAMS"); |
1086 | Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT"); |
1087 | Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT"); |
1088 | Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0); |
1089 | |
1090 | // Default state. |
1091 | RequiresFlags = OMP_REQ_UNDEFINED; |
1092 | |
1093 | ConstructionSucceeded = true; |
1094 | } |
1095 | |
1096 | ~RTLDeviceInfoTy() { |
1097 | DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n"){}; |
1098 | if (!HSAInitSuccess()) { |
1099 | // Then none of these can have been set up and they can't be torn down |
1100 | return; |
1101 | } |
1102 | // Run destructors on types that use HSA before |
1103 | // impl_finalize removes access to it |
1104 | DeviceStateStore.clear(); |
1105 | KernelArgPoolMap.clear(); |
1106 | // Terminate hostrpc before finalizing hsa |
1107 | hostrpc_terminate(); |
1108 | |
1109 | hsa_status_t Err; |
1110 | for (uint32_t I = 0; I < HSAExecutables.size(); I++) { |
1111 | Err = hsa_executable_destroy(HSAExecutables[I]); |
1112 | if (Err != HSA_STATUS_SUCCESS) { |
1113 | DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,{} |
1114 | "Destroying executable", get_error_string(Err)){}; |
1115 | } |
1116 | } |
1117 | } |
1118 | }; |
1119 | |
1120 | pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER{ { 0, 0, 0, 0, PTHREAD_MUTEX_TIMED_NP, 0, 0, { 0, 0 } } }; |
1121 | |
1122 | // Putting accesses to DeviceInfo global behind a function call prior |
1123 | // to changing to use init_plugin/deinit_plugin calls |
1124 | static RTLDeviceInfoTy DeviceInfoState; |
1125 | static RTLDeviceInfoTy &DeviceInfo() { return DeviceInfoState; } |
1126 | |
1127 | namespace { |
1128 | |
1129 | int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size, |
1130 | __tgt_async_info *AsyncInfo) { |
1131 | assert(AsyncInfo && "AsyncInfo is nullptr")(static_cast <bool> (AsyncInfo && "AsyncInfo is nullptr" ) ? void (0) : __assert_fail ("AsyncInfo && \"AsyncInfo is nullptr\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 1131, __extension__ __PRETTY_FUNCTION__)); |
1132 | assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large")(static_cast <bool> (DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large") ? void (0) : __assert_fail ("DeviceId < DeviceInfo().NumberOfDevices && \"Device ID too large\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 1132, __extension__ __PRETTY_FUNCTION__)); |
1133 | // Return success if we are not copying back to host from target. |
1134 | if (!HstPtr) |
1135 | return OFFLOAD_SUCCESS(0); |
1136 | hsa_status_t Err; |
1137 | DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,{} |
1138 | (long long unsigned)(Elf64_Addr)TgtPtr,{} |
1139 | (long long unsigned)(Elf64_Addr)HstPtr){}; |
1140 | |
1141 | Err = DeviceInfo().freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size, |
1142 | DeviceId); |
1143 | |
1144 | if (Err != HSA_STATUS_SUCCESS) { |
1145 | DP("Error when copying data from device to host. Pointers: "{} |
1146 | "host = 0x%016lx, device = 0x%016lx, size = %lld\n",{} |
1147 | (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size){}; |
1148 | return OFFLOAD_FAIL(~0); |
1149 | } |
1150 | DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,{} |
1151 | (long long unsigned)(Elf64_Addr)TgtPtr,{} |
1152 | (long long unsigned)(Elf64_Addr)HstPtr){}; |
1153 | return OFFLOAD_SUCCESS(0); |
1154 | } |
1155 | |
1156 | int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size, |
1157 | __tgt_async_info *AsyncInfo) { |
1158 | assert(AsyncInfo && "AsyncInfo is nullptr")(static_cast <bool> (AsyncInfo && "AsyncInfo is nullptr" ) ? void (0) : __assert_fail ("AsyncInfo && \"AsyncInfo is nullptr\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 1158, __extension__ __PRETTY_FUNCTION__)); |
1159 | hsa_status_t Err; |
1160 | assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large")(static_cast <bool> (DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large") ? void (0) : __assert_fail ("DeviceId < DeviceInfo().NumberOfDevices && \"Device ID too large\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 1160, __extension__ __PRETTY_FUNCTION__)); |
1161 | // Return success if we are not doing host to target. |
1162 | if (!HstPtr) |
1163 | return OFFLOAD_SUCCESS(0); |
1164 | |
1165 | DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,{} |
1166 | (long long unsigned)(Elf64_Addr)HstPtr,{} |
1167 | (long long unsigned)(Elf64_Addr)TgtPtr){}; |
1168 | Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size, |
1169 | DeviceId); |
1170 | if (Err != HSA_STATUS_SUCCESS) { |
1171 | DP("Error when copying data from host to device. Pointers: "{} |
1172 | "host = 0x%016lx, device = 0x%016lx, size = %lld\n",{} |
1173 | (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size){}; |
1174 | return OFFLOAD_FAIL(~0); |
1175 | } |
1176 | return OFFLOAD_SUCCESS(0); |
1177 | } |
1178 | |
1179 | // Async. |
1180 | // The implementation was written with cuda streams in mind. The semantics of |
1181 | // that are to execute kernels on a queue in order of insertion. A synchronise |
1182 | // call then makes writes visible between host and device. This means a series |
1183 | // of N data_submit_async calls are expected to execute serially. HSA offers |
1184 | // various options to run the data copies concurrently. This may require changes |
1185 | // to libomptarget. |
1186 | |
1187 | // __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that |
1188 | // there are no outstanding kernels that need to be synchronized. Any async call |
1189 | // may be passed a Queue==0, at which point the cuda implementation will set it |
1190 | // to non-null (see getStream). The cuda streams are per-device. Upstream may |
1191 | // change this interface to explicitly initialize the AsyncInfo_pointer, but |
1192 | // until then hsa lazily initializes it as well. |
1193 | |
1194 | void initAsyncInfo(__tgt_async_info *AsyncInfo) { |
1195 | // set non-null while using async calls, return to null to indicate completion |
1196 | assert(AsyncInfo)(static_cast <bool> (AsyncInfo) ? void (0) : __assert_fail ("AsyncInfo", "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp" , 1196, __extension__ __PRETTY_FUNCTION__)); |
1197 | if (!AsyncInfo->Queue) { |
1198 | AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX(18446744073709551615UL)); |
1199 | } |
1200 | } |
1201 | void finiAsyncInfo(__tgt_async_info *AsyncInfo) { |
1202 | assert(AsyncInfo)(static_cast <bool> (AsyncInfo) ? void (0) : __assert_fail ("AsyncInfo", "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp" , 1202, __extension__ __PRETTY_FUNCTION__)); |
1203 | assert(AsyncInfo->Queue)(static_cast <bool> (AsyncInfo->Queue) ? void (0) : __assert_fail ("AsyncInfo->Queue", "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp" , 1203, __extension__ __PRETTY_FUNCTION__)); |
1204 | AsyncInfo->Queue = 0; |
1205 | } |
1206 | |
1207 | // Determine launch values for kernel. |
1208 | struct LaunchVals { |
1209 | int WorkgroupSize; |
1210 | int GridSize; |
1211 | }; |
1212 | LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env, |
1213 | int ConstWGSize, |
1214 | llvm::omp::OMPTgtExecModeFlags ExecutionMode, |
1215 | int NumTeams, int ThreadLimit, uint64_t LoopTripcount, |
1216 | int DeviceNumTeams) { |
1217 | |
1218 | int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize; |
1219 | int NumGroups = 0; |
1220 | |
1221 | int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams; |
1222 | if (MaxTeams > static_cast<int>(RTLDeviceInfoTy::HardTeamLimit)) |
1223 | MaxTeams = RTLDeviceInfoTy::HardTeamLimit; |
1224 | |
1225 | if (print_kernel_trace & STARTUP_DETAILS) { |
1226 | DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams){}; |
1227 | DP("Max_Teams: %d\n", MaxTeams){}; |
1228 | DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize){}; |
1229 | DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize){}; |
1230 | DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",{} |
1231 | RTLDeviceInfoTy::DefaultWgSize){}; |
1232 | DP("thread_limit: %d\n", ThreadLimit){}; |
1233 | DP("threadsPerGroup: %d\n", ThreadsPerGroup){}; |
1234 | DP("ConstWGSize: %d\n", ConstWGSize){}; |
1235 | } |
1236 | // check for thread_limit() clause |
1237 | if (ThreadLimit > 0) { |
1238 | ThreadsPerGroup = ThreadLimit; |
1239 | DP("Setting threads per block to requested %d\n", ThreadLimit){}; |
1240 | // Add master warp for GENERIC |
1241 | if (ExecutionMode == |
1242 | llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { |
1243 | ThreadsPerGroup += WarpSize; |
1244 | DP("Adding master wavefront: +%d threads\n", WarpSize){}; |
1245 | } |
1246 | if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max |
1247 | ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize; |
1248 | DP("Setting threads per block to maximum %d\n", ThreadsPerGroup){}; |
1249 | } |
1250 | } |
1251 | // check flat_max_work_group_size attr here |
1252 | if (ThreadsPerGroup > ConstWGSize) { |
1253 | ThreadsPerGroup = ConstWGSize; |
1254 | DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",{} |
1255 | ThreadsPerGroup){}; |
1256 | } |
1257 | if (print_kernel_trace & STARTUP_DETAILS) |
1258 | DP("threadsPerGroup: %d\n", ThreadsPerGroup){}; |
1259 | DP("Preparing %d threads\n", ThreadsPerGroup){}; |
1260 | |
1261 | // Set default num_groups (teams) |
1262 | if (Env.TeamLimit > 0) |
1263 | NumGroups = (MaxTeams < Env.TeamLimit) ? MaxTeams : Env.TeamLimit; |
1264 | else |
1265 | NumGroups = MaxTeams; |
1266 | DP("Set default num of groups %d\n", NumGroups){}; |
1267 | |
1268 | if (print_kernel_trace & STARTUP_DETAILS) { |
1269 | DP("num_groups: %d\n", NumGroups){}; |
1270 | DP("num_teams: %d\n", NumTeams){}; |
1271 | } |
1272 | |
1273 | // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size |
1274 | // This reduction is typical for default case (no thread_limit clause). |
1275 | // or when user goes crazy with num_teams clause. |
1276 | // FIXME: We cant distinguish between a constant or variable thread limit. |
1277 | // So we only handle constant thread_limits. |
1278 | if (ThreadsPerGroup > |
1279 | RTLDeviceInfoTy::DefaultWgSize) // 256 < threadsPerGroup <= 1024 |
1280 | // Should we round threadsPerGroup up to nearest WarpSize |
1281 | // here? |
1282 | NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup; |
1283 | |
1284 | // check for num_teams() clause |
1285 | if (NumTeams > 0) { |
1286 | NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups; |
1287 | } |
1288 | if (print_kernel_trace & STARTUP_DETAILS) { |
1289 | DP("num_groups: %d\n", NumGroups){}; |
1290 | DP("Env.NumTeams %d\n", Env.NumTeams){}; |
1291 | DP("Env.TeamLimit %d\n", Env.TeamLimit){}; |
1292 | } |
1293 | |
1294 | if (Env.NumTeams > 0) { |
1295 | NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups; |
1296 | DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams){}; |
1297 | } else if (Env.TeamLimit > 0) { |
1298 | NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups; |
1299 | DP("Modifying teams based on Env.TeamLimit%d\n", Env.TeamLimit){}; |
1300 | } else { |
1301 | if (NumTeams <= 0) { |
1302 | if (LoopTripcount > 0) { |
1303 | if (ExecutionMode == |
1304 | llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) { |
1305 | // round up to the nearest integer |
1306 | NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1; |
1307 | } else if (ExecutionMode == |
1308 | llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { |
1309 | NumGroups = LoopTripcount; |
1310 | } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ { |
1311 | // This is a generic kernel that was transformed to use SPMD-mode |
1312 | // execution but uses Generic-mode semantics for scheduling. |
1313 | NumGroups = LoopTripcount; |
1314 | } |
1315 | DP("Using %d teams due to loop trip count %" PRIu64 " and number of "{} |
1316 | "threads per block %d\n",{} |
1317 | NumGroups, LoopTripcount, ThreadsPerGroup){}; |
1318 | } |
1319 | } else { |
1320 | NumGroups = NumTeams; |
1321 | } |
1322 | if (NumGroups > MaxTeams) { |
1323 | NumGroups = MaxTeams; |
1324 | if (print_kernel_trace & STARTUP_DETAILS) |
1325 | DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams){}; |
1326 | } |
1327 | if (NumGroups > NumTeams && NumTeams > 0) { |
1328 | NumGroups = NumTeams; |
1329 | if (print_kernel_trace & STARTUP_DETAILS) |
1330 | DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups,{} |
1331 | NumTeams){}; |
1332 | } |
1333 | } |
1334 | |
1335 | // num_teams clause always honored, no matter what, unless DEFAULT is active. |
1336 | if (NumTeams > 0) { |
1337 | NumGroups = NumTeams; |
1338 | // Cap num_groups to EnvMaxTeamsDefault if set. |
1339 | if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault) |
1340 | NumGroups = Env.MaxTeamsDefault; |
1341 | } |
1342 | if (print_kernel_trace & STARTUP_DETAILS) { |
1343 | DP("threadsPerGroup: %d\n", ThreadsPerGroup){}; |
1344 | DP("num_groups: %d\n", NumGroups){}; |
1345 | DP("loop_tripcount: %ld\n", LoopTripcount){}; |
1346 | } |
1347 | DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups,{} |
1348 | ThreadsPerGroup){}; |
1349 | |
1350 | LaunchVals Res; |
1351 | Res.WorkgroupSize = ThreadsPerGroup; |
1352 | Res.GridSize = ThreadsPerGroup * NumGroups; |
1353 | return Res; |
1354 | } |
1355 | |
1356 | static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) { |
1357 | uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1); |
1358 | bool Full = true; |
1359 | while (Full) { |
1360 | Full = |
1361 | PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue)); |
1362 | } |
1363 | return PacketId; |
1364 | } |
1365 | |
1366 | int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, |
1367 | ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams, |
1368 | int32_t ThreadLimit, uint64_t LoopTripcount) { |
1369 | // Set the context we are using |
1370 | // update thread limit content in gpu memory if un-initialized or specified |
1371 | // from host |
1372 | |
1373 | DP("Run target team region thread_limit %d\n", ThreadLimit){}; |
1374 | |
1375 | // All args are references. |
1376 | std::vector<void *> Args(ArgNum); |
1377 | std::vector<void *> Ptrs(ArgNum); |
1378 | |
1379 | DP("Arg_num: %d\n", ArgNum){}; |
1380 | for (int32_t I = 0; I < ArgNum; ++I) { |
1381 | Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); |
1382 | Args[I] = &Ptrs[I]; |
1383 | DP("Offseted base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I])){}; |
1384 | } |
1385 | |
1386 | KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr; |
1387 | |
1388 | std::string KernelName = std::string(KernelInfo->Name); |
1389 | auto &KernelInfoTable = DeviceInfo().KernelInfoTable; |
1390 | if (KernelInfoTable[DeviceId].find(KernelName) == |
1391 | KernelInfoTable[DeviceId].end()) { |
1392 | DP("Kernel %s not found\n", KernelName.c_str()){}; |
1393 | return OFFLOAD_FAIL(~0); |
1394 | } |
1395 | |
1396 | const atl_kernel_info_t KernelInfoEntry = |
1397 | KernelInfoTable[DeviceId][KernelName]; |
1398 | const uint32_t GroupSegmentSize = |
1399 | KernelInfoEntry.group_segment_size + DeviceInfo().Env.DynamicMemSize; |
1400 | const uint32_t SgprCount = KernelInfoEntry.sgpr_count; |
1401 | const uint32_t VgprCount = KernelInfoEntry.vgpr_count; |
1402 | const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count; |
1403 | const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count; |
1404 | |
1405 | assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count)(static_cast <bool> (ArgNum == (int)KernelInfoEntry.explicit_argument_count ) ? void (0) : __assert_fail ("ArgNum == (int)KernelInfoEntry.explicit_argument_count" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 1405, __extension__ __PRETTY_FUNCTION__)); |
1406 | |
1407 | /* |
1408 | * Set limit based on ThreadsPerGroup and GroupsPerDevice |
1409 | */ |
1410 | LaunchVals LV = |
1411 | getLaunchVals(DeviceInfo().WarpSize[DeviceId], DeviceInfo().Env, |
1412 | KernelInfo->ConstWGSize, KernelInfo->ExecutionMode, |
1413 | NumTeams, // From run_region arg |
1414 | ThreadLimit, // From run_region arg |
1415 | LoopTripcount, // From run_region arg |
1416 | DeviceInfo().NumTeams[KernelInfo->DeviceId]); |
1417 | const int GridSize = LV.GridSize; |
1418 | const int WorkgroupSize = LV.WorkgroupSize; |
1419 | |
1420 | if (print_kernel_trace >= LAUNCH) { |
1421 | int NumGroups = GridSize / WorkgroupSize; |
1422 | // enum modes are SPMD, GENERIC, NONE 0,1,2 |
1423 | // if doing rtl timing, print to stderr, unless stdout requested. |
1424 | bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING); |
1425 | fprintf(TraceToStdout ? stdoutstdout : stderrstderr, |
1426 | "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " |
1427 | "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u " |
1428 | "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n", |
1429 | DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize, |
1430 | ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit, |
1431 | GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount, |
1432 | VgprSpillCount, LoopTripcount, KernelInfo->Name); |
1433 | } |
1434 | |
1435 | // Run on the device. |
1436 | { |
1437 | hsa_queue_t *Queue = DeviceInfo().HSAQueueSchedulers[DeviceId].next(); |
1438 | if (!Queue) { |
1439 | return OFFLOAD_FAIL(~0); |
1440 | } |
1441 | uint64_t PacketId = acquireAvailablePacketId(Queue); |
1442 | |
1443 | const uint32_t Mask = Queue->size - 1; // size is a power of 2 |
1444 | hsa_kernel_dispatch_packet_t *Packet = |
1445 | (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask); |
1446 | |
1447 | // packet->header is written last |
1448 | Packet->setup = UINT16_C(1)1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; |
1449 | Packet->workgroup_size_x = WorkgroupSize; |
1450 | Packet->workgroup_size_y = 1; |
1451 | Packet->workgroup_size_z = 1; |
1452 | Packet->reserved0 = 0; |
1453 | Packet->grid_size_x = GridSize; |
1454 | Packet->grid_size_y = 1; |
1455 | Packet->grid_size_z = 1; |
1456 | Packet->private_segment_size = KernelInfoEntry.private_segment_size; |
1457 | Packet->group_segment_size = GroupSegmentSize; |
1458 | Packet->kernel_object = KernelInfoEntry.kernel_object; |
1459 | Packet->kernarg_address = 0; // use the block allocator |
1460 | Packet->reserved2 = 0; // impl writes id_ here |
1461 | Packet->completion_signal = {0}; // may want a pool of signals |
1462 | |
1463 | KernelArgPool *ArgPool = nullptr; |
1464 | void *KernArg = nullptr; |
1465 | { |
1466 | auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name)); |
1467 | if (It != KernelArgPoolMap.end()) { |
1468 | ArgPool = (It->second).get(); |
1469 | } |
1470 | } |
1471 | if (!ArgPool) { |
1472 | DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name,{} |
1473 | DeviceId){}; |
1474 | } |
1475 | { |
1476 | if (ArgPool) { |
1477 | assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *)))(static_cast <bool> (ArgPool->KernargSegmentSize == ( ArgNum * sizeof(void *))) ? void (0) : __assert_fail ("ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *))" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 1477, __extension__ __PRETTY_FUNCTION__)); |
1478 | KernArg = ArgPool->allocate(ArgNum); |
1479 | } |
1480 | if (!KernArg) { |
1481 | DP("Allocate kernarg failed\n"){}; |
1482 | return OFFLOAD_FAIL(~0); |
1483 | } |
1484 | |
1485 | // Copy explicit arguments |
1486 | for (int I = 0; I < ArgNum; I++) { |
1487 | memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *)); |
1488 | } |
1489 | |
1490 | // Initialize implicit arguments. TODO: Which of these can be dropped |
1491 | AMDGPUImplicitArgsTy *ImplArgs = reinterpret_cast<AMDGPUImplicitArgsTy *>( |
1492 | static_cast<char *>(KernArg) + ArgPool->KernargSegmentSize); |
1493 | memset(ImplArgs, 0, |
1494 | sizeof(AMDGPUImplicitArgsTy)); // may not be necessary |
1495 | ImplArgs->OffsetX = 0; |
1496 | ImplArgs->OffsetY = 0; |
1497 | ImplArgs->OffsetZ = 0; |
1498 | |
1499 | // assign a hostcall buffer for the selected Q |
1500 | if (__atomic_load_n(&DeviceInfo().HostcallRequired, __ATOMIC_ACQUIRE2)) { |
1501 | // hostrpc_assign_buffer is not thread safe, and this function is |
1502 | // under a multiple reader lock, not a writer lock. |
1503 | static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER{ { 0, 0, 0, 0, PTHREAD_MUTEX_TIMED_NP, 0, 0, { 0, 0 } } }; |
1504 | pthread_mutex_lock(&HostcallInitLock); |
1505 | uint64_t Buffer = hostrpc_assign_buffer( |
1506 | DeviceInfo().HSAAgents[DeviceId], Queue, DeviceId); |
1507 | pthread_mutex_unlock(&HostcallInitLock); |
1508 | if (!Buffer) { |
1509 | DP("hostrpc_assign_buffer failed, gpu would dereference null and "{} |
1510 | "error\n"){}; |
1511 | return OFFLOAD_FAIL(~0); |
1512 | } |
1513 | |
1514 | DP("Implicit argument count: %d\n",{} |
1515 | KernelInfoEntry.implicit_argument_count){}; |
1516 | if (KernelInfoEntry.implicit_argument_count >= 4) { |
1517 | // Initialise pointer for implicit_argument_count != 0 ABI |
1518 | // Guess that the right implicit argument is at offset 24 after |
1519 | // the explicit arguments. In the future, should be able to read |
1520 | // the offset from msgpack. Clang is not annotating it at present. |
1521 | uint64_t Offset = |
1522 | sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3); |
1523 | if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) { |
1524 | DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit "{} |
1525 | "args: %d\n",{} |
1526 | Offset + 8, ArgPool->kernargSizeIncludingImplicit()){}; |
1527 | } else { |
1528 | memcpy(static_cast<char *>(KernArg) + Offset, &Buffer, 8); |
1529 | } |
1530 | } |
1531 | |
1532 | // initialise pointer for implicit_argument_count == 0 ABI |
1533 | ImplArgs->HostcallPtr = Buffer; |
1534 | } |
1535 | |
1536 | Packet->kernarg_address = KernArg; |
1537 | } |
1538 | |
1539 | hsa_signal_t S = DeviceInfo().FreeSignalPool.pop(); |
1540 | if (S.handle == 0) { |
1541 | DP("Failed to get signal instance\n"){}; |
1542 | return OFFLOAD_FAIL(~0); |
1543 | } |
1544 | Packet->completion_signal = S; |
1545 | hsa_signal_store_relaxed(Packet->completion_signal, 1); |
1546 | |
1547 | // Publish the packet indicating it is ready to be processed |
1548 | core::packetStoreRelease(reinterpret_cast<uint32_t *>(Packet), |
1549 | core::createHeader(), Packet->setup); |
1550 | |
1551 | // Since the packet is already published, its contents must not be |
1552 | // accessed any more |
1553 | hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); |
1554 | |
1555 | while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX(18446744073709551615UL), |
1556 | HSA_WAIT_STATE_BLOCKED) != 0) |
1557 | ; |
1558 | |
1559 | assert(ArgPool)(static_cast <bool> (ArgPool) ? void (0) : __assert_fail ("ArgPool", "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp" , 1559, __extension__ __PRETTY_FUNCTION__)); |
1560 | ArgPool->deallocate(KernArg); |
1561 | DeviceInfo().FreeSignalPool.push(S); |
1562 | } |
1563 | |
1564 | DP("Kernel completed\n"){}; |
1565 | return OFFLOAD_SUCCESS(0); |
1566 | } |
1567 | |
1568 | bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) { |
1569 | const uint16_t AmdgcnMachineID = EM_AMDGPU; |
1570 | const int32_t R = elf_check_machine(Image, AmdgcnMachineID); |
1571 | if (!R) { |
1572 | DP("Supported machine ID not found\n"){}; |
1573 | } |
1574 | return R; |
1575 | } |
1576 | |
1577 | uint32_t elfEFlags(__tgt_device_image *Image) { |
1578 | const char *ImgBegin = (char *)Image->ImageStart; |
1579 | size_t ImgSize = (char *)Image->ImageEnd - ImgBegin; |
1580 | |
1581 | StringRef Buffer = StringRef(ImgBegin, ImgSize); |
1582 | auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""), |
1583 | /*InitContent=*/false); |
1584 | if (!ElfOrErr) { |
1585 | consumeError(ElfOrErr.takeError()); |
1586 | return 0; |
1587 | } |
1588 | |
1589 | if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get())) |
1590 | return ELFObj->getPlatformFlags(); |
1591 | return 0; |
1592 | } |
1593 | |
1594 | template <typename T> bool enforceUpperBound(T *Value, T Upper) { |
1595 | bool Changed = *Value > Upper; |
1596 | if (Changed) { |
1597 | *Value = Upper; |
1598 | } |
1599 | return Changed; |
1600 | } |
1601 | |
1602 | struct SymbolInfo { |
1603 | const void *Addr = nullptr; |
1604 | uint32_t Size = UINT32_MAX(4294967295U); |
1605 | uint32_t ShType = SHT_NULL; |
1606 | }; |
1607 | |
1608 | int getSymbolInfoWithoutLoading(const ELFObjectFile<ELF64LE> &ELFObj, |
1609 | StringRef SymName, SymbolInfo *Res) { |
1610 | auto SymOrErr = getELFSymbol(ELFObj, SymName); |
1611 | if (!SymOrErr) { |
1612 | std::string ErrorString = toString(SymOrErr.takeError()); |
1613 | DP("Failed ELF lookup: %s\n", ErrorString.c_str()){}; |
1614 | return 1; |
1615 | } |
1616 | if (!*SymOrErr) |
1617 | return 1; |
1618 | |
1619 | auto SymSecOrErr = ELFObj.getELFFile().getSection((*SymOrErr)->st_shndx); |
1620 | if (!SymSecOrErr) { |
1621 | std::string ErrorString = toString(SymOrErr.takeError()); |
1622 | DP("Failed ELF lookup: %s\n", ErrorString.c_str()){}; |
1623 | return 1; |
1624 | } |
1625 | |
1626 | Res->Addr = (*SymOrErr)->st_value + ELFObj.getELFFile().base(); |
1627 | Res->Size = static_cast<uint32_t>((*SymOrErr)->st_size); |
1628 | Res->ShType = static_cast<uint32_t>((*SymSecOrErr)->sh_type); |
1629 | return 0; |
1630 | } |
1631 | |
1632 | int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *SymName, |
1633 | SymbolInfo *Res) { |
1634 | StringRef Buffer = StringRef(Base, ImgSize); |
1635 | auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""), |
1636 | /*InitContent=*/false); |
1637 | if (!ElfOrErr) { |
1638 | REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str())do { fprintf(stderr, "AMDGPU" " error: "); fprintf(stderr, "Failed to load ELF: %s\n" , toString(ElfOrErr.takeError()).c_str()); } while (0);; |
1639 | return 1; |
1640 | } |
1641 | |
1642 | if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get())) |
1643 | return getSymbolInfoWithoutLoading(*ELFObj, SymName, Res); |
1644 | return 1; |
1645 | } |
1646 | |
1647 | hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize, |
1648 | const char *SymName, const void **VarAddr, |
1649 | uint32_t *VarSize) { |
1650 | SymbolInfo SI; |
1651 | int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI); |
1652 | if (Rc == 0) { |
1653 | *VarAddr = SI.Addr; |
1654 | *VarSize = SI.Size; |
1655 | return HSA_STATUS_SUCCESS; |
1656 | } |
1657 | return HSA_STATUS_ERROR; |
1658 | } |
1659 | |
1660 | template <typename C> |
1661 | hsa_status_t moduleRegisterFromMemoryToPlace( |
1662 | std::map<std::string, atl_kernel_info_t> &KernelInfoTable, |
1663 | std::map<std::string, atl_symbol_info_t> &SymbolInfoTable, |
1664 | void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb, |
1665 | std::vector<hsa_executable_t> &HSAExecutables) { |
1666 | auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t { |
1667 | C *Unwrapped = static_cast<C *>(CbState); |
1668 | return (*Unwrapped)(Data, Size); |
1669 | }; |
1670 | return core::RegisterModuleFromMemory( |
1671 | KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize, |
1672 | DeviceInfo().HSAAgents[DeviceId], L, static_cast<void *>(&Cb), |
1673 | HSAExecutables); |
1674 | } |
1675 | |
1676 | uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) { |
1677 | uint64_t DeviceStateBytes = 0; |
1678 | { |
1679 | // If this is the deviceRTL, get the state variable size |
1680 | SymbolInfo SizeSi; |
1681 | int Rc = getSymbolInfoWithoutLoading( |
1682 | ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi); |
1683 | |
1684 | if (Rc == 0) { |
1685 | if (SizeSi.Size != sizeof(uint64_t)) { |
1686 | DP("Found device_State_size variable with wrong size\n"){}; |
1687 | return 0; |
1688 | } |
1689 | |
1690 | // Read number of bytes directly from the elf |
1691 | memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t)); |
1692 | } |
1693 | } |
1694 | return DeviceStateBytes; |
1695 | } |
1696 | |
1697 | struct DeviceEnvironment { |
1698 | // initialise an DeviceEnvironmentTy in the deviceRTL |
1699 | // patches around differences in the deviceRTL between trunk, aomp, |
1700 | // rocmcc. Over time these differences will tend to zero and this class |
1701 | // simplified. |
1702 | // Symbol may be in .data or .bss, and may be missing fields, todo: |
1703 | // review aomp/trunk/rocm and simplify the following |
1704 | |
1705 | // The symbol may also have been deadstripped because the device side |
1706 | // accessors were unused. |
1707 | |
1708 | // If the symbol is in .data (aomp, rocm) it can be written directly. |
1709 | // If it is in .bss, we must wait for it to be allocated space on the |
1710 | // gpu (trunk) and initialize after loading. |
1711 | const char *sym() { return "__omp_rtl_device_environment"; } |
1712 | |
1713 | DeviceEnvironmentTy HostDeviceEnv; |
1714 | SymbolInfo SI; |
1715 | bool Valid = false; |
1716 | |
1717 | __tgt_device_image *Image; |
1718 | const size_t ImgSize; |
1719 | |
1720 | DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize, |
1721 | __tgt_device_image *Image, const size_t ImgSize) |
1722 | : Image(Image), ImgSize(ImgSize) { |
1723 | |
1724 | HostDeviceEnv.NumDevices = NumberDevices; |
1725 | HostDeviceEnv.DeviceNum = DeviceId; |
1726 | HostDeviceEnv.DebugKind = 0; |
1727 | HostDeviceEnv.DynamicMemSize = DynamicMemSize; |
1728 | if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) |
1729 | HostDeviceEnv.DebugKind = std::stoi(EnvStr); |
1730 | |
1731 | int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize, |
1732 | sym(), &SI); |
1733 | if (Rc != 0) { |
1734 | DP("Finding global device environment '%s' - symbol missing.\n", sym()){}; |
1735 | return; |
1736 | } |
1737 | |
1738 | if (SI.Size > sizeof(HostDeviceEnv)) { |
1739 | DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size,{} |
1740 | sizeof(HostDeviceEnv)){}; |
1741 | return; |
1742 | } |
1743 | |
1744 | Valid = true; |
1745 | } |
1746 | |
1747 | bool inImage() { return SI.ShType != SHT_NOBITS; } |
1748 | |
1749 | hsa_status_t beforeLoading(void *Data, size_t Size) { |
1750 | if (Valid) { |
1751 | if (inImage()) { |
1752 | DP("Setting global device environment before load (%u bytes)\n",{} |
1753 | SI.Size){}; |
1754 | uint64_t Offset = reinterpret_cast<const char *>(SI.Addr) - |
1755 | reinterpret_cast<const char *>(Image->ImageStart); |
1756 | void *Pos = reinterpret_cast<char *>(Data) + Offset; |
1757 | memcpy(Pos, &HostDeviceEnv, SI.Size); |
1758 | } |
1759 | } |
1760 | return HSA_STATUS_SUCCESS; |
1761 | } |
1762 | |
1763 | hsa_status_t afterLoading() { |
1764 | if (Valid) { |
1765 | if (!inImage()) { |
1766 | DP("Setting global device environment after load (%u bytes)\n",{} |
1767 | SI.Size){}; |
1768 | int DeviceId = HostDeviceEnv.DeviceNum; |
1769 | auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId]; |
1770 | void *StatePtr; |
1771 | uint32_t StatePtrSize; |
1772 | hsa_status_t Err = interop_hsa_get_symbol_info( |
1773 | SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize); |
1774 | if (Err != HSA_STATUS_SUCCESS) { |
1775 | DP("failed to find %s in loaded image\n", sym()){}; |
1776 | return Err; |
1777 | } |
1778 | |
1779 | if (StatePtrSize != SI.Size) { |
1780 | DP("Symbol had size %u before loading, %u after\n", StatePtrSize,{} |
1781 | SI.Size){}; |
1782 | return HSA_STATUS_ERROR; |
1783 | } |
1784 | |
1785 | return DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv, |
1786 | StatePtrSize, DeviceId); |
1787 | } |
1788 | } |
1789 | return HSA_STATUS_SUCCESS; |
1790 | } |
1791 | }; |
1792 | |
1793 | hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) { |
1794 | uint64_t Rounded = 4 * ((Size + 3) / 4); |
1795 | void *Ptr; |
1796 | hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId); |
1797 | hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr); |
1798 | if (Err != HSA_STATUS_SUCCESS) { |
1799 | return Err; |
1800 | } |
1801 | |
1802 | hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4); |
1803 | if (Rc != HSA_STATUS_SUCCESS) { |
1804 | DP("zero fill device_state failed with %u\n", Rc){}; |
1805 | core::Runtime::Memfree(Ptr); |
1806 | return HSA_STATUS_ERROR; |
1807 | } |
1808 | |
1809 | *RetPtr = Ptr; |
1810 | return HSA_STATUS_SUCCESS; |
1811 | } |
1812 | |
1813 | bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) { |
1814 | SymbolInfo SI; |
1815 | int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI); |
1816 | return (Rc == 0) && (SI.Addr != nullptr); |
1817 | } |
1818 | |
1819 | hsa_status_t lock_memory(void *HostPtr, size_t Size, hsa_agent_t Agent, |
1820 | void **LockedHostPtr) { |
1821 | hsa_status_t err = is_locked(HostPtr, LockedHostPtr); |
1822 | if (err != HSA_STATUS_SUCCESS) |
1823 | return err; |
1824 | |
1825 | // HostPtr is already locked, just return it |
1826 | if (*LockedHostPtr) |
1827 | return HSA_STATUS_SUCCESS; |
1828 | |
1829 | hsa_agent_t Agents[1] = {Agent}; |
1830 | return hsa_amd_memory_lock(HostPtr, Size, Agents, /*num_agent=*/1, |
1831 | LockedHostPtr); |
1832 | } |
1833 | |
1834 | hsa_status_t unlock_memory(void *HostPtr) { |
1835 | void *LockedHostPtr = nullptr; |
1836 | hsa_status_t err = is_locked(HostPtr, &LockedHostPtr); |
1837 | if (err != HSA_STATUS_SUCCESS) |
1838 | return err; |
1839 | |
1840 | // if LockedHostPtr is nullptr, then HostPtr was not locked |
1841 | if (!LockedHostPtr) |
1842 | return HSA_STATUS_SUCCESS; |
1843 | |
1844 | err = hsa_amd_memory_unlock(HostPtr); |
1845 | return err; |
1846 | } |
1847 | |
1848 | } // namespace |
1849 | |
1850 | namespace core { |
1851 | hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) { |
1852 | return hsa_amd_agents_allow_access(DeviceInfo().HSAAgents.size(), |
1853 | &DeviceInfo().HSAAgents[0], NULL__null, Ptr); |
1854 | } |
1855 | } // namespace core |
1856 | |
1857 | static hsa_status_t GetIsaInfo(hsa_isa_t isa, void *data) { |
1858 | hsa_status_t err; |
1859 | uint32_t name_len; |
1860 | err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME_LENGTH, &name_len); |
1861 | if (err != HSA_STATUS_SUCCESS) { |
1862 | DP("Error getting ISA info length\n"){}; |
1863 | return err; |
1864 | } |
1865 | |
1866 | char TargetID[name_len]; |
1867 | err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, TargetID); |
1868 | if (err != HSA_STATUS_SUCCESS) { |
1869 | DP("Error getting ISA info name\n"){}; |
1870 | return err; |
1871 | } |
1872 | |
1873 | auto TripleTargetID = llvm::StringRef(TargetID); |
1874 | if (TripleTargetID.consume_front("amdgcn-amd-amdhsa")) { |
1875 | DeviceInfo().TargetID.push_back(TripleTargetID.ltrim('-').str()); |
1876 | } |
1877 | return HSA_STATUS_SUCCESS; |
1878 | } |
1879 | |
1880 | extern "C" { |
1881 | int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) { |
1882 | return elfMachineIdIsAmdgcn(Image); |
1883 | } |
1884 | |
1885 | int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image, |
1886 | __tgt_image_info *info) { |
1887 | if (!__tgt_rtl_is_valid_binary(image)) |
1888 | return false; |
1889 | |
1890 | // A subarchitecture was not specified. Assume it is compatible. |
1891 | if (!info->Arch) |
1892 | return true; |
1893 | |
1894 | int32_t NumberOfDevices = __tgt_rtl_number_of_devices(); |
1895 | |
1896 | for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) { |
1897 | __tgt_rtl_init_device(DeviceId); |
1898 | hsa_agent_t agent = DeviceInfo().HSAAgents[DeviceId]; |
1899 | hsa_status_t err = hsa_agent_iterate_isas(agent, GetIsaInfo, &DeviceId); |
1900 | if (err != HSA_STATUS_SUCCESS) { |
1901 | DP("Error iterating ISAs\n"){}; |
1902 | return false; |
1903 | } |
1904 | if (!isImageCompatibleWithEnv(info, DeviceInfo().TargetID[DeviceId])) |
1905 | return false; |
1906 | } |
1907 | DP("Image has Target ID compatible with the current environment: %s\n",{} |
1908 | info->Arch){}; |
1909 | return true; |
1910 | } |
1911 | |
1912 | int32_t __tgt_rtl_init_plugin() { return OFFLOAD_SUCCESS(0); } |
1913 | int32_t __tgt_rtl_deinit_plugin() { return OFFLOAD_SUCCESS(0); } |
1914 | |
1915 | int __tgt_rtl_number_of_devices() { |
1916 | // If the construction failed, no methods are safe to call |
1917 | if (DeviceInfo().ConstructionSucceeded) { |
1918 | return DeviceInfo().NumberOfDevices; |
1919 | } |
1920 | DP("AMDGPU plugin construction failed. Zero devices available\n"){}; |
1921 | return 0; |
1922 | } |
1923 | |
1924 | int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { |
1925 | DP("Init requires flags to %ld\n", RequiresFlags){}; |
1926 | DeviceInfo().RequiresFlags = RequiresFlags; |
1927 | return RequiresFlags; |
1928 | } |
1929 | |
1930 | int32_t __tgt_rtl_init_device(int DeviceId) { |
1931 | hsa_status_t Err = hsa_init(); |
1932 | if (Err != HSA_STATUS_SUCCESS) { |
1933 | DP("HSA Initialization Failed.\n"){}; |
1934 | return HSA_STATUS_ERROR; |
1935 | } |
1936 | // this is per device id init |
1937 | DP("Initialize the device id: %d\n", DeviceId){}; |
1938 | |
1939 | hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId]; |
1940 | |
1941 | // Get number of Compute Unit |
1942 | uint32_t ComputeUnits = 0; |
1943 | Err = hsa_agent_get_info( |
1944 | Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, |
1945 | &ComputeUnits); |
1946 | if (Err != HSA_STATUS_SUCCESS) { |
1947 | DeviceInfo().ComputeUnits[DeviceId] = 1; |
1948 | DP("Error getting compute units : settiing to 1\n"){}; |
1949 | } else { |
1950 | DeviceInfo().ComputeUnits[DeviceId] = ComputeUnits; |
1951 | DP("Using %d compute unis per grid\n", DeviceInfo().ComputeUnits[DeviceId]){}; |
1952 | } |
1953 | |
1954 | char GetInfoName[64]; // 64 max size returned by get info |
1955 | Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME, |
1956 | (void *)GetInfoName); |
1957 | if (Err) |
1958 | DeviceInfo().GPUName[DeviceId] = "--unknown gpu--"; |
1959 | else { |
1960 | DeviceInfo().GPUName[DeviceId] = GetInfoName; |
1961 | } |
1962 | |
1963 | if (print_kernel_trace & STARTUP_DETAILS) |
1964 | DP("Device#%-2d CU's: %2d %s\n", DeviceId,{} |
1965 | DeviceInfo().ComputeUnits[DeviceId],{} |
1966 | DeviceInfo().GPUName[DeviceId].c_str()){}; |
1967 | |
1968 | // Query attributes to determine number of threads/block and blocks/grid. |
1969 | uint16_t WorkgroupMaxDim[3]; |
1970 | Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, |
1971 | &WorkgroupMaxDim); |
1972 | if (Err != HSA_STATUS_SUCCESS) { |
1973 | DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams; |
1974 | DP("Error getting grid dims: num groups : %d\n",{} |
1975 | RTLDeviceInfoTy::DefaultNumTeams){}; |
1976 | } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) { |
1977 | DeviceInfo().GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0]; |
1978 | DP("Using %d ROCm blocks per grid\n",{} |
1979 | DeviceInfo().GroupsPerDevice[DeviceId]){}; |
1980 | } else { |
1981 | DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit; |
1982 | DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "{} |
1983 | "at the hard limit\n",{} |
1984 | WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit){}; |
1985 | } |
1986 | |
1987 | // Get thread limit |
1988 | hsa_dim3_t GridMaxDim; |
1989 | Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim); |
1990 | if (Err == HSA_STATUS_SUCCESS) { |
1991 | DeviceInfo().ThreadsPerGroup[DeviceId] = |
1992 | reinterpret_cast<uint32_t *>(&GridMaxDim)[0] / |
1993 | DeviceInfo().GroupsPerDevice[DeviceId]; |
1994 | |
1995 | if (DeviceInfo().ThreadsPerGroup[DeviceId] == 0) { |
1996 | DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize; |
1997 | DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize){}; |
1998 | } else if (enforceUpperBound(&DeviceInfo().ThreadsPerGroup[DeviceId], |
1999 | RTLDeviceInfoTy::MaxWgSize)) { |
2000 | DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize){}; |
2001 | } else { |
2002 | DP("Using ROCm Queried thread limit: %d\n",{} |
2003 | DeviceInfo().ThreadsPerGroup[DeviceId]){}; |
2004 | } |
2005 | } else { |
2006 | DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize; |
2007 | DP("Error getting max block dimension, use default:%d \n",{} |
2008 | RTLDeviceInfoTy::MaxWgSize){}; |
2009 | } |
2010 | |
2011 | // Get wavefront size |
2012 | uint32_t WavefrontSize = 0; |
2013 | Err = |
2014 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize); |
2015 | if (Err == HSA_STATUS_SUCCESS) { |
2016 | DP("Queried wavefront size: %d\n", WavefrontSize){}; |
2017 | DeviceInfo().WarpSize[DeviceId] = WavefrontSize; |
2018 | } else { |
2019 | // TODO: Burn the wavefront size into the code object |
2020 | DP("Warning: Unknown wavefront size, assuming 64\n"){}; |
2021 | DeviceInfo().WarpSize[DeviceId] = 64; |
2022 | } |
2023 | |
2024 | // Adjust teams to the env variables |
2025 | |
2026 | if (DeviceInfo().Env.TeamLimit > 0 && |
2027 | (enforceUpperBound(&DeviceInfo().GroupsPerDevice[DeviceId], |
2028 | DeviceInfo().Env.TeamLimit))) { |
2029 | DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",{} |
2030 | DeviceInfo().Env.TeamLimit){}; |
2031 | } |
2032 | |
2033 | // Set default number of teams |
2034 | if (DeviceInfo().Env.NumTeams > 0) { |
2035 | DeviceInfo().NumTeams[DeviceId] = DeviceInfo().Env.NumTeams; |
2036 | DP("Default number of teams set according to environment %d\n",{} |
2037 | DeviceInfo().Env.NumTeams){}; |
2038 | } else { |
2039 | char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC"); |
2040 | int TeamsPerCU = DefaultTeamsPerCU; |
2041 | if (TeamsPerCUEnvStr) { |
2042 | TeamsPerCU = std::stoi(TeamsPerCUEnvStr); |
2043 | } |
2044 | |
2045 | DeviceInfo().NumTeams[DeviceId] = |
2046 | TeamsPerCU * DeviceInfo().ComputeUnits[DeviceId]; |
2047 | DP("Default number of teams = %d * number of compute units %d\n",{} |
2048 | TeamsPerCU, DeviceInfo().ComputeUnits[DeviceId]){}; |
2049 | } |
2050 | |
2051 | if (enforceUpperBound(&DeviceInfo().NumTeams[DeviceId], |
2052 | DeviceInfo().GroupsPerDevice[DeviceId])) { |
2053 | DP("Default number of teams exceeds device limit, capping at %d\n",{} |
2054 | DeviceInfo().GroupsPerDevice[DeviceId]){}; |
2055 | } |
2056 | |
2057 | // Adjust threads to the env variables |
2058 | if (DeviceInfo().Env.TeamThreadLimit > 0 && |
2059 | (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId], |
2060 | DeviceInfo().Env.TeamThreadLimit))) { |
2061 | DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n",{} |
2062 | DeviceInfo().Env.TeamThreadLimit){}; |
2063 | } |
2064 | |
2065 | // Set default number of threads |
2066 | DeviceInfo().NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize; |
2067 | DP("Default number of threads set according to library's default %d\n",{} |
2068 | RTLDeviceInfoTy::DefaultWgSize){}; |
2069 | if (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId], |
2070 | DeviceInfo().ThreadsPerGroup[DeviceId])) { |
2071 | DP("Default number of threads exceeds device limit, capping at %d\n",{} |
2072 | DeviceInfo().ThreadsPerGroup[DeviceId]){}; |
2073 | } |
2074 | |
2075 | DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",{} |
2076 | DeviceId, DeviceInfo().GroupsPerDevice[DeviceId],{} |
2077 | DeviceInfo().ThreadsPerGroup[DeviceId]){}; |
2078 | |
2079 | DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId,{} |
2080 | DeviceInfo().WarpSize[DeviceId], DeviceInfo().ThreadsPerGroup[DeviceId],{} |
2081 | DeviceInfo().GroupsPerDevice[DeviceId],{} |
2082 | DeviceInfo().GroupsPerDevice[DeviceId] *{} |
2083 | DeviceInfo().ThreadsPerGroup[DeviceId]){}; |
2084 | |
2085 | return OFFLOAD_SUCCESS(0); |
2086 | } |
2087 | |
2088 | static __tgt_target_table * |
2089 | __tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image); |
2090 | |
2091 | __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, |
2092 | __tgt_device_image *Image) { |
2093 | DeviceInfo().LoadRunLock.lock(); |
2094 | __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image); |
2095 | DeviceInfo().LoadRunLock.unlock(); |
2096 | return Res; |
2097 | } |
2098 | |
2099 | __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId, |
2100 | __tgt_device_image *Image) { |
2101 | // This function loads the device image onto gpu[DeviceId] and does other |
2102 | // per-image initialization work. Specifically: |
2103 | // |
2104 | // - Initialize an DeviceEnvironmentTy instance embedded in the |
2105 | // image at the symbol "__omp_rtl_device_environment" |
2106 | // Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL. |
2107 | // |
2108 | // - Allocate a large array per-gpu (could be moved to init_device) |
2109 | // - Read a uint64_t at symbol omptarget_nvptx_device_State_size |
2110 | // - Allocate at least that many bytes of gpu memory |
2111 | // - Zero initialize it |
2112 | // - Write the pointer to the symbol omptarget_nvptx_device_State |
2113 | // |
2114 | // - Pulls some per-kernel information together from various sources and |
2115 | // records it in the KernelsList for quicker access later |
2116 | // |
2117 | // The initialization can be done before or after loading the image onto the |
2118 | // gpu. This function presently does a mixture. Using the hsa api to get/set |
2119 | // the information is simpler to implement, in exchange for more complicated |
2120 | // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes |
2121 | // back from the gpu vs a hashtable lookup on the host. |
2122 | |
2123 | const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart; |
2124 | |
2125 | DeviceInfo().clearOffloadEntriesTable(DeviceId); |
2126 | |
2127 | // We do not need to set the ELF version because the caller of this function |
2128 | // had to do that to decide the right runtime to use |
2129 | |
2130 | if (!elfMachineIdIsAmdgcn(Image)) |
2131 | return NULL__null; |
2132 | |
2133 | { |
2134 | auto Env = |
2135 | DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices, |
2136 | DeviceInfo().Env.DynamicMemSize, Image, ImgSize); |
2137 | |
2138 | auto &KernelInfo = DeviceInfo().KernelInfoTable[DeviceId]; |
2139 | auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId]; |
2140 | hsa_status_t Err = moduleRegisterFromMemoryToPlace( |
2141 | KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId, |
2142 | [&](void *Data, size_t Size) { |
2143 | if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) { |
2144 | __atomic_store_n(&DeviceInfo().HostcallRequired, true, |
2145 | __ATOMIC_RELEASE3); |
2146 | } |
2147 | return Env.beforeLoading(Data, Size); |
2148 | }, |
2149 | DeviceInfo().HSAExecutables); |
2150 | |
2151 | check("Module registering", Err){}; |
2152 | if (Err != HSA_STATUS_SUCCESS) { |
2153 | const char *DeviceName = DeviceInfo().GPUName[DeviceId].c_str(); |
2154 | const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image)); |
2155 | |
2156 | if (strcmp(DeviceName, ElfName) != 0) { |
2157 | DP("Possible gpu arch mismatch: device:%s, image:%s please check"{} |
2158 | " compiler flag: -march=<gpu>\n",{} |
2159 | DeviceName, ElfName){}; |
2160 | } else { |
2161 | DP("Error loading image onto GPU: %s\n", get_error_string(Err)){}; |
2162 | } |
2163 | |
2164 | return NULL__null; |
2165 | } |
2166 | |
2167 | Err = Env.afterLoading(); |
2168 | if (Err != HSA_STATUS_SUCCESS) { |
2169 | return NULL__null; |
2170 | } |
2171 | } |
2172 | |
2173 | DP("AMDGPU module successfully loaded!\n"){}; |
2174 | |
2175 | { |
2176 | // the device_State array is either large value in bss or a void* that |
2177 | // needs to be assigned to a pointer to an array of size device_state_bytes |
2178 | // If absent, it has been deadstripped and needs no setup. |
2179 | |
2180 | void *StatePtr; |
2181 | uint32_t StatePtrSize; |
2182 | auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId]; |
2183 | hsa_status_t Err = interop_hsa_get_symbol_info( |
2184 | SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr, |
2185 | &StatePtrSize); |
2186 | |
2187 | if (Err != HSA_STATUS_SUCCESS) { |
2188 | DP("No device_state symbol found, skipping initialization\n"){}; |
2189 | } else { |
2190 | if (StatePtrSize < sizeof(void *)) { |
2191 | DP("unexpected size of state_ptr %u != %zu\n", StatePtrSize,{} |
2192 | sizeof(void *)){}; |
2193 | return NULL__null; |
2194 | } |
2195 | |
2196 | // if it's larger than a void*, assume it's a bss array and no further |
2197 | // initialization is required. Only try to set up a pointer for |
2198 | // sizeof(void*) |
2199 | if (StatePtrSize == sizeof(void *)) { |
2200 | uint64_t DeviceStateBytes = |
2201 | getDeviceStateBytes((char *)Image->ImageStart, ImgSize); |
2202 | if (DeviceStateBytes == 0) { |
2203 | DP("Can't initialize device_State, missing size information\n"){}; |
2204 | return NULL__null; |
2205 | } |
2206 | |
2207 | auto &DSS = DeviceInfo().DeviceStateStore[DeviceId]; |
2208 | if (DSS.first.get() == nullptr) { |
2209 | assert(DSS.second == 0)(static_cast <bool> (DSS.second == 0) ? void (0) : __assert_fail ("DSS.second == 0", "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp" , 2209, __extension__ __PRETTY_FUNCTION__)); |
2210 | void *Ptr = NULL__null; |
2211 | hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId); |
2212 | if (Err != HSA_STATUS_SUCCESS) { |
2213 | DP("Failed to allocate device_state array\n"){}; |
2214 | return NULL__null; |
2215 | } |
2216 | DSS = { |
2217 | std::unique_ptr<void, RTLDeviceInfoTy::ImplFreePtrDeletor>{Ptr}, |
2218 | DeviceStateBytes, |
2219 | }; |
2220 | } |
2221 | |
2222 | void *Ptr = DSS.first.get(); |
2223 | if (DeviceStateBytes != DSS.second) { |
2224 | DP("Inconsistent sizes of device_State unsupported\n"){}; |
2225 | return NULL__null; |
2226 | } |
2227 | |
2228 | // write ptr to device memory so it can be used by later kernels |
2229 | Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr, |
2230 | sizeof(void *), DeviceId); |
2231 | if (Err != HSA_STATUS_SUCCESS) { |
2232 | DP("memcpy install of state_ptr failed\n"){}; |
2233 | return NULL__null; |
2234 | } |
2235 | } |
2236 | } |
2237 | } |
2238 | |
2239 | // Here, we take advantage of the data that is appended after img_end to get |
2240 | // the symbols' name we need to load. This data consist of the host entries |
2241 | // begin and end as well as the target name (see the offloading linker script |
2242 | // creation in clang compiler). |
2243 | |
2244 | // Find the symbols in the module by name. The name can be obtain by |
2245 | // concatenating the host entry name with the target name |
2246 | |
2247 | __tgt_offload_entry *HostBegin = Image->EntriesBegin; |
2248 | __tgt_offload_entry *HostEnd = Image->EntriesEnd; |
2249 | |
2250 | for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { |
2251 | |
2252 | if (!E->addr) { |
2253 | // The host should have always something in the address to |
2254 | // uniquely identify the target region. |
2255 | DP("Analyzing host entry '<null>' (size = %lld)...\n",{} |
2256 | (unsigned long long)E->size){}; |
2257 | return NULL__null; |
2258 | } |
2259 | |
2260 | if (E->size) { |
2261 | __tgt_offload_entry Entry = *E; |
2262 | |
2263 | void *Varptr; |
2264 | uint32_t Varsize; |
2265 | |
2266 | auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId]; |
2267 | hsa_status_t Err = interop_hsa_get_symbol_info( |
2268 | SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize); |
2269 | |
2270 | if (Err != HSA_STATUS_SUCCESS) { |
2271 | // Inform the user what symbol prevented offloading |
2272 | DP("Loading global '%s' (Failed)\n", E->name){}; |
2273 | return NULL__null; |
2274 | } |
2275 | |
2276 | if (Varsize != E->size) { |
2277 | DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name,{} |
2278 | Varsize, E->size){}; |
2279 | return NULL__null; |
2280 | } |
2281 | |
2282 | DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",{} |
2283 | DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr)){}; |
2284 | Entry.addr = (void *)Varptr; |
2285 | |
2286 | DeviceInfo().addOffloadEntry(DeviceId, Entry); |
2287 | |
2288 | if (DeviceInfo().RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && |
2289 | E->flags & OMP_DECLARE_TARGET_LINK) { |
2290 | // If unified memory is present any target link variables |
2291 | // can access host addresses directly. There is no longer a |
2292 | // need for device copies. |
2293 | Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr, |
2294 | sizeof(void *), DeviceId); |
2295 | if (Err != HSA_STATUS_SUCCESS) |
2296 | DP("Error when copying USM\n"){}; |
2297 | DP("Copy linked variable host address (" DPxMOD ")"{} |
2298 | "to device address (" DPxMOD ")\n",{} |
2299 | DPxPTR(*((void **)E->addr)), DPxPTR(Varptr)){}; |
2300 | } |
2301 | |
2302 | continue; |
2303 | } |
2304 | |
2305 | DP("to find the kernel name: %s size: %lu\n", E->name, strlen(E->name)){}; |
2306 | |
2307 | // errors in kernarg_segment_size previously treated as = 0 (or as undef) |
2308 | uint32_t KernargSegmentSize = 0; |
2309 | auto &KernelInfoMap = DeviceInfo().KernelInfoTable[DeviceId]; |
2310 | hsa_status_t Err = HSA_STATUS_SUCCESS; |
2311 | if (!E->name) { |
2312 | Err = HSA_STATUS_ERROR; |
Value stored to 'Err' is never read | |
2313 | } else { |
2314 | std::string KernelStr = std::string(E->name); |
2315 | auto It = KernelInfoMap.find(KernelStr); |
2316 | if (It != KernelInfoMap.end()) { |
2317 | atl_kernel_info_t Info = It->second; |
2318 | KernargSegmentSize = Info.kernel_segment_size; |
2319 | } else { |
2320 | Err = HSA_STATUS_ERROR; |
2321 | } |
2322 | } |
2323 | |
2324 | // default value GENERIC (in case symbol is missing from cubin file) |
2325 | llvm::omp::OMPTgtExecModeFlags ExecModeVal = |
2326 | llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; |
2327 | |
2328 | // get flat group size if present, else Default_WG_Size |
2329 | int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize; |
2330 | |
2331 | // get Kernel Descriptor if present. |
2332 | // Keep struct in sync wih getTgtAttributeStructQTy in CGOpenMPRuntime.cpp |
2333 | struct KernDescValType { |
2334 | uint16_t Version; |
2335 | uint16_t TSize; |
2336 | uint16_t WGSize; |
2337 | }; |
2338 | struct KernDescValType KernDescVal; |
2339 | std::string KernDescNameStr(E->name); |
2340 | KernDescNameStr += "_kern_desc"; |
2341 | const char *KernDescName = KernDescNameStr.c_str(); |
2342 | |
2343 | const void *KernDescPtr; |
2344 | uint32_t KernDescSize; |
2345 | void *CallStackAddr = nullptr; |
2346 | Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, KernDescName, |
2347 | &KernDescPtr, &KernDescSize); |
2348 | |
2349 | if (Err == HSA_STATUS_SUCCESS) { |
2350 | if ((size_t)KernDescSize != sizeof(KernDescVal)) |
2351 | DP("Loading global computation properties '%s' - size mismatch (%u != "{} |
2352 | "%lu)\n",{} |
2353 | KernDescName, KernDescSize, sizeof(KernDescVal)){}; |
2354 | |
2355 | memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize); |
2356 | |
2357 | // Check structure size against recorded size. |
2358 | if ((size_t)KernDescSize != KernDescVal.TSize) |
2359 | DP("KernDescVal size %lu does not match advertized size %d for '%s'\n",{} |
2360 | sizeof(KernDescVal), KernDescVal.TSize, KernDescName){}; |
2361 | |
2362 | DP("After loading global for %s KernDesc \n", KernDescName){}; |
2363 | DP("KernDesc: Version: %d\n", KernDescVal.Version){}; |
2364 | DP("KernDesc: TSize: %d\n", KernDescVal.TSize){}; |
2365 | DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize){}; |
2366 | |
2367 | if (KernDescVal.WGSize == 0) { |
2368 | KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize; |
2369 | DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WGSize){}; |
2370 | } |
2371 | WGSizeVal = KernDescVal.WGSize; |
2372 | DP("WGSizeVal %d\n", WGSizeVal){}; |
2373 | check("Loading KernDesc computation property", Err){}; |
2374 | } else { |
2375 | DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName){}; |
2376 | |
2377 | // Flat group size |
2378 | std::string WGSizeNameStr(E->name); |
2379 | WGSizeNameStr += "_wg_size"; |
2380 | const char *WGSizeName = WGSizeNameStr.c_str(); |
2381 | |
2382 | const void *WGSizePtr; |
2383 | uint32_t WGSize; |
2384 | Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, WGSizeName, |
2385 | &WGSizePtr, &WGSize); |
2386 | |
2387 | if (Err == HSA_STATUS_SUCCESS) { |
2388 | if ((size_t)WGSize != sizeof(int16_t)) { |
2389 | DP("Loading global computation properties '%s' - size mismatch (%u "{} |
2390 | "!= "{} |
2391 | "%lu)\n",{} |
2392 | WGSizeName, WGSize, sizeof(int16_t)){}; |
2393 | return NULL__null; |
2394 | } |
2395 | |
2396 | memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize); |
2397 | |
2398 | DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal){}; |
2399 | |
2400 | if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize || |
2401 | WGSizeVal > RTLDeviceInfoTy::MaxWgSize) { |
2402 | DP("Error wrong WGSize value specified in HSA code object file: "{} |
2403 | "%d\n",{} |
2404 | WGSizeVal){}; |
2405 | WGSizeVal = RTLDeviceInfoTy::DefaultWgSize; |
2406 | } |
2407 | } else { |
2408 | DP("Warning: Loading WGSize '%s' - symbol not found, "{} |
2409 | "using default value %d\n",{} |
2410 | WGSizeName, WGSizeVal){}; |
2411 | } |
2412 | |
2413 | check("Loading WGSize computation property", Err){}; |
2414 | } |
2415 | |
2416 | // Read execution mode from global in binary |
2417 | std::string ExecModeNameStr(E->name); |
2418 | ExecModeNameStr += "_exec_mode"; |
2419 | const char *ExecModeName = ExecModeNameStr.c_str(); |
2420 | |
2421 | const void *ExecModePtr; |
2422 | uint32_t VarSize; |
2423 | Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, ExecModeName, |
2424 | &ExecModePtr, &VarSize); |
2425 | |
2426 | if (Err == HSA_STATUS_SUCCESS) { |
2427 | if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { |
2428 | DP("Loading global computation properties '%s' - size mismatch(%u != "{} |
2429 | "%lu)\n",{} |
2430 | ExecModeName, VarSize, sizeof(llvm::omp::OMPTgtExecModeFlags)){}; |
2431 | return NULL__null; |
2432 | } |
2433 | |
2434 | memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize); |
2435 | |
2436 | DP("After loading global for %s ExecMode = %d\n", ExecModeName,{} |
2437 | ExecModeVal){}; |
2438 | |
2439 | if (ExecModeVal < 0 || |
2440 | ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) { |
2441 | DP("Error wrong exec_mode value specified in HSA code object file: "{} |
2442 | "%d\n",{} |
2443 | ExecModeVal){}; |
2444 | return NULL__null; |
2445 | } |
2446 | } else { |
2447 | DP("Loading global exec_mode '%s' - symbol missing, using default "{} |
2448 | "value "{} |
2449 | "GENERIC (1)\n",{} |
2450 | ExecModeName){}; |
2451 | } |
2452 | check("Loading computation property", Err){}; |
2453 | |
2454 | KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId, |
2455 | CallStackAddr, E->name, KernargSegmentSize, |
2456 | DeviceInfo().KernArgPool)); |
2457 | __tgt_offload_entry Entry = *E; |
2458 | Entry.addr = (void *)&KernelsList.back(); |
2459 | DeviceInfo().addOffloadEntry(DeviceId, Entry); |
2460 | DP("Entry point %ld maps to %s\n", E - HostBegin, E->name){}; |
2461 | } |
2462 | |
2463 | return DeviceInfo().getOffloadEntriesTable(DeviceId); |
2464 | } |
2465 | |
2466 | void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) { |
2467 | void *Ptr = NULL__null; |
2468 | assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large")(static_cast <bool> (DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large") ? void (0) : __assert_fail ("DeviceId < DeviceInfo().NumberOfDevices && \"Device ID too large\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 2468, __extension__ __PRETTY_FUNCTION__)); |
2469 | |
2470 | hsa_amd_memory_pool_t MemoryPool; |
2471 | switch (Kind) { |
2472 | case TARGET_ALLOC_DEFAULT: |
2473 | case TARGET_ALLOC_DEVICE: |
2474 | // GPU memory |
2475 | MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId); |
2476 | break; |
2477 | case TARGET_ALLOC_HOST: |
2478 | // non-migratable memory accessible by host and device(s) |
2479 | MemoryPool = DeviceInfo().getHostMemoryPool(); |
2480 | break; |
2481 | default: |
2482 | REPORT("Invalid target data allocation kind or requested allocator not "do { fprintf(stderr, "AMDGPU" " error: "); fprintf(stderr, "Invalid target data allocation kind or requested allocator not " "implemented yet\n"); } while (0); |
2483 | "implemented yet\n")do { fprintf(stderr, "AMDGPU" " error: "); fprintf(stderr, "Invalid target data allocation kind or requested allocator not " "implemented yet\n"); } while (0);; |
2484 | return NULL__null; |
2485 | } |
2486 | |
2487 | hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr); |
2488 | DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size,{} |
2489 | (long long unsigned)(Elf64_Addr)Ptr){}; |
2490 | Ptr = (Err == HSA_STATUS_SUCCESS) ? Ptr : NULL__null; |
2491 | return Ptr; |
2492 | } |
2493 | |
2494 | int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr, |
2495 | int64_t Size) { |
2496 | assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large")(static_cast <bool> (DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large") ? void (0) : __assert_fail ("DeviceId < DeviceInfo().NumberOfDevices && \"Device ID too large\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 2496, __extension__ __PRETTY_FUNCTION__)); |
2497 | __tgt_async_info AsyncInfo; |
2498 | int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo); |
2499 | if (Rc != OFFLOAD_SUCCESS(0)) |
2500 | return OFFLOAD_FAIL(~0); |
2501 | |
2502 | return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); |
2503 | } |
2504 | |
2505 | int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr, |
2506 | int64_t Size, __tgt_async_info *AsyncInfo) { |
2507 | assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large")(static_cast <bool> (DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large") ? void (0) : __assert_fail ("DeviceId < DeviceInfo().NumberOfDevices && \"Device ID too large\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 2507, __extension__ __PRETTY_FUNCTION__)); |
2508 | if (AsyncInfo) { |
2509 | initAsyncInfo(AsyncInfo); |
2510 | return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo); |
2511 | } |
2512 | return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size); |
2513 | } |
2514 | |
2515 | int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr, |
2516 | int64_t Size) { |
2517 | assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large")(static_cast <bool> (DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large") ? void (0) : __assert_fail ("DeviceId < DeviceInfo().NumberOfDevices && \"Device ID too large\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 2517, __extension__ __PRETTY_FUNCTION__)); |
2518 | __tgt_async_info AsyncInfo; |
2519 | int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo); |
2520 | if (Rc != OFFLOAD_SUCCESS(0)) |
2521 | return OFFLOAD_FAIL(~0); |
2522 | |
2523 | return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); |
2524 | } |
2525 | |
2526 | int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr, |
2527 | int64_t Size, |
2528 | __tgt_async_info *AsyncInfo) { |
2529 | assert(AsyncInfo && "AsyncInfo is nullptr")(static_cast <bool> (AsyncInfo && "AsyncInfo is nullptr" ) ? void (0) : __assert_fail ("AsyncInfo && \"AsyncInfo is nullptr\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 2529, __extension__ __PRETTY_FUNCTION__)); |
2530 | assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large")(static_cast <bool> (DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large") ? void (0) : __assert_fail ("DeviceId < DeviceInfo().NumberOfDevices && \"Device ID too large\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 2530, __extension__ __PRETTY_FUNCTION__)); |
2531 | initAsyncInfo(AsyncInfo); |
2532 | return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo); |
2533 | } |
2534 | |
2535 | int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr, int32_t) { |
2536 | assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large")(static_cast <bool> (DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large") ? void (0) : __assert_fail ("DeviceId < DeviceInfo().NumberOfDevices && \"Device ID too large\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 2536, __extension__ __PRETTY_FUNCTION__)); |
2537 | // HSA can free pointers allocated from different types of memory pool. |
2538 | hsa_status_t Err; |
2539 | DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr){}; |
2540 | Err = core::Runtime::Memfree(TgtPtr); |
2541 | if (Err != HSA_STATUS_SUCCESS) { |
2542 | DP("Error when freeing CUDA memory\n"){}; |
2543 | return OFFLOAD_FAIL(~0); |
2544 | } |
2545 | return OFFLOAD_SUCCESS(0); |
2546 | } |
2547 | |
2548 | int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr, |
2549 | void **TgtArgs, ptrdiff_t *TgtOffsets, |
2550 | KernelArgsTy *KernelArgs, |
2551 | __tgt_async_info *AsyncInfo) { |
2552 | assert(!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] &&(static_cast <bool> (!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] && !KernelArgs->ThreadLimit [1] && !KernelArgs->ThreadLimit[2] && "Only one dimensional kernels supported." ) ? void (0) : __assert_fail ("!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] && !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] && \"Only one dimensional kernels supported.\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 2554, __extension__ __PRETTY_FUNCTION__)) |
2553 | !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&(static_cast <bool> (!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] && !KernelArgs->ThreadLimit [1] && !KernelArgs->ThreadLimit[2] && "Only one dimensional kernels supported." ) ? void (0) : __assert_fail ("!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] && !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] && \"Only one dimensional kernels supported.\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 2554, __extension__ __PRETTY_FUNCTION__)) |
2554 | "Only one dimensional kernels supported.")(static_cast <bool> (!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] && !KernelArgs->ThreadLimit [1] && !KernelArgs->ThreadLimit[2] && "Only one dimensional kernels supported." ) ? void (0) : __assert_fail ("!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] && !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] && \"Only one dimensional kernels supported.\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 2554, __extension__ __PRETTY_FUNCTION__)); |
2555 | assert(AsyncInfo && "AsyncInfo is nullptr")(static_cast <bool> (AsyncInfo && "AsyncInfo is nullptr" ) ? void (0) : __assert_fail ("AsyncInfo && \"AsyncInfo is nullptr\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 2555, __extension__ __PRETTY_FUNCTION__)); |
2556 | initAsyncInfo(AsyncInfo); |
2557 | |
2558 | DeviceInfo().LoadRunLock.lock_shared(); |
2559 | int32_t Res = |
2560 | runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, |
2561 | KernelArgs->NumArgs, KernelArgs->NumTeams[0], |
2562 | KernelArgs->ThreadLimit[0], KernelArgs->Tripcount); |
2563 | |
2564 | DeviceInfo().LoadRunLock.unlock_shared(); |
2565 | return Res; |
2566 | } |
2567 | |
2568 | int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) { |
2569 | assert(AsyncInfo && "AsyncInfo is nullptr")(static_cast <bool> (AsyncInfo && "AsyncInfo is nullptr" ) ? void (0) : __assert_fail ("AsyncInfo && \"AsyncInfo is nullptr\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 2569, __extension__ __PRETTY_FUNCTION__)); |
2570 | |
2571 | // Cuda asserts that AsyncInfo->Queue is non-null, but this invariant |
2572 | // is not ensured by devices.cpp for amdgcn |
2573 | // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr"); |
2574 | if (AsyncInfo->Queue) { |
2575 | finiAsyncInfo(AsyncInfo); |
2576 | } |
2577 | return OFFLOAD_SUCCESS(0); |
2578 | } |
2579 | |
2580 | void __tgt_rtl_print_device_info(int32_t DeviceId) { |
2581 | // TODO: Assertion to see if DeviceId is correct |
2582 | // NOTE: We don't need to set context for print device info. |
2583 | |
2584 | DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]); |
2585 | } |
2586 | |
2587 | int32_t __tgt_rtl_data_lock(int32_t DeviceId, void *HostPtr, int64_t Size, |
2588 | void **LockedHostPtr) { |
2589 | assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large")(static_cast <bool> (DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large") ? void (0) : __assert_fail ("DeviceId < DeviceInfo().NumberOfDevices && \"Device ID too large\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 2589, __extension__ __PRETTY_FUNCTION__)); |
2590 | |
2591 | hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId]; |
2592 | hsa_status_t err = lock_memory(HostPtr, Size, Agent, LockedHostPtr); |
2593 | if (err != HSA_STATUS_SUCCESS) { |
2594 | DP("Error in tgt_rtl_data_lock\n"){}; |
2595 | return OFFLOAD_FAIL(~0); |
2596 | } |
2597 | DP("Tgt lock host data %ld bytes, (HostPtr:%016llx).\n", Size,{} |
2598 | (long long unsigned)(Elf64_Addr)*LockedHostPtr){}; |
2599 | return OFFLOAD_SUCCESS(0); |
2600 | } |
2601 | |
2602 | int32_t __tgt_rtl_data_unlock(int DeviceId, void *HostPtr) { |
2603 | assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large")(static_cast <bool> (DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large") ? void (0) : __assert_fail ("DeviceId < DeviceInfo().NumberOfDevices && \"Device ID too large\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 2603, __extension__ __PRETTY_FUNCTION__)); |
2604 | hsa_status_t err = unlock_memory(HostPtr); |
2605 | if (err != HSA_STATUS_SUCCESS) { |
2606 | DP("Error in tgt_rtl_data_unlock\n"){}; |
2607 | return OFFLOAD_FAIL(~0); |
2608 | } |
2609 | |
2610 | DP("Tgt unlock data (tgt:%016llx).\n",{} |
2611 | (long long unsigned)(Elf64_Addr)HostPtr){}; |
2612 | return OFFLOAD_SUCCESS(0); |
2613 | } |
2614 | |
2615 | } // extern "C" |