| File: | build/source/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp |
| Warning: | line 2312, column 7 Value stored to 'Err' is never read |
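The statement flagged at line 2312 is outside this excerpt, but the diagnostic describes a pattern that recurs in this file: an `hsa_status_t` return value is stored into `Err` and then never read before it goes out of scope or is overwritten. A minimal sketch of the pattern and of the conventional fix used elsewhere in rtl.cpp (the helper names `freeIgnoringStatus`/`freeCheckingStatus` are illustrative, not the code at line 2312):

```cpp
// Dead store: Err receives a status that nothing reads afterwards.
static void freeIgnoringStatus(void *Ptr) {
  hsa_status_t Err = hsa_amd_memory_pool_free(Ptr); // flagged by the analyzer
}

// Fix: consume the status, as the surrounding code does for other HSA calls.
static void freeCheckingStatus(void *Ptr) {
  hsa_status_t Err = hsa_amd_memory_pool_free(Ptr);
  if (Err != HSA_STATUS_SUCCESS)
    DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(Err));
}
```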
| 1 | //===--- amdgpu/src/rtl.cpp --------------------------------------- C++ -*-===// |
| 2 | // |
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| 6 | // |
| 7 | //===----------------------------------------------------------------------===// |
| 8 | // |
| 9 | // RTL for AMD hsa machine |
| 10 | // |
| 11 | //===----------------------------------------------------------------------===// |
| 12 | |
| 13 | #include "llvm/ADT/StringRef.h" |
| 14 | #include "llvm/Frontend/OpenMP/OMPConstants.h" |
| 15 | #include "llvm/Frontend/OpenMP/OMPGridValues.h" |
| 16 | #include "llvm/Object/ELF.h" |
| 17 | #include "llvm/Object/ELFObjectFile.h" |
| 18 | |
| 19 | #include <algorithm> |
| 20 | #include <assert.h> |
| 21 | #include <cstdio> |
| 22 | #include <cstdlib> |
| 23 | #include <cstring> |
| 24 | #include <functional> |
| 25 | #include <list> |
| 26 | #include <memory> |
| 27 | #include <mutex> |
| 28 | #include <shared_mutex> |
| 29 | #include <unordered_map> |
| 30 | #include <vector> |
| 31 | |
| 32 | #include "ELFSymbols.h" |
| 33 | #include "impl_runtime.h" |
| 34 | #include "interop_hsa.h" |
| 35 | |
| 36 | #include "UtilitiesRTL.h" |
| 37 | #include "internal.h" |
| 38 | #include "rt.h" |
| 39 | |
| 40 | #include "DeviceEnvironment.h" |
| 41 | #include "get_elf_mach_gfx_name.h" |
| 42 | #include "omptargetplugin.h" |
| 43 | #include "print_tracing.h" |
| 44 | |
| 45 | using namespace llvm; |
| 46 | using namespace llvm::object; |
| 47 | using namespace llvm::ELF; |
| 48 | using namespace llvm::omp::target::plugin::utils; |
| 49 | |
| 50 | // hostrpc interface, FIXME: consider moving to its own include. These are
| 51 | // statically linked into amdgpu/plugin if present from hostrpc_services.a, |
| 52 | // linked as --whole-archive to override the weak symbols that are used to |
| 53 | // implement a fallback for toolchains that do not yet have a hostrpc library. |
| 54 | extern "C" { |
| 55 | uint64_t hostrpc_assign_buffer(hsa_agent_t Agent, hsa_queue_t *ThisQ, |
| 56 | uint32_t DeviceId); |
| 57 | hsa_status_t hostrpc_init(); |
| 58 | hsa_status_t hostrpc_terminate(); |
| 59 | |
| 60 | __attribute__((weak)) hsa_status_t hostrpc_init() { return HSA_STATUS_SUCCESS; } |
| 61 | __attribute__((weak)) hsa_status_t hostrpc_terminate() { |
| 62 | return HSA_STATUS_SUCCESS; |
| 63 | } |
| 64 | __attribute__((weak)) uint64_t hostrpc_assign_buffer(hsa_agent_t, hsa_queue_t *, |
| 65 | uint32_t DeviceId) { |
| 66 | DP("Warning: Attempting to assign hostrpc to device %u, but hostrpc library "{} |
| 67 | "missing\n",{} |
| 68 | DeviceId){}; |
| 69 | return 0; |
| 70 | } |
| 71 | } |
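The weak definitions above are the fallback path the comment describes: the plugin always compiles against these stubs, and a toolchain that ships hostrpc links `hostrpc_services.a` with `--whole-archive` so its strong symbols replace them. A minimal standalone sketch of that override mechanism; the file name, header path, and link line here are illustrative assumptions, not the actual build rules:

```cpp
// strong_override.cpp -- an illustrative member of hostrpc_services.a.
// Because the plugin's hostrpc_init() above is declared weak, this strong
// definition replaces it once the archive is linked in.
#include <hsa/hsa.h>

extern "C" hsa_status_t hostrpc_init() {
  // A real implementation would set up the hostrpc buffers here.
  return HSA_STATUS_SUCCESS;
}

// Plugin link line (illustrative): wrapping the archive in --whole-archive
// forces every member to be pulled in so its strong symbols win.
//   clang++ ... rtl.o -Wl,--whole-archive libhostrpc_services.a -Wl,--no-whole-archive
```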
| 72 | |
| 73 | // Heuristic parameters used for kernel launch |
| 74 | // Number of teams per CU to allow scheduling flexibility |
| 75 | static const unsigned DefaultTeamsPerCU = 4; |
| 76 | |
| 77 | int print_kernel_trace; |
| 78 | |
| 79 | #ifdef OMPTARGET_DEBUG |
| 80 | #define check(msg, status) \
| 81 | if (status != HSA_STATUS_SUCCESS) { \
| 82 | DP(#msg " failed\n"); \
| 83 | } else { \
| 84 | DP(#msg " succeeded\n"); \
| 85 | }
| 86 | #else
| 87 | #define check(msg, status) \
| 88 | {}
| 89 | #endif |
| 90 | |
| 91 | #include "elf_common.h" |
| 92 | |
| 93 | namespace hsa { |
| 94 | template <typename C> hsa_status_t iterate_agents(C Cb) { |
| 95 | auto L = [](hsa_agent_t Agent, void *Data) -> hsa_status_t { |
| 96 | C *Unwrapped = static_cast<C *>(Data); |
| 97 | return (*Unwrapped)(Agent); |
| 98 | }; |
| 99 | return hsa_iterate_agents(L, static_cast<void *>(&Cb)); |
| 100 | } |
| 101 | |
| 102 | template <typename C> |
| 103 | hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C Cb) { |
| 104 | auto L = [](hsa_amd_memory_pool_t MemoryPool, void *Data) -> hsa_status_t { |
| 105 | C *Unwrapped = static_cast<C *>(Data); |
| 106 | return (*Unwrapped)(MemoryPool); |
| 107 | }; |
| 108 | |
| 109 | return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&Cb)); |
| 110 | } |
| 111 | |
| 112 | } // namespace hsa |
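These thin wrappers exist so a capturing C++ lambda can be passed to the C-style HSA iteration callbacks, which only accept a function pointer plus a `void *`; `findAgents` further down uses `hsa::iterate_agents` exactly this way. A hedged usage sketch (the GPU-counting helper is illustrative, not part of rtl.cpp):

```cpp
// Count GPU agents via the lambda-friendly wrapper defined above.
static int countGpuAgents() {
  int NumGpus = 0;
  hsa_status_t Err = hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t {
    hsa_device_type_t Type;
    hsa_status_t Rc = hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &Type);
    if (Rc != HSA_STATUS_SUCCESS)
      return Rc; // stop iterating on the first query failure
    if (Type == HSA_DEVICE_TYPE_GPU)
      ++NumGpus;
    return HSA_STATUS_SUCCESS;
  });
  return Err == HSA_STATUS_SUCCESS ? NumGpus : -1;
}
```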
| 113 | |
| 114 | /// Keep entries table per device |
| 115 | struct FuncOrGblEntryTy { |
| 116 | __tgt_target_table Table; |
| 117 | std::vector<__tgt_offload_entry> Entries; |
| 118 | }; |
| 119 | |
| 120 | struct KernelArgPool { |
| 121 | private: |
| 122 | static pthread_mutex_t Mutex; |
| 123 | |
| 124 | public: |
| 125 | uint32_t KernargSegmentSize; |
| 126 | void *KernargRegion = nullptr; |
| 127 | std::queue<int> FreeKernargSegments; |
| 128 | |
| 129 | uint32_t kernargSizeIncludingImplicit() { |
| 130 | return KernargSegmentSize + sizeof(AMDGPUImplicitArgsTy); |
| 131 | } |
| 132 | |
| 133 | ~KernelArgPool() { |
| 134 | if (KernargRegion) { |
| 135 | auto R = hsa_amd_memory_pool_free(KernargRegion); |
| 136 | if (R != HSA_STATUS_SUCCESS) { |
| 137 | DP("hsa_amd_memory_pool_free failed: %s\n", get_error_string(R)){}; |
| 138 | } |
| 139 | } |
| 140 | } |
| 141 | |
| 142 | // Can't really copy or move a mutex |
| 143 | KernelArgPool() = default; |
| 144 | KernelArgPool(const KernelArgPool &) = delete; |
| 145 | KernelArgPool(KernelArgPool &&) = delete; |
| 146 | |
| 147 | KernelArgPool(uint32_t KernargSegmentSize, hsa_amd_memory_pool_t &MemoryPool) |
| 148 | : KernargSegmentSize(KernargSegmentSize) { |
| 149 | |
| 150 | // impl uses one pool per kernel for all gpus, with a fixed upper size |
| 151 | // preserving that exact scheme here, including the queue<int> |
| 152 | |
| 153 | hsa_status_t Err = hsa_amd_memory_pool_allocate( |
| 154 | MemoryPool, kernargSizeIncludingImplicit() * MAX_NUM_KERNELS, 0,
| 155 | &KernargRegion); |
| 156 | |
| 157 | if (Err != HSA_STATUS_SUCCESS) { |
| 158 | DP("hsa_amd_memory_pool_allocate failed: %s\n", get_error_string(Err)){}; |
| 159 | KernargRegion = nullptr; // paranoid |
| 160 | return; |
| 161 | } |
| 162 | |
| 163 | Err = core::allow_access_to_all_gpu_agents(KernargRegion); |
| 164 | if (Err != HSA_STATUS_SUCCESS) { |
| 165 | DP("hsa allow_access_to_all_gpu_agents failed: %s\n",{} |
| 166 | get_error_string(Err)){}; |
| 167 | auto R = hsa_amd_memory_pool_free(KernargRegion); |
| 168 | if (R != HSA_STATUS_SUCCESS) { |
| 169 | // if free failed, can't do anything more to resolve it |
| 170 | DP("hsa memory poll free failed: %s\n", get_error_string(Err)){}; |
| 171 | } |
| 172 | KernargRegion = nullptr; |
| 173 | return; |
| 174 | } |
| 175 | |
| 176 | for (int I = 0; I < MAX_NUM_KERNELS; I++) {
| 177 | FreeKernargSegments.push(I); |
| 178 | } |
| 179 | } |
| 180 | |
| 181 | void *allocate(uint64_t ArgNum) { |
| 182 | assert((ArgNum * sizeof(void *)) == KernargSegmentSize);
| 183 | Lock L(&Mutex); |
| 184 | void *Res = nullptr; |
| 185 | if (!FreeKernargSegments.empty()) { |
| 186 | |
| 187 | int FreeIdx = FreeKernargSegments.front(); |
| 188 | Res = static_cast<void *>(static_cast<char *>(KernargRegion) + |
| 189 | (FreeIdx * kernargSizeIncludingImplicit())); |
| 190 | assert(FreeIdx == pointerToIndex(Res));
| 191 | FreeKernargSegments.pop(); |
| 192 | } |
| 193 | return Res; |
| 194 | } |
| 195 | |
| 196 | void deallocate(void *Ptr) { |
| 197 | Lock L(&Mutex); |
| 198 | int Idx = pointerToIndex(Ptr); |
| 199 | FreeKernargSegments.push(Idx); |
| 200 | } |
| 201 | |
| 202 | private: |
| 203 | int pointerToIndex(void *Ptr) { |
| 204 | ptrdiff_t Bytes = |
| 205 | static_cast<char *>(Ptr) - static_cast<char *>(KernargRegion); |
| 206 | assert(Bytes >= 0);
| 207 | assert(Bytes % kernargSizeIncludingImplicit() == 0);
| 208 | return Bytes / kernargSizeIncludingImplicit(); |
| 209 | } |
| 210 | struct Lock { |
| 211 | Lock(pthread_mutex_t *M) : M(M) { pthread_mutex_lock(M); } |
| 212 | ~Lock() { pthread_mutex_unlock(M); } |
| 213 | pthread_mutex_t *M; |
| 214 | }; |
| 215 | }; |
| 216 | pthread_mutex_t KernelArgPool::Mutex = PTHREAD_MUTEX_INITIALIZER;
| 217 | |
| 218 | std::unordered_map<std::string /*kernel*/, std::unique_ptr<KernelArgPool>> |
| 219 | KernelArgPoolMap; |
| 220 | |
| 221 | /// Use a single entity to encode a kernel and a set of flags |
| 222 | struct KernelTy { |
| 223 | llvm::omp::OMPTgtExecModeFlags ExecutionMode; |
| 224 | int16_t ConstWGSize; |
| 225 | int32_t DeviceId; |
| 226 | void *CallStackAddr = nullptr; |
| 227 | const char *Name; |
| 228 | |
| 229 | KernelTy(llvm::omp::OMPTgtExecModeFlags ExecutionMode, int16_t ConstWgSize, |
| 230 | int32_t DeviceId, void *CallStackAddr, const char *Name, |
| 231 | uint32_t KernargSegmentSize, |
| 232 | hsa_amd_memory_pool_t &KernArgMemoryPool) |
| 233 | : ExecutionMode(ExecutionMode), ConstWGSize(ConstWgSize), |
| 234 | DeviceId(DeviceId), CallStackAddr(CallStackAddr), Name(Name) { |
| 235 | DP("Construct kernelinfo: ExecMode %d\n", ExecutionMode){}; |
| 236 | |
| 237 | std::string N(Name); |
| 238 | if (KernelArgPoolMap.find(N) == KernelArgPoolMap.end()) { |
| 239 | KernelArgPoolMap.insert( |
| 240 | std::make_pair(N, std::unique_ptr<KernelArgPool>(new KernelArgPool( |
| 241 | KernargSegmentSize, KernArgMemoryPool)))); |
| 242 | } |
| 243 | } |
| 244 | }; |
| 245 | |
| 246 | /// List that contains all the kernels. |
| 247 | /// FIXME: we may need this to be per device and per library. |
| 248 | std::list<KernelTy> KernelsList; |
| 249 | |
| 250 | template <typename Callback> static hsa_status_t findAgents(Callback CB) { |
| 251 | |
| 252 | hsa_status_t Err = |
| 253 | hsa::iterate_agents([&](hsa_agent_t Agent) -> hsa_status_t { |
| 254 | hsa_device_type_t DeviceType; |
| 255 | // get_info fails iff HSA runtime not yet initialized |
| 256 | hsa_status_t Err = |
| 257 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DeviceType); |
| 258 | |
| 259 | if (Err != HSA_STATUS_SUCCESS) { |
| 260 | if (print_kernel_trace > 0) |
| 261 | DP("rtl.cpp: err %s\n", get_error_string(Err)){}; |
| 262 | |
| 263 | return Err; |
| 264 | } |
| 265 | |
| 266 | CB(DeviceType, Agent); |
| 267 | return HSA_STATUS_SUCCESS; |
| 268 | }); |
| 269 | |
| 270 | // iterate_agents fails iff HSA runtime not yet initialized |
| 271 | if (print_kernel_trace > 0 && Err != HSA_STATUS_SUCCESS) { |
| 272 | DP("rtl.cpp: err %s\n", get_error_string(Err)){}; |
| 273 | } |
| 274 | |
| 275 | return Err; |
| 276 | } |
| 277 | |
| 278 | static void callbackQueue(hsa_status_t Status, hsa_queue_t *Source, |
| 279 | void *Data) { |
| 280 | if (Status != HSA_STATUS_SUCCESS) { |
| 281 | const char *StatusString; |
| 282 | if (hsa_status_string(Status, &StatusString) != HSA_STATUS_SUCCESS) { |
| 283 | StatusString = "unavailable"; |
| 284 | } |
| 285 | DP("[%s:%d] GPU error in queue %p %d (%s)\n", __FILE__, __LINE__, Source,{} |
| 286 | Status, StatusString){}; |
| 287 | abort(); |
| 288 | } |
| 289 | } |
| 290 | |
| 291 | namespace core { |
| 292 | namespace { |
| 293 | |
| 294 | bool checkResult(hsa_status_t Err, const char *ErrMsg) { |
| 295 | if (Err == HSA_STATUS_SUCCESS) |
| 296 | return true; |
| 297 | |
| 298 | REPORT("%s", ErrMsg)do { fprintf(stderr, "AMDGPU" " error: "); fprintf(stderr, "%s" , ErrMsg); } while (0);; |
| 299 | REPORT("%s", get_error_string(Err))do { fprintf(stderr, "AMDGPU" " error: "); fprintf(stderr, "%s" , get_error_string(Err)); } while (0);; |
| 300 | return false; |
| 301 | } |
| 302 | |
| 303 | void packetStoreRelease(uint32_t *Packet, uint16_t Header, uint16_t Rest) { |
| 304 | __atomic_store_n(Packet, Header | (Rest << 16), __ATOMIC_RELEASE);
| 305 | } |
| 306 | |
| 307 | uint16_t createHeader() { |
| 308 | uint16_t Header = HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE; |
| 309 | Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE; |
| 310 | Header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE; |
| 311 | return Header; |
| 312 | } |
| 313 | |
| 314 | hsa_status_t isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) { |
| 315 | bool AllocAllowed = false; |
| 316 | hsa_status_t Err = hsa_amd_memory_pool_get_info( |
| 317 | MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, |
| 318 | &AllocAllowed); |
| 319 | if (Err != HSA_STATUS_SUCCESS) { |
| 320 | DP("Alloc allowed in memory pool check failed: %s\n",{} |
| 321 | get_error_string(Err)){}; |
| 322 | return Err; |
| 323 | } |
| 324 | |
| 325 | size_t Size = 0; |
| 326 | Err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE, |
| 327 | &Size); |
| 328 | if (Err != HSA_STATUS_SUCCESS) { |
| 329 | DP("Get memory pool size failed: %s\n", get_error_string(Err)){}; |
| 330 | return Err; |
| 331 | } |
| 332 | |
| 333 | return (AllocAllowed && Size > 0) ? HSA_STATUS_SUCCESS : HSA_STATUS_ERROR; |
| 334 | } |
| 335 | |
| 336 | hsa_status_t addMemoryPool(hsa_amd_memory_pool_t MemoryPool, void *Data) { |
| 337 | std::vector<hsa_amd_memory_pool_t> *Result = |
| 338 | static_cast<std::vector<hsa_amd_memory_pool_t> *>(Data); |
| 339 | |
| 340 | hsa_status_t Err; |
| 341 | if ((Err = isValidMemoryPool(MemoryPool)) != HSA_STATUS_SUCCESS) { |
| 342 | return Err; |
| 343 | } |
| 344 | |
| 345 | Result->push_back(MemoryPool); |
| 346 | return HSA_STATUS_SUCCESS; |
| 347 | } |
| 348 | |
| 349 | } // namespace |
| 350 | } // namespace core |
| 351 | |
| 352 | struct EnvironmentVariables { |
| 353 | int NumTeams; |
| 354 | int TeamLimit; |
| 355 | int TeamThreadLimit; |
| 356 | int MaxTeamsDefault; |
| 357 | int DynamicMemSize; |
| 358 | }; |
| 359 | |
| 360 | template <uint32_t wavesize> |
| 361 | static constexpr const llvm::omp::GV &getGridValue() { |
| 362 | return llvm::omp::getAMDGPUGridValues<wavesize>(); |
| 363 | } |
| 364 | |
| 365 | struct HSALifetime { |
| 366 | // Wrapper around HSA used to ensure it is constructed before other types |
| 367 | // and destructed after, which means said other types can use raii for |
| 368 | // cleanup without risking running outside of the lifetime of HSA |
| 369 | const hsa_status_t S; |
| 370 | |
| 371 | bool HSAInitSuccess() { return S == HSA_STATUS_SUCCESS; } |
| 372 | HSALifetime() : S(hsa_init()) {} |
| 373 | |
| 374 | ~HSALifetime() { |
| 375 | if (S == HSA_STATUS_SUCCESS) { |
| 376 | hsa_status_t Err = hsa_shut_down(); |
| 377 | if (Err != HSA_STATUS_SUCCESS) { |
| 378 | // Can't call into HSA to get a string from the integer |
| 379 | DP("Shutting down HSA failed: %d\n", Err){}; |
| 380 | } |
| 381 | } |
| 382 | } |
| 383 | }; |
| 384 | |
| 385 | // Handle scheduling of multiple hsa_queue's per device to |
| 386 | // multiple threads (one scheduler per device) |
| 387 | class HSAQueueScheduler { |
| 388 | public: |
| 389 | HSAQueueScheduler() : Current(0) {} |
| 390 | |
| 391 | HSAQueueScheduler(const HSAQueueScheduler &) = delete; |
| 392 | |
| 393 | HSAQueueScheduler(HSAQueueScheduler &&Q) { |
| 394 | Current = Q.Current.load(); |
| 395 | for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { |
| 396 | HSAQueues[I] = Q.HSAQueues[I]; |
| 397 | Q.HSAQueues[I] = nullptr; |
| 398 | } |
| 399 | } |
| 400 | |
| 401 | // \return false if any HSA queue creation fails |
| 402 | bool createQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) { |
| 403 | for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { |
| 404 | hsa_queue_t *Q = nullptr; |
| 405 | hsa_status_t Rc = |
| 406 | hsa_queue_create(HSAAgent, QueueSize, HSA_QUEUE_TYPE_MULTI, |
| 407 | callbackQueue, NULL, UINT32_MAX, UINT32_MAX, &Q);
| 408 | if (Rc != HSA_STATUS_SUCCESS) {
| 409 | DP("Failed to create HSA queue %d\n", I);
| 410 | return false; |
| 411 | } |
| 412 | HSAQueues[I] = Q; |
| 413 | } |
| 414 | return true; |
| 415 | } |
| 416 | |
| 417 | ~HSAQueueScheduler() { |
| 418 | for (uint8_t I = 0; I < NUM_QUEUES_PER_DEVICE; I++) { |
| 419 | if (HSAQueues[I]) { |
| 420 | hsa_status_t Err = hsa_queue_destroy(HSAQueues[I]); |
| 421 | if (Err != HSA_STATUS_SUCCESS) |
| 422 | DP("Error destroying HSA queue"){}; |
| 423 | } |
| 424 | } |
| 425 | } |
| 426 | |
| 427 | // \return next queue to use for device |
| 428 | hsa_queue_t *next() { |
| 429 | return HSAQueues[(Current.fetch_add(1, std::memory_order_relaxed)) % |
| 430 | NUM_QUEUES_PER_DEVICE]; |
| 431 | } |
| 432 | |
| 433 | private: |
| 434 | // Number of queues per device |
| 435 | enum : uint8_t { NUM_QUEUES_PER_DEVICE = 4 }; |
| 436 | hsa_queue_t *HSAQueues[NUM_QUEUES_PER_DEVICE] = {}; |
| 437 | std::atomic<uint8_t> Current; |
| 438 | }; |
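A scheduler owns a small fixed set of queues for one device and hands them out with `next()`, which is just a relaxed atomic increment modulo `NUM_QUEUES_PER_DEVICE`, so concurrent host threads spread dispatches across queues without taking a lock. A hedged sketch of the intended setup and launch-time use; the `Schedulers` vector and the helper functions are placeholders mirroring what `RTLDeviceInfoTy` does below, not code from rtl.cpp:

```cpp
// One scheduler per device, populated during device initialization.
static std::vector<HSAQueueScheduler> Schedulers;

static bool addDeviceQueues(hsa_agent_t HSAAgent, uint32_t QueueSize) {
  HSAQueueScheduler QSched;
  if (!QSched.createQueues(HSAAgent, QueueSize))
    return false; // the device is unusable without its queues
  Schedulers.emplace_back(std::move(QSched));
  return true;
}

static hsa_queue_t *queueForDispatch(int32_t DeviceId) {
  // Round-robin selection; safe from multiple threads because next() only
  // performs a relaxed fetch_add on an atomic counter.
  return Schedulers[DeviceId].next();
}
```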
| 439 | |
| 440 | /// Class containing all the device information |
| 441 | class RTLDeviceInfoTy : HSALifetime { |
| 442 | std::vector<std::list<FuncOrGblEntryTy>> FuncGblEntries; |
| 443 | |
| 444 | struct QueueDeleter { |
| 445 | void operator()(hsa_queue_t *Q) { |
| 446 | if (Q) { |
| 447 | hsa_status_t Err = hsa_queue_destroy(Q); |
| 448 | if (Err != HSA_STATUS_SUCCESS) { |
| 449 | DP("Error destroying hsa queue: %s\n", get_error_string(Err)){}; |
| 450 | } |
| 451 | } |
| 452 | } |
| 453 | }; |
| 454 | |
| 455 | public: |
| 456 | bool ConstructionSucceeded = false; |
| 457 | |
| 458 | // load binary populates symbol tables and mutates various global state |
| 459 | // run uses those symbol tables |
| 460 | std::shared_timed_mutex LoadRunLock; |
| 461 | |
| 462 | int NumberOfDevices = 0; |
| 463 | |
| 464 | // GPU devices |
| 465 | std::vector<hsa_agent_t> HSAAgents; |
| 466 | std::vector<HSAQueueScheduler> HSAQueueSchedulers; // one per gpu |
| 467 | |
| 468 | // CPUs |
| 469 | std::vector<hsa_agent_t> CPUAgents; |
| 470 | |
| 471 | // Device properties |
| 472 | std::vector<int> ComputeUnits; |
| 473 | std::vector<int> GroupsPerDevice; |
| 474 | std::vector<int> ThreadsPerGroup; |
| 475 | std::vector<int> WarpSize; |
| 476 | std::vector<std::string> GPUName; |
| 477 | std::vector<std::string> TargetID; |
| 478 | |
| 479 | // OpenMP properties |
| 480 | std::vector<int> NumTeams; |
| 481 | std::vector<int> NumThreads; |
| 482 | |
| 483 | // OpenMP Environment properties |
| 484 | EnvironmentVariables Env; |
| 485 | |
| 486 | // OpenMP Requires Flags |
| 487 | int64_t RequiresFlags; |
| 488 | |
| 489 | // Resource pools |
| 490 | SignalPoolT FreeSignalPool; |
| 491 | |
| 492 | bool HostcallRequired = false; |
| 493 | |
| 494 | std::vector<hsa_executable_t> HSAExecutables; |
| 495 | |
| 496 | std::vector<std::map<std::string, atl_kernel_info_t>> KernelInfoTable; |
| 497 | std::vector<std::map<std::string, atl_symbol_info_t>> SymbolInfoTable; |
| 498 | |
| 499 | hsa_amd_memory_pool_t KernArgPool; |
| 500 | |
| 501 | // fine grained memory pool for host allocations |
| 502 | hsa_amd_memory_pool_t HostFineGrainedMemoryPool; |
| 503 | |
| 504 | // fine and coarse-grained memory pools per offloading device |
| 505 | std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools; |
| 506 | std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools; |
| 507 | |
| 508 | struct ImplFreePtrDeletor { |
| 509 | void operator()(void *P) { |
| 510 | core::Runtime::Memfree(P); // ignore failure to free |
| 511 | } |
| 512 | }; |
| 513 | |
| 514 | // device_State shared across loaded binaries, error if inconsistent size |
| 515 | std::vector<std::pair<std::unique_ptr<void, ImplFreePtrDeletor>, uint64_t>> |
| 516 | DeviceStateStore; |
| 517 | |
| 518 | static const unsigned HardTeamLimit = |
| 519 | (1 << 16) - 1; // 64K needed to fit in uint16 |
| 520 | static const int DefaultNumTeams = 128; |
| 521 | |
| 522 | // These need to be per-device since different devices can have different |
| 523 | // wave sizes, but are currently the same number for each so that refactor |
| 524 | // can be postponed. |
| 525 | static_assert(getGridValue<32>().GV_Max_Teams == |
| 526 | getGridValue<64>().GV_Max_Teams, |
| 527 | ""); |
| 528 | static const int MaxTeams = getGridValue<64>().GV_Max_Teams; |
| 529 | |
| 530 | static_assert(getGridValue<32>().GV_Max_WG_Size == |
| 531 | getGridValue<64>().GV_Max_WG_Size, |
| 532 | ""); |
| 533 | static const int MaxWgSize = getGridValue<64>().GV_Max_WG_Size; |
| 534 | |
| 535 | static_assert(getGridValue<32>().GV_Default_WG_Size == |
| 536 | getGridValue<64>().GV_Default_WG_Size, |
| 537 | ""); |
| 538 | static const int DefaultWgSize = getGridValue<64>().GV_Default_WG_Size; |
| 539 | |
| 540 | using MemcpyFunc = hsa_status_t (*)(hsa_signal_t, void *, void *, size_t Size, |
| 541 | hsa_agent_t, hsa_amd_memory_pool_t); |
| 542 | hsa_status_t freesignalpoolMemcpy(void *Dest, void *Src, size_t Size, |
| 543 | MemcpyFunc Func, int32_t DeviceId) { |
| 544 | hsa_agent_t Agent = HSAAgents[DeviceId]; |
| 545 | hsa_signal_t S = FreeSignalPool.pop(); |
| 546 | if (S.handle == 0) { |
| 547 | return HSA_STATUS_ERROR; |
| 548 | } |
| 549 | hsa_status_t R = Func(S, Dest, Src, Size, Agent, HostFineGrainedMemoryPool); |
| 550 | FreeSignalPool.push(S); |
| 551 | return R; |
| 552 | } |
| 553 | |
| 554 | hsa_status_t freesignalpoolMemcpyD2H(void *Dest, void *Src, size_t Size, |
| 555 | int32_t DeviceId) { |
| 556 | return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_d2h, DeviceId); |
| 557 | } |
| 558 | |
| 559 | hsa_status_t freesignalpoolMemcpyH2D(void *Dest, void *Src, size_t Size, |
| 560 | int32_t DeviceId) { |
| 561 | return freesignalpoolMemcpy(Dest, Src, Size, impl_memcpy_h2d, DeviceId); |
| 562 | } |
| 563 | |
| 564 | static void printDeviceInfo(int32_t DeviceId, hsa_agent_t Agent) { |
| 565 | char TmpChar[1000]; |
| 566 | uint16_t Major, Minor; |
| 567 | uint32_t TmpUInt; |
| 568 | uint32_t TmpUInt2; |
| 569 | uint32_t CacheSize[4]; |
| 570 | bool TmpBool; |
| 571 | uint16_t WorkgroupMaxDim[3]; |
| 572 | hsa_dim3_t GridMaxDim; |
| 573 | |
| 574 | // Getting basic information about HSA and Device |
| 575 | core::checkResult( |
| 576 | hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &Major), |
| 577 | "Error from hsa_system_get_info when obtaining " |
| 578 | "HSA_SYSTEM_INFO_VERSION_MAJOR\n"); |
| 579 | core::checkResult( |
| 580 | hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &Minor), |
| 581 | "Error from hsa_system_get_info when obtaining " |
| 582 | "HSA_SYSTEM_INFO_VERSION_MINOR\n"); |
| 583 | printf(" HSA Runtime Version: \t\t%u.%u \n", Major, Minor); |
| 584 | printf(" HSA OpenMP Device Number: \t\t%d \n", DeviceId); |
| 585 | core::checkResult( |
| 586 | hsa_agent_get_info( |
| 587 | Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, TmpChar), |
| 588 | "Error returned from hsa_agent_get_info when obtaining " |
| 589 | "HSA_AMD_AGENT_INFO_PRODUCT_NAME\n"); |
| 590 | printf(" Product Name: \t\t\t%s \n", TmpChar); |
| 591 | core::checkResult(hsa_agent_get_info(Agent, HSA_AGENT_INFO_NAME, TmpChar), |
| 592 | "Error returned from hsa_agent_get_info when obtaining " |
| 593 | "HSA_AGENT_INFO_NAME\n"); |
| 594 | printf(" Device Name: \t\t\t%s \n", TmpChar); |
| 595 | core::checkResult( |
| 596 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_VENDOR_NAME, TmpChar), |
| 597 | "Error returned from hsa_agent_get_info when obtaining " |
| 598 | "HSA_AGENT_INFO_NAME\n"); |
| 599 | printf(" Vendor Name: \t\t\t%s \n", TmpChar); |
| 600 | hsa_device_type_t DevType; |
| 601 | core::checkResult( |
| 602 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_DEVICE, &DevType), |
| 603 | "Error returned from hsa_agent_get_info when obtaining " |
| 604 | "HSA_AGENT_INFO_DEVICE\n"); |
| 605 | printf(" Device Type: \t\t\t%s \n", |
| 606 | DevType == HSA_DEVICE_TYPE_CPU |
| 607 | ? "CPU" |
| 608 | : (DevType == HSA_DEVICE_TYPE_GPU |
| 609 | ? "GPU" |
| 610 | : (DevType == HSA_DEVICE_TYPE_DSP ? "DSP" : "UNKNOWN"))); |
| 611 | core::checkResult( |
| 612 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUES_MAX, &TmpUInt), |
| 613 | "Error returned from hsa_agent_get_info when obtaining " |
| 614 | "HSA_AGENT_INFO_QUEUES_MAX\n"); |
| 615 | printf(" Max Queues: \t\t\t%u \n", TmpUInt); |
| 616 | core::checkResult( |
| 617 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &TmpUInt), |
| 618 | "Error returned from hsa_agent_get_info when obtaining " |
| 619 | "HSA_AGENT_INFO_QUEUE_MIN_SIZE\n"); |
| 620 | printf(" Queue Min Size: \t\t\t%u \n", TmpUInt); |
| 621 | core::checkResult( |
| 622 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, &TmpUInt), |
| 623 | "Error returned from hsa_agent_get_info when obtaining " |
| 624 | "HSA_AGENT_INFO_QUEUE_MAX_SIZE\n"); |
| 625 | printf(" Queue Max Size: \t\t\t%u \n", TmpUInt); |
| 626 | |
| 627 | // Getting cache information |
| 628 | printf(" Cache:\n"); |
| 629 | |
| 630 | // FIXME: This is deprecated according to HSA documentation. But using |
| 631 | // hsa_agent_iterate_caches and hsa_cache_get_info breaks execution during |
| 632 | // runtime. |
| 633 | core::checkResult( |
| 634 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_CACHE_SIZE, CacheSize), |
| 635 | "Error returned from hsa_agent_get_info when obtaining " |
| 636 | "HSA_AGENT_INFO_CACHE_SIZE\n"); |
| 637 | |
| 638 | for (int I = 0; I < 4; I++) { |
| 639 | if (CacheSize[I]) { |
| 640 | printf(" L%u: \t\t\t\t%u bytes\n", I, CacheSize[I]); |
| 641 | } |
| 642 | } |
| 643 | |
| 644 | core::checkResult( |
| 645 | hsa_agent_get_info(Agent, |
| 646 | (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CACHELINE_SIZE, |
| 647 | &TmpUInt), |
| 648 | "Error returned from hsa_agent_get_info when obtaining " |
| 649 | "HSA_AMD_AGENT_INFO_CACHELINE_SIZE\n"); |
| 650 | printf(" Cacheline Size: \t\t\t%u \n", TmpUInt); |
| 651 | core::checkResult( |
| 652 | hsa_agent_get_info( |
| 653 | Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, |
| 654 | &TmpUInt), |
| 655 | "Error returned from hsa_agent_get_info when obtaining " |
| 656 | "HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY\n"); |
| 657 | printf(" Max Clock Freq(MHz): \t\t%u \n", TmpUInt); |
| 658 | core::checkResult( |
| 659 | hsa_agent_get_info( |
| 660 | Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, |
| 661 | &TmpUInt), |
| 662 | "Error returned from hsa_agent_get_info when obtaining " |
| 663 | "HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT\n"); |
| 664 | printf(" Compute Units: \t\t\t%u \n", TmpUInt); |
| 665 | core::checkResult(hsa_agent_get_info( |
| 666 | Agent, |
| 667 | (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU, |
| 668 | &TmpUInt), |
| 669 | "Error returned from hsa_agent_get_info when obtaining " |
| 670 | "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n"); |
| 671 | printf(" SIMD per CU: \t\t\t%u \n", TmpUInt); |
| 672 | core::checkResult( |
| 673 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_FAST_F16_OPERATION, &TmpBool), |
| 674 | "Error returned from hsa_agent_get_info when obtaining " |
| 675 | "HSA_AMD_AGENT_INFO_NUM_SIMDS_PER_CU\n"); |
| 676 | printf(" Fast F16 Operation: \t\t%s \n", (TmpBool ? "TRUE" : "FALSE")); |
| 677 | core::checkResult( |
| 678 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &TmpUInt2), |
| 679 | "Error returned from hsa_agent_get_info when obtaining " |
| 680 | "HSA_AGENT_INFO_WAVEFRONT_SIZE\n"); |
| 681 | printf(" Wavefront Size: \t\t\t%u \n", TmpUInt2); |
| 682 | core::checkResult( |
| 683 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_SIZE, &TmpUInt), |
| 684 | "Error returned from hsa_agent_get_info when obtaining " |
| 685 | "HSA_AGENT_INFO_WORKGROUP_MAX_SIZE\n"); |
| 686 | printf(" Workgroup Max Size: \t\t%u \n", TmpUInt); |
| 687 | core::checkResult(hsa_agent_get_info(Agent, |
| 688 | HSA_AGENT_INFO_WORKGROUP_MAX_DIM, |
| 689 | WorkgroupMaxDim), |
| 690 | "Error returned from hsa_agent_get_info when obtaining " |
| 691 | "HSA_AGENT_INFO_WORKGROUP_MAX_DIM\n"); |
| 692 | printf(" Workgroup Max Size per Dimension:\n"); |
| 693 | printf(" x: \t\t\t\t%u\n", WorkgroupMaxDim[0]); |
| 694 | printf(" y: \t\t\t\t%u\n", WorkgroupMaxDim[1]); |
| 695 | printf(" z: \t\t\t\t%u\n", WorkgroupMaxDim[2]); |
| 696 | core::checkResult(hsa_agent_get_info( |
| 697 | Agent, |
| 698 | (hsa_agent_info_t)HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, |
| 699 | &TmpUInt), |
| 700 | "Error returned from hsa_agent_get_info when obtaining " |
| 701 | "HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU\n"); |
| 702 | printf(" Max Waves Per CU: \t\t\t%u \n", TmpUInt); |
| 703 | printf(" Max Work-item Per CU: \t\t%u \n", TmpUInt * TmpUInt2); |
| 704 | core::checkResult( |
| 705 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_SIZE, &TmpUInt), |
| 706 | "Error returned from hsa_agent_get_info when obtaining " |
| 707 | "HSA_AGENT_INFO_GRID_MAX_SIZE\n"); |
| 708 | printf(" Grid Max Size: \t\t\t%u \n", TmpUInt); |
| 709 | core::checkResult( |
| 710 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim), |
| 711 | "Error returned from hsa_agent_get_info when obtaining " |
| 712 | "HSA_AGENT_INFO_GRID_MAX_DIM\n"); |
| 713 | printf(" Grid Max Size per Dimension: \t\t\n"); |
| 714 | printf(" x: \t\t\t\t%u\n", GridMaxDim.x); |
| 715 | printf(" y: \t\t\t\t%u\n", GridMaxDim.y); |
| 716 | printf(" z: \t\t\t\t%u\n", GridMaxDim.z); |
| 717 | core::checkResult( |
| 718 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_FBARRIER_MAX_SIZE, &TmpUInt), |
| 719 | "Error returned from hsa_agent_get_info when obtaining " |
| 720 | "HSA_AGENT_INFO_FBARRIER_MAX_SIZE\n"); |
| 721 | printf(" Max fbarriers/Workgrp: \t\t%u\n", TmpUInt); |
| 722 | |
| 723 | printf(" Memory Pools:\n"); |
| 724 | auto CbMem = [](hsa_amd_memory_pool_t Region, void *Data) -> hsa_status_t { |
| 725 | std::string TmpStr; |
| 726 | size_t Size; |
| 727 | bool Alloc, Access; |
| 728 | hsa_amd_segment_t Segment; |
| 729 | hsa_amd_memory_pool_global_flag_t GlobalFlags; |
| 730 | core::checkResult( |
| 731 | hsa_amd_memory_pool_get_info( |
| 732 | Region, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags), |
| 733 | "Error returned from hsa_amd_memory_pool_get_info when obtaining " |
| 734 | "HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS\n"); |
| 735 | core::checkResult(hsa_amd_memory_pool_get_info( |
| 736 | Region, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &Segment), |
| 737 | "Error returned from hsa_amd_memory_pool_get_info when " |
| 738 | "obtaining HSA_AMD_MEMORY_POOL_INFO_SEGMENT\n"); |
| 739 | |
| 740 | switch (Segment) { |
| 741 | case HSA_AMD_SEGMENT_GLOBAL: |
| 742 | TmpStr = "GLOBAL; FLAGS: "; |
| 743 | if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT & GlobalFlags) |
| 744 | TmpStr += "KERNARG, "; |
| 745 | if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED & GlobalFlags) |
| 746 | TmpStr += "FINE GRAINED, "; |
| 747 | if (HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED & GlobalFlags) |
| 748 | TmpStr += "COARSE GRAINED, "; |
| 749 | break; |
| 750 | case HSA_AMD_SEGMENT_READONLY: |
| 751 | TmpStr = "READONLY"; |
| 752 | break; |
| 753 | case HSA_AMD_SEGMENT_PRIVATE: |
| 754 | TmpStr = "PRIVATE"; |
| 755 | break; |
| 756 | case HSA_AMD_SEGMENT_GROUP: |
| 757 | TmpStr = "GROUP"; |
| 758 | break; |
| 759 | } |
| 760 | printf(" Pool %s: \n", TmpStr.c_str()); |
| 761 | |
| 762 | core::checkResult(hsa_amd_memory_pool_get_info( |
| 763 | Region, HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size), |
| 764 | "Error returned from hsa_amd_memory_pool_get_info when " |
| 765 | "obtaining HSA_AMD_MEMORY_POOL_INFO_SIZE\n"); |
| 766 | printf(" Size: \t\t\t\t %zu bytes\n", Size); |
| 767 | core::checkResult( |
| 768 | hsa_amd_memory_pool_get_info( |
| 769 | Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Alloc), |
| 770 | "Error returned from hsa_amd_memory_pool_get_info when obtaining " |
| 771 | "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED\n"); |
| 772 | printf(" Allocatable: \t\t\t %s\n", (Alloc ? "TRUE" : "FALSE")); |
| 773 | core::checkResult( |
| 774 | hsa_amd_memory_pool_get_info( |
| 775 | Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE, &Size), |
| 776 | "Error returned from hsa_amd_memory_pool_get_info when obtaining " |
| 777 | "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE\n"); |
| 778 | printf(" Runtime Alloc Granule: \t\t %zu bytes\n", Size); |
| 779 | core::checkResult( |
| 780 | hsa_amd_memory_pool_get_info( |
| 781 | Region, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT, &Size), |
| 782 | "Error returned from hsa_amd_memory_pool_get_info when obtaining " |
| 783 | "HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT\n"); |
| 784 | printf(" Runtime Alloc alignment: \t %zu bytes\n", Size); |
| 785 | core::checkResult( |
| 786 | hsa_amd_memory_pool_get_info( |
| 787 | Region, HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL, &Access), |
| 788 | "Error returned from hsa_amd_memory_pool_get_info when obtaining " |
| 789 | "HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL\n"); |
| 790 | printf(" Accessable by all: \t\t %s\n", |
| 791 | (Access ? "TRUE" : "FALSE")); |
| 792 | |
| 793 | return HSA_STATUS_SUCCESS; |
| 794 | }; |
| 795 | // Iterate over all the memory regions for this agent. Get the memory region |
| 796 | // type and size |
| 797 | hsa_amd_agent_iterate_memory_pools(Agent, CbMem, nullptr); |
| 798 | |
| 799 | printf(" ISAs:\n"); |
| 800 | auto CBIsas = [](hsa_isa_t Isa, void *Data) -> hsa_status_t { |
| 801 | char TmpChar[1000]; |
| 802 | core::checkResult(hsa_isa_get_info_alt(Isa, HSA_ISA_INFO_NAME, TmpChar), |
| 803 | "Error returned from hsa_isa_get_info_alt when " |
| 804 | "obtaining HSA_ISA_INFO_NAME\n"); |
| 805 | printf(" Name: \t\t\t\t %s\n", TmpChar); |
| 806 | |
| 807 | return HSA_STATUS_SUCCESS; |
| 808 | }; |
| 809 | // Iterate over all the memory regions for this agent. Get the memory region |
| 810 | // type and size |
| 811 | hsa_agent_iterate_isas(Agent, CBIsas, nullptr); |
| 812 | } |
| 813 | |
| 814 | // Record entry point associated with device |
| 815 | void addOffloadEntry(int32_t DeviceId, __tgt_offload_entry Entry) { |
| 816 | assert(DeviceId < (int32_t)FuncGblEntries.size() &&
| 817 | "Unexpected device id!");
| 818 | FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); |
| 819 | |
| 820 | E.Entries.push_back(Entry); |
| 821 | } |
| 822 | |
| 823 | // Return true if the entry is associated with device |
| 824 | bool findOffloadEntry(int32_t DeviceId, void *Addr) { |
| 825 | assert(DeviceId < (int32_t)FuncGblEntries.size() &&
| 826 | "Unexpected device id!");
| 827 | FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); |
| 828 | |
| 829 | for (auto &It : E.Entries) { |
| 830 | if (It.addr == Addr) |
| 831 | return true; |
| 832 | } |
| 833 | |
| 834 | return false; |
| 835 | } |
| 836 | |
| 837 | // Return the pointer to the target entries table |
| 838 | __tgt_target_table *getOffloadEntriesTable(int32_t DeviceId) { |
| 839 | assert(DeviceId < (int32_t)FuncGblEntries.size() &&
| 840 | "Unexpected device id!");
| 841 | FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); |
| 842 | |
| 843 | int32_t Size = E.Entries.size(); |
| 844 | |
| 845 | // Table is empty |
| 846 | if (!Size) |
| 847 | return 0; |
| 848 | |
| 849 | __tgt_offload_entry *Begin = &E.Entries[0]; |
| 850 | __tgt_offload_entry *End = &E.Entries[Size - 1]; |
| 851 | |
| 852 | // Update table info according to the entries and return the pointer |
| 853 | E.Table.EntriesBegin = Begin; |
| 854 | E.Table.EntriesEnd = ++End; |
| 855 | |
| 856 | return &E.Table; |
| 857 | } |
| 858 | |
| 859 | // Clear entries table for a device |
| 860 | void clearOffloadEntriesTable(int DeviceId) { |
| 861 | assert(DeviceId < (int32_t)FuncGblEntries.size() &&
| 862 | "Unexpected device id!");
| 863 | FuncGblEntries[DeviceId].emplace_back(); |
| 864 | FuncOrGblEntryTy &E = FuncGblEntries[DeviceId].back(); |
| 865 | // KernelArgPoolMap.clear(); |
| 866 | E.Entries.clear(); |
| 867 | E.Table.EntriesBegin = E.Table.EntriesEnd = 0; |
| 868 | } |
| 869 | |
| 870 | hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool, |
| 871 | unsigned int DeviceId) { |
| 872 | assert(DeviceId < DeviceFineGrainedMemoryPools.size() && "Error here.");
| 873 | uint32_t GlobalFlags = 0; |
| 874 | hsa_status_t Err = hsa_amd_memory_pool_get_info( |
| 875 | MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); |
| 876 | |
| 877 | if (Err != HSA_STATUS_SUCCESS) { |
| 878 | return Err; |
| 879 | } |
| 880 | |
| 881 | if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { |
| 882 | DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool; |
| 883 | } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) { |
| 884 | DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool; |
| 885 | } |
| 886 | |
| 887 | return HSA_STATUS_SUCCESS; |
| 888 | } |
| 889 | |
| 890 | hsa_status_t setupDevicePools(const std::vector<hsa_agent_t> &Agents) { |
| 891 | for (unsigned int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) { |
| 892 | hsa_status_t Err = hsa::amd_agent_iterate_memory_pools( |
| 893 | Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) { |
| 894 | hsa_status_t ValidStatus = core::isValidMemoryPool(MemoryPool); |
| 895 | if (ValidStatus != HSA_STATUS_SUCCESS) { |
| 896 | DP("Alloc allowed in memory pool check failed: %s\n",{} |
| 897 | get_error_string(ValidStatus)){}; |
| 898 | return HSA_STATUS_SUCCESS; |
| 899 | } |
| 900 | return addDeviceMemoryPool(MemoryPool, DeviceId); |
| 901 | }); |
| 902 | |
| 903 | if (Err != HSA_STATUS_SUCCESS) { |
| 904 | DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,{} |
| 905 | "Iterate all memory pools", get_error_string(Err)){}; |
| 906 | return Err; |
| 907 | } |
| 908 | } |
| 909 | return HSA_STATUS_SUCCESS; |
| 910 | } |
| 911 | |
| 912 | hsa_status_t setupHostMemoryPools(std::vector<hsa_agent_t> &Agents) { |
| 913 | std::vector<hsa_amd_memory_pool_t> HostPools; |
| 914 | |
| 915 | // collect all the "valid" pools for all the given agents. |
| 916 | for (const auto &Agent : Agents) { |
| 917 | hsa_status_t Err = hsa_amd_agent_iterate_memory_pools( |
| 918 | Agent, core::addMemoryPool, static_cast<void *>(&HostPools)); |
| 919 | if (Err != HSA_STATUS_SUCCESS) { |
| 920 | DP("addMemoryPool returned %s, continuing\n", get_error_string(Err)){}; |
| 921 | } |
| 922 | } |
| 923 | |
| 924 | // We need two fine-grained pools. |
| 925 | // 1. One with kernarg flag set for storing kernel arguments |
| 926 | // 2. Second for host allocations |
| 927 | bool FineGrainedMemoryPoolSet = false; |
| 928 | bool KernArgPoolSet = false; |
| 929 | for (const auto &MemoryPool : HostPools) { |
| 930 | hsa_status_t Err = HSA_STATUS_SUCCESS; |
| 931 | uint32_t GlobalFlags = 0; |
| 932 | Err = hsa_amd_memory_pool_get_info( |
| 933 | MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags); |
| 934 | if (Err != HSA_STATUS_SUCCESS) { |
| 935 | DP("Get memory pool info failed: %s\n", get_error_string(Err)){}; |
| 936 | return Err; |
| 937 | } |
| 938 | |
| 939 | if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { |
| 940 | if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) { |
| 941 | KernArgPool = MemoryPool; |
| 942 | KernArgPoolSet = true; |
| 943 | } else { |
| 944 | HostFineGrainedMemoryPool = MemoryPool; |
| 945 | FineGrainedMemoryPoolSet = true; |
| 946 | } |
| 947 | } |
| 948 | } |
| 949 | |
| 950 | if (FineGrainedMemoryPoolSet && KernArgPoolSet) |
| 951 | return HSA_STATUS_SUCCESS; |
| 952 | |
| 953 | return HSA_STATUS_ERROR; |
| 954 | } |
| 955 | |
| 956 | hsa_amd_memory_pool_t getDeviceMemoryPool(unsigned int DeviceId) { |
| 957 | assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() &&
| 958 | "Invalid device Id");
| 959 | return DeviceCoarseGrainedMemoryPools[DeviceId]; |
| 960 | } |
| 961 | |
| 962 | hsa_amd_memory_pool_t getHostMemoryPool() { |
| 963 | return HostFineGrainedMemoryPool; |
| 964 | } |
| 965 | |
| 966 | static int readEnv(const char *Env, int Default = -1) { |
| 967 | const char *EnvStr = getenv(Env); |
| 968 | int Res = Default; |
| 969 | if (EnvStr) { |
| 970 | Res = std::stoi(EnvStr); |
| 971 | DP("Parsed %s=%d\n", Env, Res){}; |
| 972 | } |
| 973 | return Res; |
| 974 | } |
| 975 | |
| 976 | RTLDeviceInfoTy() { |
| 977 | DP("Start initializing " GETNAME(TARGET_NAME) "\n"){}; |
| 978 | |
| 979 | // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr |
| 980 | // anytime. You do not need a debug library build. |
| 981 | // 0 => no tracing |
| 982 | // 1 => tracing dispatch only |
| 983 | // >1 => verbosity increase |
| 984 | |
| 985 | if (!HSAInitSuccess()) { |
| 986 | DP("Error when initializing HSA in " GETNAME(TARGET_NAME) "\n"){}; |
| 987 | return; |
| 988 | } |
| 989 | |
| 990 | if (char *EnvStr = getenv("LIBOMPTARGET_KERNEL_TRACE")) |
| 991 | print_kernel_trace = atoi(EnvStr); |
| 992 | else |
| 993 | print_kernel_trace = 0; |
| 994 | |
| 995 | hsa_status_t Err = core::atl_init_gpu_context(); |
| 996 | if (Err != HSA_STATUS_SUCCESS) { |
| 997 | DP("Error when initializing " GETNAME(TARGET_NAME) "\n"){}; |
| 998 | return; |
| 999 | } |
| 1000 | |
| 1001 | // Init hostcall soon after initializing hsa |
| 1002 | hostrpc_init(); |
| 1003 | |
| 1004 | Err = findAgents([&](hsa_device_type_t DeviceType, hsa_agent_t Agent) { |
| 1005 | if (DeviceType == HSA_DEVICE_TYPE_CPU) { |
| 1006 | CPUAgents.push_back(Agent); |
| 1007 | } else { |
| 1008 | HSAAgents.push_back(Agent); |
| 1009 | } |
| 1010 | }); |
| 1011 | if (Err != HSA_STATUS_SUCCESS) |
| 1012 | return; |
| 1013 | |
| 1014 | NumberOfDevices = (int)HSAAgents.size(); |
| 1015 | |
| 1016 | if (NumberOfDevices == 0) { |
| 1017 | DP("There are no devices supporting HSA.\n"){}; |
| 1018 | return; |
| 1019 | } |
| 1020 | DP("There are %d devices supporting HSA.\n", NumberOfDevices){}; |
| 1021 | |
| 1022 | // Init the device info |
| 1023 | HSAQueueSchedulers.reserve(NumberOfDevices); |
| 1024 | FuncGblEntries.resize(NumberOfDevices); |
| 1025 | ThreadsPerGroup.resize(NumberOfDevices); |
| 1026 | ComputeUnits.resize(NumberOfDevices); |
| 1027 | GPUName.resize(NumberOfDevices); |
| 1028 | GroupsPerDevice.resize(NumberOfDevices); |
| 1029 | WarpSize.resize(NumberOfDevices); |
| 1030 | NumTeams.resize(NumberOfDevices); |
| 1031 | NumThreads.resize(NumberOfDevices); |
| 1032 | DeviceStateStore.resize(NumberOfDevices); |
| 1033 | KernelInfoTable.resize(NumberOfDevices); |
| 1034 | SymbolInfoTable.resize(NumberOfDevices); |
| 1035 | DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices); |
| 1036 | DeviceFineGrainedMemoryPools.resize(NumberOfDevices); |
| 1037 | |
| 1038 | Err = setupDevicePools(HSAAgents); |
| 1039 | if (Err != HSA_STATUS_SUCCESS) { |
| 1040 | DP("Setup for Device Memory Pools failed\n"){}; |
| 1041 | return; |
| 1042 | } |
| 1043 | |
| 1044 | Err = setupHostMemoryPools(CPUAgents); |
| 1045 | if (Err != HSA_STATUS_SUCCESS) { |
| 1046 | DP("Setup for Host Memory Pools failed\n"){}; |
| 1047 | return; |
| 1048 | } |
| 1049 | |
| 1050 | for (int I = 0; I < NumberOfDevices; I++) { |
| 1051 | uint32_t QueueSize = 0; |
| 1052 | { |
| 1053 | hsa_status_t Err = hsa_agent_get_info( |
| 1054 | HSAAgents[I], HSA_AGENT_INFO_QUEUE_MAX_SIZE, &QueueSize); |
| 1055 | if (Err != HSA_STATUS_SUCCESS) { |
| 1056 | DP("HSA query QUEUE_MAX_SIZE failed for agent %d\n", I){}; |
| 1057 | return; |
| 1058 | } |
| 1059 | enum { MaxQueueSize = 4096 }; |
| 1060 | if (QueueSize > MaxQueueSize) { |
| 1061 | QueueSize = MaxQueueSize; |
| 1062 | } |
| 1063 | } |
| 1064 | |
| 1065 | { |
| 1066 | HSAQueueScheduler QSched; |
| 1067 | if (!QSched.createQueues(HSAAgents[I], QueueSize)) |
| 1068 | return; |
| 1069 | HSAQueueSchedulers.emplace_back(std::move(QSched)); |
| 1070 | } |
| 1071 | |
| 1072 | DeviceStateStore[I] = {nullptr, 0}; |
| 1073 | } |
| 1074 | |
| 1075 | for (int I = 0; I < NumberOfDevices; I++) { |
| 1076 | ThreadsPerGroup[I] = RTLDeviceInfoTy::DefaultWgSize; |
| 1077 | GroupsPerDevice[I] = RTLDeviceInfoTy::DefaultNumTeams; |
| 1078 | ComputeUnits[I] = 1; |
| 1079 | DP("Device %d: Initial groupsPerDevice %d & threadsPerGroup %d\n", I,{} |
| 1080 | GroupsPerDevice[I], ThreadsPerGroup[I]){}; |
| 1081 | } |
| 1082 | |
| 1083 | // Get environment variables regarding teams |
| 1084 | Env.TeamLimit = readEnv("OMP_TEAM_LIMIT"); |
| 1085 | Env.NumTeams = readEnv("OMP_NUM_TEAMS"); |
| 1086 | Env.MaxTeamsDefault = readEnv("OMP_MAX_TEAMS_DEFAULT"); |
| 1087 | Env.TeamThreadLimit = readEnv("OMP_TEAMS_THREAD_LIMIT"); |
| 1088 | Env.DynamicMemSize = readEnv("LIBOMPTARGET_SHARED_MEMORY_SIZE", 0); |
| 1089 | |
| 1090 | // Default state. |
| 1091 | RequiresFlags = OMP_REQ_UNDEFINED; |
| 1092 | |
| 1093 | ConstructionSucceeded = true; |
| 1094 | } |
| 1095 | |
| 1096 | ~RTLDeviceInfoTy() { |
| 1097 | DP("Finalizing the " GETNAME(TARGET_NAME) " DeviceInfo.\n"){}; |
| 1098 | if (!HSAInitSuccess()) { |
| 1099 | // Then none of these can have been set up and they can't be torn down |
| 1100 | return; |
| 1101 | } |
| 1102 | // Run destructors on types that use HSA before |
| 1103 | // impl_finalize removes access to it |
| 1104 | DeviceStateStore.clear(); |
| 1105 | KernelArgPoolMap.clear(); |
| 1106 | // Terminate hostrpc before finalizing hsa |
| 1107 | hostrpc_terminate(); |
| 1108 | |
| 1109 | hsa_status_t Err; |
| 1110 | for (uint32_t I = 0; I < HSAExecutables.size(); I++) { |
| 1111 | Err = hsa_executable_destroy(HSAExecutables[I]); |
| 1112 | if (Err != HSA_STATUS_SUCCESS) { |
| 1113 | DP("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,{} |
| 1114 | "Destroying executable", get_error_string(Err)){}; |
| 1115 | } |
| 1116 | } |
| 1117 | } |
| 1118 | }; |
| 1119 | |
| 1120 | pthread_mutex_t SignalPoolT::mutex = PTHREAD_MUTEX_INITIALIZER;
| 1121 | |
| 1122 | // Putting accesses to DeviceInfo global behind a function call prior |
| 1123 | // to changing to use init_plugin/deinit_plugin calls |
| 1124 | static RTLDeviceInfoTy DeviceInfoState; |
| 1125 | static RTLDeviceInfoTy &DeviceInfo() { return DeviceInfoState; } |
| 1126 | |
| 1127 | namespace { |
| 1128 | |
| 1129 | int32_t dataRetrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr, int64_t Size, |
| 1130 | __tgt_async_info *AsyncInfo) { |
| 1131 | assert(AsyncInfo && "AsyncInfo is nullptr");
| 1132 | assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
| 1133 | // Return success if we are not copying back to host from target. |
| 1134 | if (!HstPtr) |
| 1135 | return OFFLOAD_SUCCESS;
| 1136 | hsa_status_t Err; |
| 1137 | DP("Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,{} |
| 1138 | (long long unsigned)(Elf64_Addr)TgtPtr,{} |
| 1139 | (long long unsigned)(Elf64_Addr)HstPtr){}; |
| 1140 | |
| 1141 | Err = DeviceInfo().freesignalpoolMemcpyD2H(HstPtr, TgtPtr, (size_t)Size, |
| 1142 | DeviceId); |
| 1143 | |
| 1144 | if (Err != HSA_STATUS_SUCCESS) { |
| 1145 | DP("Error when copying data from device to host. Pointers: "{} |
| 1146 | "host = 0x%016lx, device = 0x%016lx, size = %lld\n",{} |
| 1147 | (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size){}; |
| 1148 | return OFFLOAD_FAIL(~0); |
| 1149 | } |
| 1150 | DP("DONE Retrieve data %ld bytes, (tgt:%016llx) -> (hst:%016llx).\n", Size,{} |
| 1151 | (long long unsigned)(Elf64_Addr)TgtPtr,{} |
| 1152 | (long long unsigned)(Elf64_Addr)HstPtr){}; |
| 1153 | return OFFLOAD_SUCCESS(0); |
| 1154 | } |
| 1155 | |
| 1156 | int32_t dataSubmit(int32_t DeviceId, void *TgtPtr, void *HstPtr, int64_t Size, |
| 1157 | __tgt_async_info *AsyncInfo) { |
| 1158 | assert(AsyncInfo && "AsyncInfo is nullptr");
| 1159 | hsa_status_t Err;
| 1160 | assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
| 1161 | // Return success if we are not doing host to target. |
| 1162 | if (!HstPtr) |
| 1163 | return OFFLOAD_SUCCESS;
| 1164 | |
| 1165 | DP("Submit data %ld bytes, (hst:%016llx) -> (tgt:%016llx).\n", Size,{} |
| 1166 | (long long unsigned)(Elf64_Addr)HstPtr,{} |
| 1167 | (long long unsigned)(Elf64_Addr)TgtPtr){}; |
| 1168 | Err = DeviceInfo().freesignalpoolMemcpyH2D(TgtPtr, HstPtr, (size_t)Size, |
| 1169 | DeviceId); |
| 1170 | if (Err != HSA_STATUS_SUCCESS) { |
| 1171 | DP("Error when copying data from host to device. Pointers: "{} |
| 1172 | "host = 0x%016lx, device = 0x%016lx, size = %lld\n",{} |
| 1173 | (Elf64_Addr)HstPtr, (Elf64_Addr)TgtPtr, (unsigned long long)Size){}; |
| 1174 | return OFFLOAD_FAIL(~0); |
| 1175 | } |
| 1176 | return OFFLOAD_SUCCESS;
| 1177 | } |
| 1178 | |
| 1179 | // Async. |
| 1180 | // The implementation was written with cuda streams in mind. The semantics of |
| 1181 | // that are to execute kernels on a queue in order of insertion. A synchronise |
| 1182 | // call then makes writes visible between host and device. This means a series |
| 1183 | // of N data_submit_async calls are expected to execute serially. HSA offers |
| 1184 | // various options to run the data copies concurrently. This may require changes |
| 1185 | // to libomptarget. |
| 1186 | |
| 1187 | // __tgt_async_info* contains a void * Queue. Queue = 0 is used to indicate that |
| 1188 | // there are no outstanding kernels that need to be synchronized. Any async call |
| 1189 | // may be passed a Queue==0, at which point the cuda implementation will set it |
| 1190 | // to non-null (see getStream). The cuda streams are per-device. Upstream may |
| 1191 | // change this interface to explicitly initialize the AsyncInfo_pointer, but |
| 1192 | // until then hsa lazily initializes it as well. |
| 1193 | |
| 1194 | void initAsyncInfo(__tgt_async_info *AsyncInfo) { |
| 1195 | // set non-null while using async calls, return to null to indicate completion |
| 1196 | assert(AsyncInfo);
| 1197 | if (!AsyncInfo->Queue) {
| 1198 | AsyncInfo->Queue = reinterpret_cast<void *>(UINT64_MAX);
| 1199 | } |
| 1200 | } |
| 1201 | void finiAsyncInfo(__tgt_async_info *AsyncInfo) { |
| 1202 | assert(AsyncInfo);
| 1203 | assert(AsyncInfo->Queue);
| 1204 | AsyncInfo->Queue = 0; |
| 1205 | } |
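`initAsyncInfo` and `finiAsyncInfo` implement the sentinel described in the comment block above: a non-null `Queue` marks that asynchronous work has been issued, and a later synchronize clears it. A hedged sketch of how an async entry point would combine them with the blocking copy helper `dataSubmit` shown earlier; the function name and exact call sequence are illustrative and may not match the entry points later in rtl.cpp:

```cpp
// Illustrative async submit: the copy itself is synchronous today, so all the
// helper has to do is mark the AsyncInfo as "in flight" for a later synchronize.
static int32_t dataSubmitAsyncSketch(int32_t DeviceId, void *TgtPtr, void *HstPtr,
                                     int64_t Size, __tgt_async_info *AsyncInfo) {
  assert(AsyncInfo && "AsyncInfo is nullptr");
  initAsyncInfo(AsyncInfo); // Queue becomes the non-null sentinel
  return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo);
}

// The matching synchronize would wait for any outstanding work (none is queued
// in this sketch) and then call finiAsyncInfo(AsyncInfo) to reset the sentinel.
```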
| 1206 | |
| 1207 | // Determine launch values for kernel. |
| 1208 | struct LaunchVals { |
| 1209 | int WorkgroupSize; |
| 1210 | int GridSize; |
| 1211 | }; |
| 1212 | LaunchVals getLaunchVals(int WarpSize, EnvironmentVariables Env, |
| 1213 | int ConstWGSize, |
| 1214 | llvm::omp::OMPTgtExecModeFlags ExecutionMode, |
| 1215 | int NumTeams, int ThreadLimit, uint64_t LoopTripcount, |
| 1216 | int DeviceNumTeams) { |
| 1217 | |
| 1218 | int ThreadsPerGroup = RTLDeviceInfoTy::DefaultWgSize; |
| 1219 | int NumGroups = 0; |
| 1220 | |
| 1221 | int MaxTeams = Env.MaxTeamsDefault > 0 ? Env.MaxTeamsDefault : DeviceNumTeams; |
| 1222 | if (MaxTeams > static_cast<int>(RTLDeviceInfoTy::HardTeamLimit)) |
| 1223 | MaxTeams = RTLDeviceInfoTy::HardTeamLimit; |
| 1224 | |
| 1225 | if (print_kernel_trace & STARTUP_DETAILS) { |
| 1226 | DP("RTLDeviceInfoTy::Max_Teams: %d\n", RTLDeviceInfoTy::MaxTeams){}; |
| 1227 | DP("Max_Teams: %d\n", MaxTeams){}; |
| 1228 | DP("RTLDeviceInfoTy::Warp_Size: %d\n", WarpSize){}; |
| 1229 | DP("RTLDeviceInfoTy::Max_WG_Size: %d\n", RTLDeviceInfoTy::MaxWgSize){}; |
| 1230 | DP("RTLDeviceInfoTy::Default_WG_Size: %d\n",{} |
| 1231 | RTLDeviceInfoTy::DefaultWgSize){}; |
| 1232 | DP("thread_limit: %d\n", ThreadLimit){}; |
| 1233 | DP("threadsPerGroup: %d\n", ThreadsPerGroup){}; |
| 1234 | DP("ConstWGSize: %d\n", ConstWGSize){}; |
| 1235 | } |
| 1236 | // check for thread_limit() clause |
| 1237 | if (ThreadLimit > 0) { |
| 1238 | ThreadsPerGroup = ThreadLimit; |
| 1239 | DP("Setting threads per block to requested %d\n", ThreadLimit){}; |
| 1240 | // Add master warp for GENERIC |
| 1241 | if (ExecutionMode == |
| 1242 | llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { |
| 1243 | ThreadsPerGroup += WarpSize; |
| 1244 | DP("Adding master wavefront: +%d threads\n", WarpSize){}; |
| 1245 | } |
| 1246 | if (ThreadsPerGroup > RTLDeviceInfoTy::MaxWgSize) { // limit to max |
| 1247 | ThreadsPerGroup = RTLDeviceInfoTy::MaxWgSize; |
| 1248 | DP("Setting threads per block to maximum %d\n", ThreadsPerGroup){}; |
| 1249 | } |
| 1250 | } |
| 1251 | // check flat_max_work_group_size attr here |
| 1252 | if (ThreadsPerGroup > ConstWGSize) { |
| 1253 | ThreadsPerGroup = ConstWGSize; |
| 1254 | DP("Reduced threadsPerGroup to flat-attr-group-size limit %d\n",{} |
| 1255 | ThreadsPerGroup){}; |
| 1256 | } |
| 1257 | if (print_kernel_trace & STARTUP_DETAILS) |
| 1258 | DP("threadsPerGroup: %d\n", ThreadsPerGroup){}; |
| 1259 | DP("Preparing %d threads\n", ThreadsPerGroup){}; |
| 1260 | |
| 1261 | // Set default num_groups (teams) |
| 1262 | if (Env.TeamLimit > 0) |
| 1263 | NumGroups = (MaxTeams < Env.TeamLimit) ? MaxTeams : Env.TeamLimit; |
| 1264 | else |
| 1265 | NumGroups = MaxTeams; |
| 1266 | DP("Set default num of groups %d\n", NumGroups){}; |
| 1267 | |
| 1268 | if (print_kernel_trace & STARTUP_DETAILS) { |
| 1269 | DP("num_groups: %d\n", NumGroups){}; |
| 1270 | DP("num_teams: %d\n", NumTeams){}; |
| 1271 | } |
| 1272 | |
| 1273 | // Reduce num_groups if threadsPerGroup exceeds RTLDeviceInfoTy::Max_WG_Size.
| 1274 | // This reduction is typical for the default case (no thread_limit clause)
| 1275 | // or when the user requests an excessive num_teams value.
| 1276 | // FIXME: We can't distinguish between a constant and a variable thread limit,
| 1277 | // so we only handle constant thread_limits.
| 1278 | if (ThreadsPerGroup > |
| 1279 | RTLDeviceInfoTy::DefaultWgSize) // 256 < threadsPerGroup <= 1024 |
| 1280 | // Should we round threadsPerGroup up to nearest WarpSize |
| 1281 | // here? |
| 1282 | NumGroups = (MaxTeams * RTLDeviceInfoTy::MaxWgSize) / ThreadsPerGroup; |
| 1283 | |
| 1284 | // check for num_teams() clause |
| 1285 | if (NumTeams > 0) { |
| 1286 | NumGroups = (NumTeams < NumGroups) ? NumTeams : NumGroups; |
| 1287 | } |
| 1288 | if (print_kernel_trace & STARTUP_DETAILS) { |
| 1289 | DP("num_groups: %d\n", NumGroups){}; |
| 1290 | DP("Env.NumTeams %d\n", Env.NumTeams){}; |
| 1291 | DP("Env.TeamLimit %d\n", Env.TeamLimit){}; |
| 1292 | } |
| 1293 | |
| 1294 | if (Env.NumTeams > 0) { |
| 1295 | NumGroups = (Env.NumTeams < NumGroups) ? Env.NumTeams : NumGroups; |
| 1296 | DP("Modifying teams based on Env.NumTeams %d\n", Env.NumTeams){}; |
| 1297 | } else if (Env.TeamLimit > 0) { |
| 1298 | NumGroups = (Env.TeamLimit < NumGroups) ? Env.TeamLimit : NumGroups; |
| 1299 | DP("Modifying teams based on Env.TeamLimit %d\n", Env.TeamLimit){};
| 1300 | } else { |
| 1301 | if (NumTeams <= 0) { |
| 1302 | if (LoopTripcount > 0) { |
| 1303 | if (ExecutionMode == |
| 1304 | llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD) { |
| 1305 | // round up to the nearest integer |
| 1306 | NumGroups = ((LoopTripcount - 1) / ThreadsPerGroup) + 1; |
| 1307 | } else if (ExecutionMode == |
| 1308 | llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC) { |
| 1309 | NumGroups = LoopTripcount; |
| 1310 | } else /* OMP_TGT_EXEC_MODE_GENERIC_SPMD */ { |
| 1311 | // This is a generic kernel that was transformed to use SPMD-mode |
| 1312 | // execution but uses Generic-mode semantics for scheduling. |
| 1313 | NumGroups = LoopTripcount; |
| 1314 | } |
| 1315 | DP("Using %d teams due to loop trip count %" PRIu64 " and number of "{} |
| 1316 | "threads per block %d\n",{} |
| 1317 | NumGroups, LoopTripcount, ThreadsPerGroup){}; |
| 1318 | } |
| 1319 | } else { |
| 1320 | NumGroups = NumTeams; |
| 1321 | } |
| 1322 | if (NumGroups > MaxTeams) { |
| 1323 | NumGroups = MaxTeams; |
| 1324 | if (print_kernel_trace & STARTUP_DETAILS) |
| 1325 | DP("Limiting num_groups %d to Max_Teams %d \n", NumGroups, MaxTeams){}; |
| 1326 | } |
| 1327 | if (NumGroups > NumTeams && NumTeams > 0) { |
| 1328 | NumGroups = NumTeams; |
| 1329 | if (print_kernel_trace & STARTUP_DETAILS) |
| 1330 | DP("Limiting num_groups %d to clause num_teams %d \n", NumGroups,{} |
| 1331 | NumTeams){}; |
| 1332 | } |
| 1333 | } |
| 1334 | |
| 1335 | // The num_teams clause is always honored; it is only capped by Env.MaxTeamsDefault when set.
| 1336 | if (NumTeams > 0) { |
| 1337 | NumGroups = NumTeams; |
| 1338 | // Cap num_groups to EnvMaxTeamsDefault if set. |
| 1339 | if (Env.MaxTeamsDefault > 0 && NumGroups > Env.MaxTeamsDefault) |
| 1340 | NumGroups = Env.MaxTeamsDefault; |
| 1341 | } |
| 1342 | if (print_kernel_trace & STARTUP_DETAILS) { |
| 1343 | DP("threadsPerGroup: %d\n", ThreadsPerGroup){}; |
| 1344 | DP("num_groups: %d\n", NumGroups){}; |
| 1345 | DP("loop_tripcount: %lu\n", LoopTripcount){};
| 1346 | } |
| 1347 | DP("Final %d num_groups and %d threadsPerGroup\n", NumGroups,{} |
| 1348 | ThreadsPerGroup){}; |
| 1349 | |
| 1350 | LaunchVals Res; |
| 1351 | Res.WorkgroupSize = ThreadsPerGroup; |
| 1352 | Res.GridSize = ThreadsPerGroup * NumGroups; |
| 1353 | return Res; |
| 1354 | } |
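Editor's worked example (illustrative, not upstream code): for a plain SPMD region with no num_teams or thread_limit clause, ConstWGSize = 256 (the default), LoopTripcount = 100000, and a device reporting 440 teams (an assumed value), getLaunchVals proceeds as follows:

  ThreadsPerGroup = DefaultWgSize = 256      // no clause, no flat-attr cap
  NumGroups = (100000 - 1) / 256 + 1 = 391   // SPMD trip-count branch
  391 <= MaxTeams (440)                      // no further capping applies
  Res = { WorkgroupSize = 256, GridSize = 391 * 256 = 100096 }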
| 1355 | |
| 1356 | static uint64_t acquireAvailablePacketId(hsa_queue_t *Queue) { |
| 1357 | uint64_t PacketId = hsa_queue_add_write_index_relaxed(Queue, 1); |
| 1358 | bool Full = true; |
| 1359 | while (Full) { |
| 1360 | Full = |
| 1361 | PacketId >= (Queue->size + hsa_queue_load_read_index_scacquire(Queue)); |
| 1362 | } |
| 1363 | return PacketId; |
| 1364 | } |
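Editor's note (illustrative only): the spin above follows the usual HSA user-mode queue protocol. hsa_queue_add_write_index_relaxed reserves a slot first; the caller then waits until the packet processor's read index has advanced far enough that PacketId < read_index + Queue->size, i.e. the reserved slot (PacketId modulo the queue size) no longer holds an unconsumed packet.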
| 1365 | |
| 1366 | int32_t runRegionLocked(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs, |
| 1367 | ptrdiff_t *TgtOffsets, int32_t ArgNum, int32_t NumTeams, |
| 1368 | int32_t ThreadLimit, uint64_t LoopTripcount) { |
| 1369 | // Set the context we are using |
| 1370 | // update thread limit content in gpu memory if un-initialized or specified |
| 1371 | // from host |
| 1372 | |
| 1373 | DP("Run target team region thread_limit %d\n", ThreadLimit){}; |
| 1374 | |
| 1375 | // All args are references. |
| 1376 | std::vector<void *> Args(ArgNum); |
| 1377 | std::vector<void *> Ptrs(ArgNum); |
| 1378 | |
| 1379 | DP("Arg_num: %d\n", ArgNum){}; |
| 1380 | for (int32_t I = 0; I < ArgNum; ++I) { |
| 1381 | Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); |
| 1382 | Args[I] = &Ptrs[I]; |
| 1383 | DP("Offset base: arg[%d]:" DPxMOD "\n", I, DPxPTR(Ptrs[I])){};
| 1384 | } |
| 1385 | |
| 1386 | KernelTy *KernelInfo = (KernelTy *)TgtEntryPtr; |
| 1387 | |
| 1388 | std::string KernelName = std::string(KernelInfo->Name); |
| 1389 | auto &KernelInfoTable = DeviceInfo().KernelInfoTable; |
| 1390 | if (KernelInfoTable[DeviceId].find(KernelName) == |
| 1391 | KernelInfoTable[DeviceId].end()) { |
| 1392 | DP("Kernel %s not found\n", KernelName.c_str()){}; |
| 1393 | return OFFLOAD_FAIL(~0); |
| 1394 | } |
| 1395 | |
| 1396 | const atl_kernel_info_t KernelInfoEntry = |
| 1397 | KernelInfoTable[DeviceId][KernelName]; |
| 1398 | const uint32_t GroupSegmentSize = |
| 1399 | KernelInfoEntry.group_segment_size + DeviceInfo().Env.DynamicMemSize; |
| 1400 | const uint32_t SgprCount = KernelInfoEntry.sgpr_count; |
| 1401 | const uint32_t VgprCount = KernelInfoEntry.vgpr_count; |
| 1402 | const uint32_t SgprSpillCount = KernelInfoEntry.sgpr_spill_count; |
| 1403 | const uint32_t VgprSpillCount = KernelInfoEntry.vgpr_spill_count; |
| 1404 | |
| 1405 | assert(ArgNum == (int)KernelInfoEntry.explicit_argument_count)(static_cast <bool> (ArgNum == (int)KernelInfoEntry.explicit_argument_count ) ? void (0) : __assert_fail ("ArgNum == (int)KernelInfoEntry.explicit_argument_count" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 1405, __extension__ __PRETTY_FUNCTION__)); |
| 1406 | |
| 1407 | /* |
| 1408 | * Set limit based on ThreadsPerGroup and GroupsPerDevice |
| 1409 | */ |
| 1410 | LaunchVals LV = |
| 1411 | getLaunchVals(DeviceInfo().WarpSize[DeviceId], DeviceInfo().Env, |
| 1412 | KernelInfo->ConstWGSize, KernelInfo->ExecutionMode, |
| 1413 | NumTeams, // From run_region arg |
| 1414 | ThreadLimit, // From run_region arg |
| 1415 | LoopTripcount, // From run_region arg |
| 1416 | DeviceInfo().NumTeams[KernelInfo->DeviceId]); |
| 1417 | const int GridSize = LV.GridSize; |
| 1418 | const int WorkgroupSize = LV.WorkgroupSize; |
| 1419 | |
| 1420 | if (print_kernel_trace >= LAUNCH) { |
| 1421 | int NumGroups = GridSize / WorkgroupSize; |
| 1422 | // enum modes are SPMD, GENERIC, NONE 0,1,2 |
| 1423 | // if doing rtl timing, print to stderr, unless stdout requested. |
| 1424 | bool TraceToStdout = print_kernel_trace & (RTL_TO_STDOUT | RTL_TIMING); |
| 1425 | fprintf(TraceToStdout ? stdout : stderr,
| 1426 | "DEVID:%2d SGN:%1d ConstWGSize:%-4d args:%2d teamsXthrds:(%4dX%4d) " |
| 1427 | "reqd:(%4dX%4d) lds_usage:%uB sgpr_count:%u vgpr_count:%u " |
| 1428 | "sgpr_spill_count:%u vgpr_spill_count:%u tripcount:%lu n:%s\n", |
| 1429 | DeviceId, KernelInfo->ExecutionMode, KernelInfo->ConstWGSize, |
| 1430 | ArgNum, NumGroups, WorkgroupSize, NumTeams, ThreadLimit, |
| 1431 | GroupSegmentSize, SgprCount, VgprCount, SgprSpillCount, |
| 1432 | VgprSpillCount, LoopTripcount, KernelInfo->Name); |
| 1433 | } |
| 1434 | |
| 1435 | // Run on the device. |
| 1436 | { |
| 1437 | hsa_queue_t *Queue = DeviceInfo().HSAQueueSchedulers[DeviceId].next(); |
| 1438 | if (!Queue) { |
| 1439 | return OFFLOAD_FAIL(~0); |
| 1440 | } |
| 1441 | uint64_t PacketId = acquireAvailablePacketId(Queue); |
| 1442 | |
| 1443 | const uint32_t Mask = Queue->size - 1; // size is a power of 2 |
| 1444 | hsa_kernel_dispatch_packet_t *Packet = |
| 1445 | (hsa_kernel_dispatch_packet_t *)Queue->base_address + (PacketId & Mask); |
| 1446 | |
| 1447 | // packet->header is written last |
| 1448 | Packet->setup = UINT16_C(1) << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
| 1449 | Packet->workgroup_size_x = WorkgroupSize; |
| 1450 | Packet->workgroup_size_y = 1; |
| 1451 | Packet->workgroup_size_z = 1; |
| 1452 | Packet->reserved0 = 0; |
| 1453 | Packet->grid_size_x = GridSize; |
| 1454 | Packet->grid_size_y = 1; |
| 1455 | Packet->grid_size_z = 1; |
| 1456 | Packet->private_segment_size = KernelInfoEntry.private_segment_size; |
| 1457 | Packet->group_segment_size = GroupSegmentSize; |
| 1458 | Packet->kernel_object = KernelInfoEntry.kernel_object; |
| 1459 | Packet->kernarg_address = 0; // use the block allocator |
| 1460 | Packet->reserved2 = 0; // impl writes id_ here |
| 1461 | Packet->completion_signal = {0}; // may want a pool of signals |
| 1462 | |
| 1463 | KernelArgPool *ArgPool = nullptr; |
| 1464 | void *KernArg = nullptr; |
| 1465 | { |
| 1466 | auto It = KernelArgPoolMap.find(std::string(KernelInfo->Name)); |
| 1467 | if (It != KernelArgPoolMap.end()) { |
| 1468 | ArgPool = (It->second).get(); |
| 1469 | } |
| 1470 | } |
| 1471 | if (!ArgPool) { |
| 1472 | DP("Warning: No ArgPool for %s on device %d\n", KernelInfo->Name,{} |
| 1473 | DeviceId){}; |
| 1474 | } |
| 1475 | { |
| 1476 | if (ArgPool) { |
| 1477 | assert(ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *)))(static_cast <bool> (ArgPool->KernargSegmentSize == ( ArgNum * sizeof(void *))) ? void (0) : __assert_fail ("ArgPool->KernargSegmentSize == (ArgNum * sizeof(void *))" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 1477, __extension__ __PRETTY_FUNCTION__)); |
| 1478 | KernArg = ArgPool->allocate(ArgNum); |
| 1479 | } |
| 1480 | if (!KernArg) { |
| 1481 | DP("Allocate kernarg failed\n"){}; |
| 1482 | return OFFLOAD_FAIL(~0); |
| 1483 | } |
| 1484 | |
| 1485 | // Copy explicit arguments |
| 1486 | for (int I = 0; I < ArgNum; I++) { |
| 1487 | memcpy((char *)KernArg + sizeof(void *) * I, Args[I], sizeof(void *)); |
| 1488 | } |
| 1489 | |
| 1490 | // Initialize implicit arguments. TODO: Which of these can be dropped |
| 1491 | AMDGPUImplicitArgsTy *ImplArgs = reinterpret_cast<AMDGPUImplicitArgsTy *>( |
| 1492 | static_cast<char *>(KernArg) + ArgPool->KernargSegmentSize); |
| 1493 | memset(ImplArgs, 0, |
| 1494 | sizeof(AMDGPUImplicitArgsTy)); // may not be necessary |
| 1495 | ImplArgs->OffsetX = 0; |
| 1496 | ImplArgs->OffsetY = 0; |
| 1497 | ImplArgs->OffsetZ = 0; |
| 1498 | |
| 1499 | // assign a hostcall buffer for the selected Q |
| 1500 | if (__atomic_load_n(&DeviceInfo().HostcallRequired, __ATOMIC_ACQUIRE2)) { |
| 1501 | // hostrpc_assign_buffer is not thread safe, and this function is |
| 1502 | // under a multiple reader lock, not a writer lock. |
| 1503 | static pthread_mutex_t HostcallInitLock = PTHREAD_MUTEX_INITIALIZER{ { 0, 0, 0, 0, PTHREAD_MUTEX_TIMED_NP, 0, 0, { 0, 0 } } }; |
| 1504 | pthread_mutex_lock(&HostcallInitLock); |
| 1505 | uint64_t Buffer = hostrpc_assign_buffer( |
| 1506 | DeviceInfo().HSAAgents[DeviceId], Queue, DeviceId); |
| 1507 | pthread_mutex_unlock(&HostcallInitLock); |
| 1508 | if (!Buffer) { |
| 1509 | DP("hostrpc_assign_buffer failed, gpu would dereference null and "{} |
| 1510 | "error\n"){}; |
| 1511 | return OFFLOAD_FAIL(~0); |
| 1512 | } |
| 1513 | |
| 1514 | DP("Implicit argument count: %d\n",{} |
| 1515 | KernelInfoEntry.implicit_argument_count){}; |
| 1516 | if (KernelInfoEntry.implicit_argument_count >= 4) { |
| 1517 | // Initialise pointer for implicit_argument_count != 0 ABI |
| 1518 | // Guess that the right implicit argument is at offset 24 after |
| 1519 | // the explicit arguments. In the future, should be able to read |
| 1520 | // the offset from msgpack. Clang is not annotating it at present. |
| 1521 | uint64_t Offset = |
| 1522 | sizeof(void *) * (KernelInfoEntry.explicit_argument_count + 3); |
| 1523 | if ((Offset + 8) > ArgPool->kernargSizeIncludingImplicit()) { |
| 1524 | DP("Bad offset of hostcall: %lu, exceeds kernarg size w/ implicit "{} |
| 1525 | "args: %d\n",{} |
| 1526 | Offset + 8, ArgPool->kernargSizeIncludingImplicit()){}; |
| 1527 | } else { |
| 1528 | memcpy(static_cast<char *>(KernArg) + Offset, &Buffer, 8); |
| 1529 | } |
| 1530 | } |
| 1531 | |
| 1532 | // initialise pointer for implicit_argument_count == 0 ABI |
| 1533 | ImplArgs->HostcallPtr = Buffer; |
| 1534 | } |
| 1535 | |
| 1536 | Packet->kernarg_address = KernArg; |
| 1537 | } |
| 1538 | |
| 1539 | hsa_signal_t S = DeviceInfo().FreeSignalPool.pop(); |
| 1540 | if (S.handle == 0) { |
| 1541 | DP("Failed to get signal instance\n"){}; |
| 1542 | return OFFLOAD_FAIL(~0); |
| 1543 | } |
| 1544 | Packet->completion_signal = S; |
| 1545 | hsa_signal_store_relaxed(Packet->completion_signal, 1); |
| 1546 | |
| 1547 | // Publish the packet indicating it is ready to be processed |
| 1548 | core::packetStoreRelease(reinterpret_cast<uint32_t *>(Packet), |
| 1549 | core::createHeader(), Packet->setup); |
| 1550 | |
| 1551 | // Since the packet is already published, its contents must not be |
| 1552 | // accessed any more |
| 1553 | hsa_signal_store_relaxed(Queue->doorbell_signal, PacketId); |
| 1554 | |
| 1555 | while (hsa_signal_wait_scacquire(S, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX(18446744073709551615UL), |
| 1556 | HSA_WAIT_STATE_BLOCKED) != 0) |
| 1557 | ; |
| 1558 | |
| 1559 | assert(ArgPool)(static_cast <bool> (ArgPool) ? void (0) : __assert_fail ("ArgPool", "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp" , 1559, __extension__ __PRETTY_FUNCTION__)); |
| 1560 | ArgPool->deallocate(KernArg); |
| 1561 | DeviceInfo().FreeSignalPool.push(S); |
| 1562 | } |
| 1563 | |
| 1564 | DP("Kernel completed\n"){}; |
| 1565 | return OFFLOAD_SUCCESS(0); |
| 1566 | } |
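Editor's summary (not upstream code): runRegionLocked follows the standard AQL dispatch sequence: reserve a packet slot, fill in the kernel dispatch packet and the kernarg segment (explicit arguments first, implicit arguments after), publish the packet header with release semantics, ring the doorbell, then block on the completion signal before returning the kernarg block and the signal to their pools.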
| 1567 | |
| 1568 | bool elfMachineIdIsAmdgcn(__tgt_device_image *Image) { |
| 1569 | const uint16_t AmdgcnMachineID = EM_AMDGPU; |
| 1570 | const int32_t R = elf_check_machine(Image, AmdgcnMachineID); |
| 1571 | if (!R) { |
| 1572 | DP("Supported machine ID not found\n"){}; |
| 1573 | } |
| 1574 | return R; |
| 1575 | } |
| 1576 | |
| 1577 | uint32_t elfEFlags(__tgt_device_image *Image) { |
| 1578 | const char *ImgBegin = (char *)Image->ImageStart; |
| 1579 | size_t ImgSize = (char *)Image->ImageEnd - ImgBegin; |
| 1580 | |
| 1581 | StringRef Buffer = StringRef(ImgBegin, ImgSize); |
| 1582 | auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""), |
| 1583 | /*InitContent=*/false); |
| 1584 | if (!ElfOrErr) { |
| 1585 | consumeError(ElfOrErr.takeError()); |
| 1586 | return 0; |
| 1587 | } |
| 1588 | |
| 1589 | if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get())) |
| 1590 | return ELFObj->getPlatformFlags(); |
| 1591 | return 0; |
| 1592 | } |
| 1593 | |
| 1594 | template <typename T> bool enforceUpperBound(T *Value, T Upper) { |
| 1595 | bool Changed = *Value > Upper; |
| 1596 | if (Changed) { |
| 1597 | *Value = Upper; |
| 1598 | } |
| 1599 | return Changed; |
| 1600 | } |
| 1601 | |
| 1602 | struct SymbolInfo { |
| 1603 | const void *Addr = nullptr; |
| 1604 | uint32_t Size = UINT32_MAX(4294967295U); |
| 1605 | uint32_t ShType = SHT_NULL; |
| 1606 | }; |
| 1607 | |
| 1608 | int getSymbolInfoWithoutLoading(const ELFObjectFile<ELF64LE> &ELFObj, |
| 1609 | StringRef SymName, SymbolInfo *Res) { |
| 1610 | auto SymOrErr = getELFSymbol(ELFObj, SymName); |
| 1611 | if (!SymOrErr) { |
| 1612 | std::string ErrorString = toString(SymOrErr.takeError()); |
| 1613 | DP("Failed ELF lookup: %s\n", ErrorString.c_str()){}; |
| 1614 | return 1; |
| 1615 | } |
| 1616 | if (!*SymOrErr) |
| 1617 | return 1; |
| 1618 | |
| 1619 | auto SymSecOrErr = ELFObj.getELFFile().getSection((*SymOrErr)->st_shndx); |
| 1620 | if (!SymSecOrErr) { |
| 1621 | std::string ErrorString = toString(SymOrErr.takeError()); |
| 1622 | DP("Failed ELF lookup: %s\n", ErrorString.c_str()){}; |
| 1623 | return 1; |
| 1624 | } |
| 1625 | |
| 1626 | Res->Addr = (*SymOrErr)->st_value + ELFObj.getELFFile().base(); |
| 1627 | Res->Size = static_cast<uint32_t>((*SymOrErr)->st_size); |
| 1628 | Res->ShType = static_cast<uint32_t>((*SymSecOrErr)->sh_type); |
| 1629 | return 0; |
| 1630 | } |
| 1631 | |
| 1632 | int getSymbolInfoWithoutLoading(char *Base, size_t ImgSize, const char *SymName, |
| 1633 | SymbolInfo *Res) { |
| 1634 | StringRef Buffer = StringRef(Base, ImgSize); |
| 1635 | auto ElfOrErr = ObjectFile::createELFObjectFile(MemoryBufferRef(Buffer, ""), |
| 1636 | /*InitContent=*/false); |
| 1637 | if (!ElfOrErr) { |
| 1638 | REPORT("Failed to load ELF: %s\n", toString(ElfOrErr.takeError()).c_str())do { fprintf(stderr, "AMDGPU" " error: "); fprintf(stderr, "Failed to load ELF: %s\n" , toString(ElfOrErr.takeError()).c_str()); } while (0);; |
| 1639 | return 1; |
| 1640 | } |
| 1641 | |
| 1642 | if (const auto *ELFObj = dyn_cast<ELF64LEObjectFile>(ElfOrErr->get())) |
| 1643 | return getSymbolInfoWithoutLoading(*ELFObj, SymName, Res); |
| 1644 | return 1; |
| 1645 | } |
| 1646 | |
| 1647 | hsa_status_t interopGetSymbolInfo(char *Base, size_t ImgSize, |
| 1648 | const char *SymName, const void **VarAddr, |
| 1649 | uint32_t *VarSize) { |
| 1650 | SymbolInfo SI; |
| 1651 | int Rc = getSymbolInfoWithoutLoading(Base, ImgSize, SymName, &SI); |
| 1652 | if (Rc == 0) { |
| 1653 | *VarAddr = SI.Addr; |
| 1654 | *VarSize = SI.Size; |
| 1655 | return HSA_STATUS_SUCCESS; |
| 1656 | } |
| 1657 | return HSA_STATUS_ERROR; |
| 1658 | } |
| 1659 | |
| 1660 | template <typename C> |
| 1661 | hsa_status_t moduleRegisterFromMemoryToPlace( |
| 1662 | std::map<std::string, atl_kernel_info_t> &KernelInfoTable, |
| 1663 | std::map<std::string, atl_symbol_info_t> &SymbolInfoTable, |
| 1664 | void *ModuleBytes, size_t ModuleSize, int DeviceId, C Cb, |
| 1665 | std::vector<hsa_executable_t> &HSAExecutables) { |
| 1666 | auto L = [](void *Data, size_t Size, void *CbState) -> hsa_status_t { |
| 1667 | C *Unwrapped = static_cast<C *>(CbState); |
| 1668 | return (*Unwrapped)(Data, Size); |
| 1669 | }; |
| 1670 | return core::RegisterModuleFromMemory( |
| 1671 | KernelInfoTable, SymbolInfoTable, ModuleBytes, ModuleSize, |
| 1672 | DeviceInfo().HSAAgents[DeviceId], L, static_cast<void *>(&Cb), |
| 1673 | HSAExecutables); |
| 1674 | } |
| 1675 | |
| 1676 | uint64_t getDeviceStateBytes(char *ImageStart, size_t ImgSize) { |
| 1677 | uint64_t DeviceStateBytes = 0; |
| 1678 | { |
| 1679 | // If this is the deviceRTL, get the state variable size |
| 1680 | SymbolInfo SizeSi; |
| 1681 | int Rc = getSymbolInfoWithoutLoading( |
| 1682 | ImageStart, ImgSize, "omptarget_nvptx_device_State_size", &SizeSi); |
| 1683 | |
| 1684 | if (Rc == 0) { |
| 1685 | if (SizeSi.Size != sizeof(uint64_t)) { |
| 1686 | DP("Found device_State_size variable with wrong size\n"){}; |
| 1687 | return 0; |
| 1688 | } |
| 1689 | |
| 1690 | // Read number of bytes directly from the elf |
| 1691 | memcpy(&DeviceStateBytes, SizeSi.Addr, sizeof(uint64_t)); |
| 1692 | } |
| 1693 | } |
| 1694 | return DeviceStateBytes; |
| 1695 | } |
| 1696 | |
| 1697 | struct DeviceEnvironment { |
| 1698 | // initialise a DeviceEnvironmentTy in the deviceRTL
| 1699 | // patches around differences in the deviceRTL between trunk, aomp,
| 1700 | // rocmcc. Over time these differences will tend to zero and this class
| 1701 | // can be simplified.
| 1702 | // Symbol may be in .data or .bss, and may be missing fields. TODO:
| 1703 | // review aomp/trunk/rocm and simplify the following
| 1704 | |
| 1705 | // The symbol may also have been deadstripped because the device side |
| 1706 | // accessors were unused. |
| 1707 | |
| 1708 | // If the symbol is in .data (aomp, rocm) it can be written directly. |
| 1709 | // If it is in .bss, we must wait for it to be allocated space on the |
| 1710 | // gpu (trunk) and initialize after loading. |
| 1711 | const char *sym() { return "__omp_rtl_device_environment"; } |
| 1712 | |
| 1713 | DeviceEnvironmentTy HostDeviceEnv; |
| 1714 | SymbolInfo SI; |
| 1715 | bool Valid = false; |
| 1716 | |
| 1717 | __tgt_device_image *Image; |
| 1718 | const size_t ImgSize; |
| 1719 | |
| 1720 | DeviceEnvironment(int DeviceId, int NumberDevices, int DynamicMemSize, |
| 1721 | __tgt_device_image *Image, const size_t ImgSize) |
| 1722 | : Image(Image), ImgSize(ImgSize) { |
| 1723 | |
| 1724 | HostDeviceEnv.NumDevices = NumberDevices; |
| 1725 | HostDeviceEnv.DeviceNum = DeviceId; |
| 1726 | HostDeviceEnv.DebugKind = 0; |
| 1727 | HostDeviceEnv.DynamicMemSize = DynamicMemSize; |
| 1728 | if (char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) |
| 1729 | HostDeviceEnv.DebugKind = std::stoi(EnvStr); |
| 1730 | |
| 1731 | int Rc = getSymbolInfoWithoutLoading((char *)Image->ImageStart, ImgSize, |
| 1732 | sym(), &SI); |
| 1733 | if (Rc != 0) { |
| 1734 | DP("Finding global device environment '%s' - symbol missing.\n", sym()){}; |
| 1735 | return; |
| 1736 | } |
| 1737 | |
| 1738 | if (SI.Size > sizeof(HostDeviceEnv)) { |
| 1739 | DP("Symbol '%s' has size %u, expected at most %zu.\n", sym(), SI.Size,{} |
| 1740 | sizeof(HostDeviceEnv)){}; |
| 1741 | return; |
| 1742 | } |
| 1743 | |
| 1744 | Valid = true; |
| 1745 | } |
| 1746 | |
| 1747 | bool inImage() { return SI.ShType != SHT_NOBITS; } |
| 1748 | |
| 1749 | hsa_status_t beforeLoading(void *Data, size_t Size) { |
| 1750 | if (Valid) { |
| 1751 | if (inImage()) { |
| 1752 | DP("Setting global device environment before load (%u bytes)\n",{} |
| 1753 | SI.Size){}; |
| 1754 | uint64_t Offset = reinterpret_cast<const char *>(SI.Addr) - |
| 1755 | reinterpret_cast<const char *>(Image->ImageStart); |
| 1756 | void *Pos = reinterpret_cast<char *>(Data) + Offset; |
| 1757 | memcpy(Pos, &HostDeviceEnv, SI.Size); |
| 1758 | } |
| 1759 | } |
| 1760 | return HSA_STATUS_SUCCESS; |
| 1761 | } |
| 1762 | |
| 1763 | hsa_status_t afterLoading() { |
| 1764 | if (Valid) { |
| 1765 | if (!inImage()) { |
| 1766 | DP("Setting global device environment after load (%u bytes)\n",{} |
| 1767 | SI.Size){}; |
| 1768 | int DeviceId = HostDeviceEnv.DeviceNum; |
| 1769 | auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId]; |
| 1770 | void *StatePtr; |
| 1771 | uint32_t StatePtrSize; |
| 1772 | hsa_status_t Err = interop_hsa_get_symbol_info( |
| 1773 | SymbolInfo, DeviceId, sym(), &StatePtr, &StatePtrSize); |
| 1774 | if (Err != HSA_STATUS_SUCCESS) { |
| 1775 | DP("failed to find %s in loaded image\n", sym()){}; |
| 1776 | return Err; |
| 1777 | } |
| 1778 | |
| 1779 | if (StatePtrSize != SI.Size) { |
| 1780 | DP("Symbol had size %u before loading, %u after\n", StatePtrSize,{} |
| 1781 | SI.Size){}; |
| 1782 | return HSA_STATUS_ERROR; |
| 1783 | } |
| 1784 | |
| 1785 | return DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &HostDeviceEnv, |
| 1786 | StatePtrSize, DeviceId); |
| 1787 | } |
| 1788 | } |
| 1789 | return HSA_STATUS_SUCCESS; |
| 1790 | } |
| 1791 | }; |
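Editor's sketch (not upstream code): the intended use of DeviceEnvironment, as seen in __tgt_rtl_load_binary_locked further below, is roughly:

  DeviceEnvironment Env(DeviceId, NumDevices, DynamicMemSize, Image, ImgSize);
  moduleRegisterFromMemoryToPlace(..., /*Cb=*/[&](void *Data, size_t Size) {
    return Env.beforeLoading(Data, Size);   // patch the .data copy pre-load
  }, ...);
  hsa_status_t Err = Env.afterLoading();    // or memcpy into .bss post-load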
| 1792 | |
| 1793 | hsa_status_t implCalloc(void **RetPtr, size_t Size, int DeviceId) { |
| 1794 | uint64_t Rounded = 4 * ((Size + 3) / 4); |
| 1795 | void *Ptr; |
| 1796 | hsa_amd_memory_pool_t MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId); |
| 1797 | hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Rounded, 0, &Ptr); |
| 1798 | if (Err != HSA_STATUS_SUCCESS) { |
| 1799 | return Err; |
| 1800 | } |
| 1801 | |
| 1802 | hsa_status_t Rc = hsa_amd_memory_fill(Ptr, 0, Rounded / 4); |
| 1803 | if (Rc != HSA_STATUS_SUCCESS) { |
| 1804 | DP("zero fill device_state failed with %u\n", Rc){}; |
| 1805 | core::Runtime::Memfree(Ptr); |
| 1806 | return HSA_STATUS_ERROR; |
| 1807 | } |
| 1808 | |
| 1809 | *RetPtr = Ptr; |
| 1810 | return HSA_STATUS_SUCCESS; |
| 1811 | } |
| 1812 | |
| 1813 | bool imageContainsSymbol(void *Data, size_t Size, const char *Sym) { |
| 1814 | SymbolInfo SI; |
| 1815 | int Rc = getSymbolInfoWithoutLoading((char *)Data, Size, Sym, &SI); |
| 1816 | return (Rc == 0) && (SI.Addr != nullptr); |
| 1817 | } |
| 1818 | |
| 1819 | hsa_status_t lock_memory(void *HostPtr, size_t Size, hsa_agent_t Agent, |
| 1820 | void **LockedHostPtr) { |
| 1821 | hsa_status_t err = is_locked(HostPtr, LockedHostPtr); |
| 1822 | if (err != HSA_STATUS_SUCCESS) |
| 1823 | return err; |
| 1824 | |
| 1825 | // HostPtr is already locked, just return it |
| 1826 | if (*LockedHostPtr) |
| 1827 | return HSA_STATUS_SUCCESS; |
| 1828 | |
| 1829 | hsa_agent_t Agents[1] = {Agent}; |
| 1830 | return hsa_amd_memory_lock(HostPtr, Size, Agents, /*num_agent=*/1, |
| 1831 | LockedHostPtr); |
| 1832 | } |
| 1833 | |
| 1834 | hsa_status_t unlock_memory(void *HostPtr) { |
| 1835 | void *LockedHostPtr = nullptr; |
| 1836 | hsa_status_t err = is_locked(HostPtr, &LockedHostPtr); |
| 1837 | if (err != HSA_STATUS_SUCCESS) |
| 1838 | return err; |
| 1839 | |
| 1840 | // if LockedHostPtr is nullptr, then HostPtr was not locked |
| 1841 | if (!LockedHostPtr) |
| 1842 | return HSA_STATUS_SUCCESS; |
| 1843 | |
| 1844 | err = hsa_amd_memory_unlock(HostPtr); |
| 1845 | return err; |
| 1846 | } |
| 1847 | |
| 1848 | } // namespace |
| 1849 | |
| 1850 | namespace core { |
| 1851 | hsa_status_t allow_access_to_all_gpu_agents(void *Ptr) { |
| 1852 | return hsa_amd_agents_allow_access(DeviceInfo().HSAAgents.size(), |
| 1853 | &DeviceInfo().HSAAgents[0], NULL__null, Ptr); |
| 1854 | } |
| 1855 | } // namespace core |
| 1856 | |
| 1857 | static hsa_status_t GetIsaInfo(hsa_isa_t isa, void *data) { |
| 1858 | hsa_status_t err; |
| 1859 | uint32_t name_len; |
| 1860 | err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME_LENGTH, &name_len); |
| 1861 | if (err != HSA_STATUS_SUCCESS) { |
| 1862 | DP("Error getting ISA info length\n"){}; |
| 1863 | return err; |
| 1864 | } |
| 1865 | |
| 1866 | char TargetID[name_len]; |
| 1867 | err = hsa_isa_get_info_alt(isa, HSA_ISA_INFO_NAME, TargetID); |
| 1868 | if (err != HSA_STATUS_SUCCESS) { |
| 1869 | DP("Error getting ISA info name\n"){}; |
| 1870 | return err; |
| 1871 | } |
| 1872 | |
| 1873 | auto TripleTargetID = llvm::StringRef(TargetID); |
| 1874 | if (TripleTargetID.consume_front("amdgcn-amd-amdhsa")) { |
| 1875 | DeviceInfo().TargetID.push_back(TripleTargetID.ltrim('-').str()); |
| 1876 | } |
| 1877 | return HSA_STATUS_SUCCESS; |
| 1878 | } |
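Editor's note (illustrative only): the ISA name reported by HSA is a full target triple plus processor and feature string, e.g. "amdgcn-amd-amdhsa--gfx90a:xnack-" (an example value). After the triple prefix and leading dashes are consumed, only the "gfx90a:xnack-" style target ID is stored, which is what isImageCompatibleWithEnv later compares against the image's Arch string.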
| 1879 | |
| 1880 | extern "C" { |
| 1881 | int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image) { |
| 1882 | return elfMachineIdIsAmdgcn(Image); |
| 1883 | } |
| 1884 | |
| 1885 | int32_t __tgt_rtl_is_valid_binary_info(__tgt_device_image *image, |
| 1886 | __tgt_image_info *info) { |
| 1887 | if (!__tgt_rtl_is_valid_binary(image)) |
| 1888 | return false; |
| 1889 | |
| 1890 | // A subarchitecture was not specified. Assume it is compatible. |
| 1891 | if (!info->Arch) |
| 1892 | return true; |
| 1893 | |
| 1894 | int32_t NumberOfDevices = __tgt_rtl_number_of_devices(); |
| 1895 | |
| 1896 | for (int32_t DeviceId = 0; DeviceId < NumberOfDevices; ++DeviceId) { |
| 1897 | __tgt_rtl_init_device(DeviceId); |
| 1898 | hsa_agent_t agent = DeviceInfo().HSAAgents[DeviceId]; |
| 1899 | hsa_status_t err = hsa_agent_iterate_isas(agent, GetIsaInfo, &DeviceId); |
| 1900 | if (err != HSA_STATUS_SUCCESS) { |
| 1901 | DP("Error iterating ISAs\n"){}; |
| 1902 | return false; |
| 1903 | } |
| 1904 | if (!isImageCompatibleWithEnv(info, DeviceInfo().TargetID[DeviceId])) |
| 1905 | return false; |
| 1906 | } |
| 1907 | DP("Image has Target ID compatible with the current environment: %s\n",{} |
| 1908 | info->Arch){}; |
| 1909 | return true; |
| 1910 | } |
| 1911 | |
| 1912 | int32_t __tgt_rtl_init_plugin() { return OFFLOAD_SUCCESS(0); } |
| 1913 | int32_t __tgt_rtl_deinit_plugin() { return OFFLOAD_SUCCESS(0); } |
| 1914 | |
| 1915 | int __tgt_rtl_number_of_devices() { |
| 1916 | // If the construction failed, no methods are safe to call |
| 1917 | if (DeviceInfo().ConstructionSucceeded) { |
| 1918 | return DeviceInfo().NumberOfDevices; |
| 1919 | } |
| 1920 | DP("AMDGPU plugin construction failed. Zero devices available\n"){}; |
| 1921 | return 0; |
| 1922 | } |
| 1923 | |
| 1924 | int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { |
| 1925 | DP("Init requires flags to %ld\n", RequiresFlags){}; |
| 1926 | DeviceInfo().RequiresFlags = RequiresFlags; |
| 1927 | return RequiresFlags; |
| 1928 | } |
| 1929 | |
| 1930 | int32_t __tgt_rtl_init_device(int DeviceId) { |
| 1931 | hsa_status_t Err = hsa_init(); |
| 1932 | if (Err != HSA_STATUS_SUCCESS) { |
| 1933 | DP("HSA Initialization Failed.\n"){}; |
| 1934 | return HSA_STATUS_ERROR; |
| 1935 | } |
| 1936 | // this is per device id init |
| 1937 | DP("Initialize the device id: %d\n", DeviceId){}; |
| 1938 | |
| 1939 | hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId]; |
| 1940 | |
| 1941 | // Get number of Compute Units
| 1942 | uint32_t ComputeUnits = 0; |
| 1943 | Err = hsa_agent_get_info( |
| 1944 | Agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT, |
| 1945 | &ComputeUnits); |
| 1946 | if (Err != HSA_STATUS_SUCCESS) { |
| 1947 | DeviceInfo().ComputeUnits[DeviceId] = 1; |
| 1948 | DP("Error getting compute units: setting to 1\n"){};
| 1949 | } else { |
| 1950 | DeviceInfo().ComputeUnits[DeviceId] = ComputeUnits; |
| 1951 | DP("Using %d compute units per grid\n", DeviceInfo().ComputeUnits[DeviceId]){};
| 1952 | } |
| 1953 | |
| 1954 | char GetInfoName[64]; // 64 max size returned by get info |
| 1955 | Err = hsa_agent_get_info(Agent, (hsa_agent_info_t)HSA_AGENT_INFO_NAME, |
| 1956 | (void *)GetInfoName); |
| 1957 | if (Err) |
| 1958 | DeviceInfo().GPUName[DeviceId] = "--unknown gpu--"; |
| 1959 | else { |
| 1960 | DeviceInfo().GPUName[DeviceId] = GetInfoName; |
| 1961 | } |
| 1962 | |
| 1963 | if (print_kernel_trace & STARTUP_DETAILS) |
| 1964 | DP("Device#%-2d CU's: %2d %s\n", DeviceId,{} |
| 1965 | DeviceInfo().ComputeUnits[DeviceId],{} |
| 1966 | DeviceInfo().GPUName[DeviceId].c_str()){}; |
| 1967 | |
| 1968 | // Query attributes to determine number of threads/block and blocks/grid. |
| 1969 | uint16_t WorkgroupMaxDim[3]; |
| 1970 | Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_WORKGROUP_MAX_DIM, |
| 1971 | &WorkgroupMaxDim); |
| 1972 | if (Err != HSA_STATUS_SUCCESS) { |
| 1973 | DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::DefaultNumTeams; |
| 1974 | DP("Error getting grid dims: num groups : %d\n",{} |
| 1975 | RTLDeviceInfoTy::DefaultNumTeams){}; |
| 1976 | } else if (WorkgroupMaxDim[0] <= RTLDeviceInfoTy::HardTeamLimit) { |
| 1977 | DeviceInfo().GroupsPerDevice[DeviceId] = WorkgroupMaxDim[0]; |
| 1978 | DP("Using %d ROCm blocks per grid\n",{} |
| 1979 | DeviceInfo().GroupsPerDevice[DeviceId]){}; |
| 1980 | } else { |
| 1981 | DeviceInfo().GroupsPerDevice[DeviceId] = RTLDeviceInfoTy::HardTeamLimit; |
| 1982 | DP("Max ROCm blocks per grid %d exceeds the hard team limit %d, capping "{} |
| 1983 | "at the hard limit\n",{} |
| 1984 | WorkgroupMaxDim[0], RTLDeviceInfoTy::HardTeamLimit){}; |
| 1985 | } |
| 1986 | |
| 1987 | // Get thread limit |
| 1988 | hsa_dim3_t GridMaxDim; |
| 1989 | Err = hsa_agent_get_info(Agent, HSA_AGENT_INFO_GRID_MAX_DIM, &GridMaxDim); |
| 1990 | if (Err == HSA_STATUS_SUCCESS) { |
| 1991 | DeviceInfo().ThreadsPerGroup[DeviceId] = |
| 1992 | reinterpret_cast<uint32_t *>(&GridMaxDim)[0] / |
| 1993 | DeviceInfo().GroupsPerDevice[DeviceId]; |
| 1994 | |
| 1995 | if (DeviceInfo().ThreadsPerGroup[DeviceId] == 0) { |
| 1996 | DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize; |
| 1997 | DP("Default thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize){}; |
| 1998 | } else if (enforceUpperBound(&DeviceInfo().ThreadsPerGroup[DeviceId], |
| 1999 | RTLDeviceInfoTy::MaxWgSize)) { |
| 2000 | DP("Capped thread limit: %d\n", RTLDeviceInfoTy::MaxWgSize){}; |
| 2001 | } else { |
| 2002 | DP("Using ROCm Queried thread limit: %d\n",{} |
| 2003 | DeviceInfo().ThreadsPerGroup[DeviceId]){}; |
| 2004 | } |
| 2005 | } else { |
| 2006 | DeviceInfo().ThreadsPerGroup[DeviceId] = RTLDeviceInfoTy::MaxWgSize; |
| 2007 | DP("Error getting max block dimension, use default:%d \n",{} |
| 2008 | RTLDeviceInfoTy::MaxWgSize){}; |
| 2009 | } |
| 2010 | |
| 2011 | // Get wavefront size |
| 2012 | uint32_t WavefrontSize = 0; |
| 2013 | Err = |
| 2014 | hsa_agent_get_info(Agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &WavefrontSize); |
| 2015 | if (Err == HSA_STATUS_SUCCESS) { |
| 2016 | DP("Queried wavefront size: %d\n", WavefrontSize){}; |
| 2017 | DeviceInfo().WarpSize[DeviceId] = WavefrontSize; |
| 2018 | } else { |
| 2019 | // TODO: Burn the wavefront size into the code object |
| 2020 | DP("Warning: Unknown wavefront size, assuming 64\n"){}; |
| 2021 | DeviceInfo().WarpSize[DeviceId] = 64; |
| 2022 | } |
| 2023 | |
| 2024 | // Adjust teams to the env variables |
| 2025 | |
| 2026 | if (DeviceInfo().Env.TeamLimit > 0 && |
| 2027 | (enforceUpperBound(&DeviceInfo().GroupsPerDevice[DeviceId], |
| 2028 | DeviceInfo().Env.TeamLimit))) { |
| 2029 | DP("Capping max groups per device to OMP_TEAM_LIMIT=%d\n",{} |
| 2030 | DeviceInfo().Env.TeamLimit){}; |
| 2031 | } |
| 2032 | |
| 2033 | // Set default number of teams |
| 2034 | if (DeviceInfo().Env.NumTeams > 0) { |
| 2035 | DeviceInfo().NumTeams[DeviceId] = DeviceInfo().Env.NumTeams; |
| 2036 | DP("Default number of teams set according to environment %d\n",{} |
| 2037 | DeviceInfo().Env.NumTeams){}; |
| 2038 | } else { |
| 2039 | char *TeamsPerCUEnvStr = getenv("OMP_TARGET_TEAMS_PER_PROC"); |
| 2040 | int TeamsPerCU = DefaultTeamsPerCU; |
| 2041 | if (TeamsPerCUEnvStr) { |
| 2042 | TeamsPerCU = std::stoi(TeamsPerCUEnvStr); |
| 2043 | } |
| 2044 | |
| 2045 | DeviceInfo().NumTeams[DeviceId] = |
| 2046 | TeamsPerCU * DeviceInfo().ComputeUnits[DeviceId]; |
| 2047 | DP("Default number of teams = %d * number of compute units %d\n",{} |
| 2048 | TeamsPerCU, DeviceInfo().ComputeUnits[DeviceId]){}; |
| 2049 | } |
| 2050 | |
| 2051 | if (enforceUpperBound(&DeviceInfo().NumTeams[DeviceId], |
| 2052 | DeviceInfo().GroupsPerDevice[DeviceId])) { |
| 2053 | DP("Default number of teams exceeds device limit, capping at %d\n",{} |
| 2054 | DeviceInfo().GroupsPerDevice[DeviceId]){}; |
| 2055 | } |
| 2056 | |
| 2057 | // Adjust threads to the env variables |
| 2058 | if (DeviceInfo().Env.TeamThreadLimit > 0 && |
| 2059 | (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId], |
| 2060 | DeviceInfo().Env.TeamThreadLimit))) { |
| 2061 | DP("Capping max number of threads to OMP_TEAMS_THREAD_LIMIT=%d\n",{} |
| 2062 | DeviceInfo().Env.TeamThreadLimit){}; |
| 2063 | } |
| 2064 | |
| 2065 | // Set default number of threads |
| 2066 | DeviceInfo().NumThreads[DeviceId] = RTLDeviceInfoTy::DefaultWgSize; |
| 2067 | DP("Default number of threads set according to library's default %d\n",{} |
| 2068 | RTLDeviceInfoTy::DefaultWgSize){}; |
| 2069 | if (enforceUpperBound(&DeviceInfo().NumThreads[DeviceId], |
| 2070 | DeviceInfo().ThreadsPerGroup[DeviceId])) { |
| 2071 | DP("Default number of threads exceeds device limit, capping at %d\n",{} |
| 2072 | DeviceInfo().ThreadsPerGroup[DeviceId]){}; |
| 2073 | } |
| 2074 | |
| 2075 | DP("Device %d: default limit for groupsPerDevice %d & threadsPerGroup %d\n",{} |
| 2076 | DeviceId, DeviceInfo().GroupsPerDevice[DeviceId],{} |
| 2077 | DeviceInfo().ThreadsPerGroup[DeviceId]){}; |
| 2078 | |
| 2079 | DP("Device %d: wavefront size %d, total threads %d x %d = %d\n", DeviceId,{} |
| 2080 | DeviceInfo().WarpSize[DeviceId], DeviceInfo().ThreadsPerGroup[DeviceId],{} |
| 2081 | DeviceInfo().GroupsPerDevice[DeviceId],{} |
| 2082 | DeviceInfo().GroupsPerDevice[DeviceId] *{} |
| 2083 | DeviceInfo().ThreadsPerGroup[DeviceId]){}; |
| 2084 | |
| 2085 | return OFFLOAD_SUCCESS(0); |
| 2086 | } |
| 2087 | |
| 2088 | static __tgt_target_table * |
| 2089 | __tgt_rtl_load_binary_locked(int32_t DeviceId, __tgt_device_image *Image); |
| 2090 | |
| 2091 | __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, |
| 2092 | __tgt_device_image *Image) { |
| 2093 | DeviceInfo().LoadRunLock.lock(); |
| 2094 | __tgt_target_table *Res = __tgt_rtl_load_binary_locked(DeviceId, Image); |
| 2095 | DeviceInfo().LoadRunLock.unlock(); |
| 2096 | return Res; |
| 2097 | } |
| 2098 | |
| 2099 | __tgt_target_table *__tgt_rtl_load_binary_locked(int32_t DeviceId, |
| 2100 | __tgt_device_image *Image) { |
| 2101 | // This function loads the device image onto gpu[DeviceId] and does other |
| 2102 | // per-image initialization work. Specifically: |
| 2103 | // |
| 2104 | // - Initialize a DeviceEnvironmentTy instance embedded in the
| 2105 | // image at the symbol "__omp_rtl_device_environment" |
| 2106 | // Fields DebugKind, DeviceNum, NumDevices. Used by the deviceRTL. |
| 2107 | // |
| 2108 | // - Allocate a large array per-gpu (could be moved to init_device) |
| 2109 | // - Read a uint64_t at symbol omptarget_nvptx_device_State_size |
| 2110 | // - Allocate at least that many bytes of gpu memory |
| 2111 | // - Zero initialize it |
| 2112 | // - Write the pointer to the symbol omptarget_nvptx_device_State |
| 2113 | // |
| 2114 | // - Pulls some per-kernel information together from various sources and |
| 2115 | // records it in the KernelsList for quicker access later |
| 2116 | // |
| 2117 | // The initialization can be done before or after loading the image onto the |
| 2118 | // gpu. This function presently does a mixture. Using the hsa api to get/set |
| 2119 | // the information is simpler to implement, in exchange for more complicated |
| 2120 | // runtime behaviour. E.g. launching a kernel or using dma to get eight bytes |
| 2121 | // back from the gpu vs a hashtable lookup on the host. |
| 2122 | |
| 2123 | const size_t ImgSize = (char *)Image->ImageEnd - (char *)Image->ImageStart; |
| 2124 | |
| 2125 | DeviceInfo().clearOffloadEntriesTable(DeviceId); |
| 2126 | |
| 2127 | // We do not need to set the ELF version because the caller of this function |
| 2128 | // had to do that to decide the right runtime to use |
| 2129 | |
| 2130 | if (!elfMachineIdIsAmdgcn(Image)) |
| 2131 | return NULL__null; |
| 2132 | |
| 2133 | { |
| 2134 | auto Env = |
| 2135 | DeviceEnvironment(DeviceId, DeviceInfo().NumberOfDevices, |
| 2136 | DeviceInfo().Env.DynamicMemSize, Image, ImgSize); |
| 2137 | |
| 2138 | auto &KernelInfo = DeviceInfo().KernelInfoTable[DeviceId]; |
| 2139 | auto &SymbolInfo = DeviceInfo().SymbolInfoTable[DeviceId]; |
| 2140 | hsa_status_t Err = moduleRegisterFromMemoryToPlace( |
| 2141 | KernelInfo, SymbolInfo, (void *)Image->ImageStart, ImgSize, DeviceId, |
| 2142 | [&](void *Data, size_t Size) { |
| 2143 | if (imageContainsSymbol(Data, Size, "needs_hostcall_buffer")) { |
| 2144 | __atomic_store_n(&DeviceInfo().HostcallRequired, true, |
| 2145 | __ATOMIC_RELEASE3); |
| 2146 | } |
| 2147 | return Env.beforeLoading(Data, Size); |
| 2148 | }, |
| 2149 | DeviceInfo().HSAExecutables); |
| 2150 | |
| 2151 | check("Module registering", Err){}; |
| 2152 | if (Err != HSA_STATUS_SUCCESS) { |
| 2153 | const char *DeviceName = DeviceInfo().GPUName[DeviceId].c_str(); |
| 2154 | const char *ElfName = get_elf_mach_gfx_name(elfEFlags(Image)); |
| 2155 | |
| 2156 | if (strcmp(DeviceName, ElfName) != 0) { |
| 2157 | DP("Possible gpu arch mismatch: device:%s, image:%s please check"{} |
| 2158 | " compiler flag: -march=<gpu>\n",{} |
| 2159 | DeviceName, ElfName){}; |
| 2160 | } else { |
| 2161 | DP("Error loading image onto GPU: %s\n", get_error_string(Err)){}; |
| 2162 | } |
| 2163 | |
| 2164 | return NULL__null; |
| 2165 | } |
| 2166 | |
| 2167 | Err = Env.afterLoading(); |
| 2168 | if (Err != HSA_STATUS_SUCCESS) { |
| 2169 | return NULL__null; |
| 2170 | } |
| 2171 | } |
| 2172 | |
| 2173 | DP("AMDGPU module successfully loaded!\n"){}; |
| 2174 | |
| 2175 | { |
| 2176 | // The device_State array is either a large array in .bss or a void* that
| 2177 | // needs to be assigned to point at an array of size device_state_bytes.
| 2178 | // If absent, it has been deadstripped and needs no setup.
| 2179 | |
| 2180 | void *StatePtr; |
| 2181 | uint32_t StatePtrSize; |
| 2182 | auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId]; |
| 2183 | hsa_status_t Err = interop_hsa_get_symbol_info( |
| 2184 | SymbolInfoMap, DeviceId, "omptarget_nvptx_device_State", &StatePtr, |
| 2185 | &StatePtrSize); |
| 2186 | |
| 2187 | if (Err != HSA_STATUS_SUCCESS) { |
| 2188 | DP("No device_state symbol found, skipping initialization\n"){}; |
| 2189 | } else { |
| 2190 | if (StatePtrSize < sizeof(void *)) { |
| 2191 | DP("unexpected size of state_ptr %u != %zu\n", StatePtrSize,{} |
| 2192 | sizeof(void *)){}; |
| 2193 | return NULL__null; |
| 2194 | } |
| 2195 | |
| 2196 | // if it's larger than a void*, assume it's a bss array and no further |
| 2197 | // initialization is required. Only try to set up a pointer for |
| 2198 | // sizeof(void*) |
| 2199 | if (StatePtrSize == sizeof(void *)) { |
| 2200 | uint64_t DeviceStateBytes = |
| 2201 | getDeviceStateBytes((char *)Image->ImageStart, ImgSize); |
| 2202 | if (DeviceStateBytes == 0) { |
| 2203 | DP("Can't initialize device_State, missing size information\n"){}; |
| 2204 | return NULL__null; |
| 2205 | } |
| 2206 | |
| 2207 | auto &DSS = DeviceInfo().DeviceStateStore[DeviceId]; |
| 2208 | if (DSS.first.get() == nullptr) { |
| 2209 | assert(DSS.second == 0)(static_cast <bool> (DSS.second == 0) ? void (0) : __assert_fail ("DSS.second == 0", "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp" , 2209, __extension__ __PRETTY_FUNCTION__)); |
| 2210 | void *Ptr = NULL__null; |
| 2211 | hsa_status_t Err = implCalloc(&Ptr, DeviceStateBytes, DeviceId); |
| 2212 | if (Err != HSA_STATUS_SUCCESS) { |
| 2213 | DP("Failed to allocate device_state array\n"){}; |
| 2214 | return NULL__null; |
| 2215 | } |
| 2216 | DSS = { |
| 2217 | std::unique_ptr<void, RTLDeviceInfoTy::ImplFreePtrDeletor>{Ptr}, |
| 2218 | DeviceStateBytes, |
| 2219 | }; |
| 2220 | } |
| 2221 | |
| 2222 | void *Ptr = DSS.first.get(); |
| 2223 | if (DeviceStateBytes != DSS.second) { |
| 2224 | DP("Inconsistent sizes of device_State unsupported\n"){}; |
| 2225 | return NULL__null; |
| 2226 | } |
| 2227 | |
| 2228 | // write ptr to device memory so it can be used by later kernels |
| 2229 | Err = DeviceInfo().freesignalpoolMemcpyH2D(StatePtr, &Ptr, |
| 2230 | sizeof(void *), DeviceId); |
| 2231 | if (Err != HSA_STATUS_SUCCESS) { |
| 2232 | DP("memcpy install of state_ptr failed\n"){}; |
| 2233 | return NULL__null; |
| 2234 | } |
| 2235 | } |
| 2236 | } |
| 2237 | } |
| 2238 | |
| 2239 | // Here, we take advantage of the data that is appended after img_end to get
| 2240 | // the names of the symbols we need to load. This data consists of the host
| 2241 | // entries begin and end as well as the target name (see the offloading
| 2242 | // linker script creation in the clang compiler).
| 2243 | |
| 2244 | // Find the symbols in the module by name. The name can be obtained by
| 2245 | // concatenating the host entry name with the target name.
| 2246 | |
| 2247 | __tgt_offload_entry *HostBegin = Image->EntriesBegin; |
| 2248 | __tgt_offload_entry *HostEnd = Image->EntriesEnd; |
| 2249 | |
| 2250 | for (__tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { |
| 2251 | |
| 2252 | if (!E->addr) { |
| 2253 | // The host should always have something in the address to
| 2254 | // uniquely identify the target region.
| 2255 | DP("Analyzing host entry '<null>' (size = %lld)...\n",{} |
| 2256 | (unsigned long long)E->size){}; |
| 2257 | return NULL__null; |
| 2258 | } |
| 2259 | |
| 2260 | if (E->size) { |
| 2261 | __tgt_offload_entry Entry = *E; |
| 2262 | |
| 2263 | void *Varptr; |
| 2264 | uint32_t Varsize; |
| 2265 | |
| 2266 | auto &SymbolInfoMap = DeviceInfo().SymbolInfoTable[DeviceId]; |
| 2267 | hsa_status_t Err = interop_hsa_get_symbol_info( |
| 2268 | SymbolInfoMap, DeviceId, E->name, &Varptr, &Varsize); |
| 2269 | |
| 2270 | if (Err != HSA_STATUS_SUCCESS) { |
| 2271 | // Inform the user what symbol prevented offloading |
| 2272 | DP("Loading global '%s' (Failed)\n", E->name){}; |
| 2273 | return NULL__null; |
| 2274 | } |
| 2275 | |
| 2276 | if (Varsize != E->size) { |
| 2277 | DP("Loading global '%s' - size mismatch (%u != %lu)\n", E->name,{} |
| 2278 | Varsize, E->size){}; |
| 2279 | return NULL__null; |
| 2280 | } |
| 2281 | |
| 2282 | DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n",{} |
| 2283 | DPxPTR(E - HostBegin), E->name, DPxPTR(Varptr)){}; |
| 2284 | Entry.addr = (void *)Varptr; |
| 2285 | |
| 2286 | DeviceInfo().addOffloadEntry(DeviceId, Entry); |
| 2287 | |
| 2288 | if (DeviceInfo().RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && |
| 2289 | E->flags & OMP_DECLARE_TARGET_LINK) { |
| 2290 | // If unified memory is present any target link variables |
| 2291 | // can access host addresses directly. There is no longer a |
| 2292 | // need for device copies. |
| 2293 | Err = DeviceInfo().freesignalpoolMemcpyH2D(Varptr, E->addr, |
| 2294 | sizeof(void *), DeviceId); |
| 2295 | if (Err != HSA_STATUS_SUCCESS) |
| 2296 | DP("Error when copying USM\n"){}; |
| 2297 | DP("Copy linked variable host address (" DPxMOD ")"{} |
| 2298 | "to device address (" DPxMOD ")\n",{} |
| 2299 | DPxPTR(*((void **)E->addr)), DPxPTR(Varptr)){}; |
| 2300 | } |
| 2301 | |
| 2302 | continue; |
| 2303 | } |
| 2304 | |
| 2305 | DP("to find the kernel name: %s size: %lu\n", E->name, strlen(E->name)){}; |
| 2306 | |
| 2307 | // Errors in kernarg_segment_size were previously treated as 0 (or as undef).
| 2308 | uint32_t KernargSegmentSize = 0; |
| 2309 | auto &KernelInfoMap = DeviceInfo().KernelInfoTable[DeviceId]; |
| 2310 | hsa_status_t Err = HSA_STATUS_SUCCESS; |
| 2311 | if (!E->name) { |
| 2312 | Err = HSA_STATUS_ERROR; |
Value stored to 'Err' is never read | |
| 2313 | } else { |
| 2314 | std::string KernelStr = std::string(E->name); |
| 2315 | auto It = KernelInfoMap.find(KernelStr); |
| 2316 | if (It != KernelInfoMap.end()) { |
| 2317 | atl_kernel_info_t Info = It->second; |
| 2318 | KernargSegmentSize = Info.kernel_segment_size; |
| 2319 | } else { |
| 2320 | Err = HSA_STATUS_ERROR; |
| 2321 | } |
| 2322 | } |
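Editor's note on the analyzer finding above (not upstream code): the HSA_STATUS_ERROR stored when E->name is null is overwritten before it is read (Err is reassigned by interopGetSymbolInfo below), and a null name would still be dereferenced when KernDescNameStr is built. A minimal fix, sketched here as a hedged suggestion, would bail out instead of recording a dead error code:

  if (!E->name) {
    DP("Offload entry without a name, cannot look up kernel info\n");
    return NULL;
  }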
| 2323 | |
| 2324 | // default value GENERIC (in case the symbol is missing from the code object)
| 2325 | llvm::omp::OMPTgtExecModeFlags ExecModeVal = |
| 2326 | llvm::omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC; |
| 2327 | |
| 2328 | // get flat group size if present, else Default_WG_Size |
| 2329 | int16_t WGSizeVal = RTLDeviceInfoTy::DefaultWgSize; |
| 2330 | |
| 2331 | // get Kernel Descriptor if present. |
| 2332 | // Keep struct in sync with getTgtAttributeStructQTy in CGOpenMPRuntime.cpp
| 2333 | struct KernDescValType { |
| 2334 | uint16_t Version; |
| 2335 | uint16_t TSize; |
| 2336 | uint16_t WGSize; |
| 2337 | }; |
| 2338 | struct KernDescValType KernDescVal; |
| 2339 | std::string KernDescNameStr(E->name); |
| 2340 | KernDescNameStr += "_kern_desc"; |
| 2341 | const char *KernDescName = KernDescNameStr.c_str(); |
| 2342 | |
| 2343 | const void *KernDescPtr; |
| 2344 | uint32_t KernDescSize; |
| 2345 | void *CallStackAddr = nullptr; |
| 2346 | Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, KernDescName, |
| 2347 | &KernDescPtr, &KernDescSize); |
| 2348 | |
| 2349 | if (Err == HSA_STATUS_SUCCESS) { |
| 2350 | if ((size_t)KernDescSize != sizeof(KernDescVal)) |
| 2351 | DP("Loading global computation properties '%s' - size mismatch (%u != "{} |
| 2352 | "%lu)\n",{} |
| 2353 | KernDescName, KernDescSize, sizeof(KernDescVal)){}; |
| 2354 | |
| 2355 | memcpy(&KernDescVal, KernDescPtr, (size_t)KernDescSize); |
| 2356 | |
| 2357 | // Check structure size against recorded size. |
| 2358 | if ((size_t)KernDescSize != KernDescVal.TSize) |
| 2359 | DP("KernDescVal size %lu does not match advertised size %d for '%s'\n",{}
| 2360 | sizeof(KernDescVal), KernDescVal.TSize, KernDescName){}; |
| 2361 | |
| 2362 | DP("After loading global for %s KernDesc \n", KernDescName){}; |
| 2363 | DP("KernDesc: Version: %d\n", KernDescVal.Version){}; |
| 2364 | DP("KernDesc: TSize: %d\n", KernDescVal.TSize){}; |
| 2365 | DP("KernDesc: WG_Size: %d\n", KernDescVal.WGSize){}; |
| 2366 | |
| 2367 | if (KernDescVal.WGSize == 0) { |
| 2368 | KernDescVal.WGSize = RTLDeviceInfoTy::DefaultWgSize; |
| 2369 | DP("Setting KernDescVal.WG_Size to default %d\n", KernDescVal.WGSize){}; |
| 2370 | } |
| 2371 | WGSizeVal = KernDescVal.WGSize; |
| 2372 | DP("WGSizeVal %d\n", WGSizeVal){}; |
| 2373 | check("Loading KernDesc computation property", Err){}; |
| 2374 | } else { |
| 2375 | DP("Warning: Loading KernDesc '%s' - symbol not found, ", KernDescName){}; |
| 2376 | |
| 2377 | // Flat group size |
| 2378 | std::string WGSizeNameStr(E->name); |
| 2379 | WGSizeNameStr += "_wg_size"; |
| 2380 | const char *WGSizeName = WGSizeNameStr.c_str(); |
| 2381 | |
| 2382 | const void *WGSizePtr; |
| 2383 | uint32_t WGSize; |
| 2384 | Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, WGSizeName, |
| 2385 | &WGSizePtr, &WGSize); |
| 2386 | |
| 2387 | if (Err == HSA_STATUS_SUCCESS) { |
| 2388 | if ((size_t)WGSize != sizeof(int16_t)) { |
| 2389 | DP("Loading global computation properties '%s' - size mismatch (%u "{} |
| 2390 | "!= "{} |
| 2391 | "%lu)\n",{} |
| 2392 | WGSizeName, WGSize, sizeof(int16_t)){}; |
| 2393 | return NULL__null; |
| 2394 | } |
| 2395 | |
| 2396 | memcpy(&WGSizeVal, WGSizePtr, (size_t)WGSize); |
| 2397 | |
| 2398 | DP("After loading global for %s WGSize = %d\n", WGSizeName, WGSizeVal){}; |
| 2399 | |
| 2400 | if (WGSizeVal < RTLDeviceInfoTy::DefaultWgSize || |
| 2401 | WGSizeVal > RTLDeviceInfoTy::MaxWgSize) { |
| 2402 | DP("Error wrong WGSize value specified in HSA code object file: "{} |
| 2403 | "%d\n",{} |
| 2404 | WGSizeVal){}; |
| 2405 | WGSizeVal = RTLDeviceInfoTy::DefaultWgSize; |
| 2406 | } |
| 2407 | } else { |
| 2408 | DP("Warning: Loading WGSize '%s' - symbol not found, "{} |
| 2409 | "using default value %d\n",{} |
| 2410 | WGSizeName, WGSizeVal){}; |
| 2411 | } |
| 2412 | |
| 2413 | check("Loading WGSize computation property", Err){}; |
| 2414 | } |
| 2415 | |
| 2416 | // Read execution mode from global in binary |
| 2417 | std::string ExecModeNameStr(E->name); |
| 2418 | ExecModeNameStr += "_exec_mode"; |
| 2419 | const char *ExecModeName = ExecModeNameStr.c_str(); |
| 2420 | |
| 2421 | const void *ExecModePtr; |
| 2422 | uint32_t VarSize; |
| 2423 | Err = interopGetSymbolInfo((char *)Image->ImageStart, ImgSize, ExecModeName, |
| 2424 | &ExecModePtr, &VarSize); |
| 2425 | |
| 2426 | if (Err == HSA_STATUS_SUCCESS) { |
| 2427 | if ((size_t)VarSize != sizeof(llvm::omp::OMPTgtExecModeFlags)) { |
| 2428 | DP("Loading global computation properties '%s' - size mismatch(%u != "{} |
| 2429 | "%lu)\n",{} |
| 2430 | ExecModeName, VarSize, sizeof(llvm::omp::OMPTgtExecModeFlags)){}; |
| 2431 | return NULL__null; |
| 2432 | } |
| 2433 | |
| 2434 | memcpy(&ExecModeVal, ExecModePtr, (size_t)VarSize); |
| 2435 | |
| 2436 | DP("After loading global for %s ExecMode = %d\n", ExecModeName,{} |
| 2437 | ExecModeVal){}; |
| 2438 | |
| 2439 | if (ExecModeVal < 0 || |
| 2440 | ExecModeVal > llvm::omp::OMP_TGT_EXEC_MODE_GENERIC_SPMD) { |
| 2441 | DP("Error wrong exec_mode value specified in HSA code object file: "{} |
| 2442 | "%d\n",{} |
| 2443 | ExecModeVal){}; |
| 2444 | return NULL__null; |
| 2445 | } |
| 2446 | } else { |
| 2447 | DP("Loading global exec_mode '%s' - symbol missing, using default "{} |
| 2448 | "value "{} |
| 2449 | "GENERIC (1)\n",{} |
| 2450 | ExecModeName){}; |
| 2451 | } |
| 2452 | check("Loading computation property", Err){}; |
| 2453 | |
| 2454 | KernelsList.push_back(KernelTy(ExecModeVal, WGSizeVal, DeviceId, |
| 2455 | CallStackAddr, E->name, KernargSegmentSize, |
| 2456 | DeviceInfo().KernArgPool)); |
| 2457 | __tgt_offload_entry Entry = *E; |
| 2458 | Entry.addr = (void *)&KernelsList.back(); |
| 2459 | DeviceInfo().addOffloadEntry(DeviceId, Entry); |
| 2460 | DP("Entry point %ld maps to %s\n", E - HostBegin, E->name){}; |
| 2461 | } |
| 2462 | |
| 2463 | return DeviceInfo().getOffloadEntriesTable(DeviceId); |
| 2464 | } |
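Editor's note (illustrative only): for each kernel entry the loader above looks for companion globals emitted next to the kernel symbol. For a kernel named __omp_offloading_xxx_foo_l10 (a schematic name) these would be:

  __omp_offloading_xxx_foo_l10_kern_desc   // optional packed descriptor (KernDescValType)
  __omp_offloading_xxx_foo_l10_wg_size     // int16_t flat workgroup size
  __omp_offloading_xxx_foo_l10_exec_mode   // OMPTgtExecModeFlags, GENERIC if absent

Missing symbols fall back to the defaults noted in the DP messages above.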
| 2465 | |
| 2466 | void *__tgt_rtl_data_alloc(int DeviceId, int64_t Size, void *, int32_t Kind) { |
| 2467 | void *Ptr = NULL__null; |
| 2468 | assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large")(static_cast <bool> (DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large") ? void (0) : __assert_fail ("DeviceId < DeviceInfo().NumberOfDevices && \"Device ID too large\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 2468, __extension__ __PRETTY_FUNCTION__)); |
| 2469 | |
| 2470 | hsa_amd_memory_pool_t MemoryPool; |
| 2471 | switch (Kind) { |
| 2472 | case TARGET_ALLOC_DEFAULT: |
| 2473 | case TARGET_ALLOC_DEVICE: |
| 2474 | // GPU memory |
| 2475 | MemoryPool = DeviceInfo().getDeviceMemoryPool(DeviceId); |
| 2476 | break; |
| 2477 | case TARGET_ALLOC_HOST: |
| 2478 | // non-migratable memory accessible by host and device(s) |
| 2479 | MemoryPool = DeviceInfo().getHostMemoryPool(); |
| 2480 | break; |
| 2481 | default: |
| 2482 | REPORT("Invalid target data allocation kind or requested allocator not "do { fprintf(stderr, "AMDGPU" " error: "); fprintf(stderr, "Invalid target data allocation kind or requested allocator not " "implemented yet\n"); } while (0); |
| 2483 | "implemented yet\n")do { fprintf(stderr, "AMDGPU" " error: "); fprintf(stderr, "Invalid target data allocation kind or requested allocator not " "implemented yet\n"); } while (0);; |
| 2484 | return NULL__null; |
| 2485 | } |
| 2486 | |
| 2487 | hsa_status_t Err = hsa_amd_memory_pool_allocate(MemoryPool, Size, 0, &Ptr); |
| 2488 | DP("Tgt alloc data %ld bytes, (tgt:%016llx).\n", Size,{} |
| 2489 | (long long unsigned)(Elf64_Addr)Ptr){}; |
| 2490 | Ptr = (Err == HSA_STATUS_SUCCESS) ? Ptr : NULL__null; |
| 2491 | return Ptr; |
| 2492 | } |
| 2493 | |
| 2494 | int32_t __tgt_rtl_data_submit(int DeviceId, void *TgtPtr, void *HstPtr, |
| 2495 | int64_t Size) { |
| 2496 | assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large")(static_cast <bool> (DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large") ? void (0) : __assert_fail ("DeviceId < DeviceInfo().NumberOfDevices && \"Device ID too large\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 2496, __extension__ __PRETTY_FUNCTION__)); |
| 2497 | __tgt_async_info AsyncInfo; |
| 2498 | int32_t Rc = dataSubmit(DeviceId, TgtPtr, HstPtr, Size, &AsyncInfo); |
| 2499 | if (Rc != OFFLOAD_SUCCESS(0)) |
| 2500 | return OFFLOAD_FAIL(~0); |
| 2501 | |
| 2502 | return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); |
| 2503 | } |
| 2504 | |
| 2505 | int32_t __tgt_rtl_data_submit_async(int DeviceId, void *TgtPtr, void *HstPtr, |
| 2506 | int64_t Size, __tgt_async_info *AsyncInfo) { |
| 2507 | assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large")(static_cast <bool> (DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large") ? void (0) : __assert_fail ("DeviceId < DeviceInfo().NumberOfDevices && \"Device ID too large\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 2507, __extension__ __PRETTY_FUNCTION__)); |
| 2508 | if (AsyncInfo) { |
| 2509 | initAsyncInfo(AsyncInfo); |
| 2510 | return dataSubmit(DeviceId, TgtPtr, HstPtr, Size, AsyncInfo); |
| 2511 | } |
| 2512 | return __tgt_rtl_data_submit(DeviceId, TgtPtr, HstPtr, Size); |
| 2513 | } |
| 2514 | |
| 2515 | int32_t __tgt_rtl_data_retrieve(int DeviceId, void *HstPtr, void *TgtPtr, |
| 2516 | int64_t Size) { |
| 2517 | assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large")(static_cast <bool> (DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large") ? void (0) : __assert_fail ("DeviceId < DeviceInfo().NumberOfDevices && \"Device ID too large\"" , "openmp/libomptarget/plugins/amdgpu/src/rtl.cpp", 2517, __extension__ __PRETTY_FUNCTION__)); |
| 2518 | __tgt_async_info AsyncInfo; |
| 2519 | int32_t Rc = dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, &AsyncInfo); |
| 2520 | if (Rc != OFFLOAD_SUCCESS)
| 2521 | return OFFLOAD_FAIL;
| 2522 | |
| 2523 | return __tgt_rtl_synchronize(DeviceId, &AsyncInfo); |
| 2524 | } |
| 2525 | |
| 2526 | int32_t __tgt_rtl_data_retrieve_async(int DeviceId, void *HstPtr, void *TgtPtr, |
| 2527 | int64_t Size, |
| 2528 | __tgt_async_info *AsyncInfo) { |
| 2529 | assert(AsyncInfo && "AsyncInfo is nullptr");
| 2530 | assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
| 2531 | initAsyncInfo(AsyncInfo); |
| 2532 | return dataRetrieve(DeviceId, HstPtr, TgtPtr, Size, AsyncInfo); |
| 2533 | } |
| 2534 | |
| 2535 | int32_t __tgt_rtl_data_delete(int DeviceId, void *TgtPtr, int32_t) { |
| 2536 | assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
| 2537 | // HSA can free pointers allocated from different types of memory pool. |
| 2538 | hsa_status_t Err; |
| 2539 | DP("Tgt free data (tgt:%016llx).\n", (long long unsigned)(Elf64_Addr)TgtPtr){}; |
| 2540 | Err = core::Runtime::Memfree(TgtPtr); |
| 2541 | if (Err != HSA_STATUS_SUCCESS) { |
| 2542 | DP("Error when freeing CUDA memory\n"){}; |
| 2543 | return OFFLOAD_FAIL(~0); |
| 2544 | } |
| 2545 | return OFFLOAD_SUCCESS;
| 2546 | } |
| 2547 | |
| 2548 | int32_t __tgt_rtl_launch_kernel(int32_t DeviceId, void *TgtEntryPtr, |
| 2549 | void **TgtArgs, ptrdiff_t *TgtOffsets, |
| 2550 | KernelArgsTy *KernelArgs, |
| 2551 | __tgt_async_info *AsyncInfo) { |
| 2552 | assert(!KernelArgs->NumTeams[1] && !KernelArgs->NumTeams[2] &&
| 2553 |        !KernelArgs->ThreadLimit[1] && !KernelArgs->ThreadLimit[2] &&
| 2554 |        "Only one dimensional kernels supported.");
| 2555 | assert(AsyncInfo && "AsyncInfo is nullptr");
| 2556 | initAsyncInfo(AsyncInfo); |
| 2557 | |
| 2558 | DeviceInfo().LoadRunLock.lock_shared(); |
| 2559 | int32_t Res = |
| 2560 | runRegionLocked(DeviceId, TgtEntryPtr, TgtArgs, TgtOffsets, |
| 2561 | KernelArgs->NumArgs, KernelArgs->NumTeams[0], |
| 2562 | KernelArgs->ThreadLimit[0], KernelArgs->Tripcount); |
| 2563 | |
| 2564 | DeviceInfo().LoadRunLock.unlock_shared(); |
| 2565 | return Res; |
| 2566 | } |
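// A comment-only sketch of the locking pattern used in the launch path above,
// assuming DeviceInfo().LoadRunLock behaves like a std::shared_mutex: kernel
// launches take the lock shared so several launches may proceed concurrently,
// while image loading elsewhere in this file would take it exclusively.
//
//   #include <mutex>
//   #include <shared_mutex>
//
//   std::shared_mutex LoadRunLock;
//
//   void launchPath() {
//     std::shared_lock<std::shared_mutex> Guard(LoadRunLock); // many launchers
//     // runRegionLocked(...);
//   }
//
//   void loadImagePath() {
//     std::unique_lock<std::shared_mutex> Guard(LoadRunLock); // exclusive load
//     // load and register the device image
//   }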
| 2567 | |
| 2568 | int32_t __tgt_rtl_synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfo) { |
| 2569 | assert(AsyncInfo && "AsyncInfo is nullptr");
| 2570 | |
| 2571 | // The CUDA plugin asserts that AsyncInfo->Queue is non-null, but this
| 2572 | // invariant is not ensured by devices.cpp for amdgcn.
| 2573 | // assert(AsyncInfo->Queue && "AsyncInfo->Queue is nullptr"); |
| 2574 | if (AsyncInfo->Queue) { |
| 2575 | finiAsyncInfo(AsyncInfo); |
| 2576 | } |
| 2577 | return OFFLOAD_SUCCESS;
| 2578 | } |
| 2579 | |
| 2580 | void __tgt_rtl_print_device_info(int32_t DeviceId) { |
| 2581 | // TODO: Assertion to see if DeviceId is correct |
| 2582 | // NOTE: We don't need to set context for print device info. |
| 2583 | |
| 2584 | DeviceInfo().printDeviceInfo(DeviceId, DeviceInfo().HSAAgents[DeviceId]); |
| 2585 | } |
| 2586 | |
| 2587 | int32_t __tgt_rtl_data_lock(int32_t DeviceId, void *HostPtr, int64_t Size, |
| 2588 | void **LockedHostPtr) { |
| 2589 | assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
| 2590 | |
| 2591 | hsa_agent_t Agent = DeviceInfo().HSAAgents[DeviceId]; |
| 2592 | hsa_status_t err = lock_memory(HostPtr, Size, Agent, LockedHostPtr); |
| 2593 | if (err != HSA_STATUS_SUCCESS) { |
| 2594 | DP("Error in tgt_rtl_data_lock\n"){}; |
| 2595 | return OFFLOAD_FAIL(~0); |
| 2596 | } |
| 2597 | DP("Tgt lock host data %ld bytes, (HostPtr:%016llx).\n", Size,{} |
| 2598 | (long long unsigned)(Elf64_Addr)*LockedHostPtr){}; |
| 2599 | return OFFLOAD_SUCCESS(0); |
| 2600 | } |
| 2601 | |
| 2602 | int32_t __tgt_rtl_data_unlock(int DeviceId, void *HostPtr) { |
| 2603 | assert(DeviceId < DeviceInfo().NumberOfDevices && "Device ID too large");
| 2604 | hsa_status_t err = unlock_memory(HostPtr); |
| 2605 | if (err != HSA_STATUS_SUCCESS) { |
| 2606 | DP("Error in tgt_rtl_data_unlock\n"){}; |
| 2607 | return OFFLOAD_FAIL(~0); |
| 2608 | } |
| 2609 | |
| 2610 | DP("Tgt unlock data (tgt:%016llx).\n",{} |
| 2611 | (long long unsigned)(Elf64_Addr)HostPtr){}; |
| 2612 | return OFFLOAD_SUCCESS(0); |
| 2613 | } |
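// A comment-only sketch pairing the two entry points above to pin a host
// buffer across repeated transfers; HostBuf and Bytes are hypothetical
// caller-provided values, and device 0 is an arbitrary example.
//
//   void *Locked = nullptr;
//   if (__tgt_rtl_data_lock(/*DeviceId=*/0, HostBuf, Bytes, &Locked) ==
//       OFFLOAD_SUCCESS) {
//     // issue __tgt_rtl_data_submit / __tgt_rtl_data_retrieve against HostBuf
//     __tgt_rtl_data_unlock(/*DeviceId=*/0, HostBuf);
//   }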
| 2614 | |
| 2615 | } // extern "C" |