File:    build/source/openmp/libomptarget/src/omptarget.cpp
Warning: line 883, column 9: Potential leak of memory pointed to by 'PostProcessingPtrs'
//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Implementation of the interface to be used by Clang during the codegen of a
// target region.
//
//===----------------------------------------------------------------------===//

#include "omptarget.h"
#include "device.h"
#include "private.h"
#include "rtl.h"

#include "llvm/ADT/bit.h"

#include <cassert>
#include <cstdint>
#include <vector>

using llvm::SmallVector;

int AsyncInfoTy::synchronize() {
  int Result = OFFLOAD_SUCCESS;
  if (!isQueueEmpty()) {
    switch (SyncType) {
    case SyncTy::BLOCKING:
      // If we have a queue we need to synchronize it now.
      Result = Device.synchronize(*this);
      assert(AsyncInfo.Queue == nullptr &&
             "The device plugin should have nulled the queue to indicate there "
             "are no outstanding actions!");
      break;
    case SyncTy::NON_BLOCKING:
      Result = Device.queryAsync(*this);
      break;
    }
  }

  // Run any pending post-processing function registered on this async object.
  if (Result == OFFLOAD_SUCCESS && isQueueEmpty())
    Result = runPostProcessing();

  return Result;
}

void *&AsyncInfoTy::getVoidPtrLocation() {
  BufferLocations.push_back(nullptr);
  return BufferLocations.back();
}

bool AsyncInfoTy::isDone() const { return isQueueEmpty(); }

int32_t AsyncInfoTy::runPostProcessing() {
  size_t Size = PostProcessingFunctions.size();
  for (size_t I = 0; I < Size; ++I) {
    const int Result = PostProcessingFunctions[I]();
    if (Result != OFFLOAD_SUCCESS)
      return Result;
  }

  // Clear the vector up until the last known function, since post-processing
  // procedures might add new procedures themselves.
  const auto PrevBegin = PostProcessingFunctions.begin();
  PostProcessingFunctions.erase(PrevBegin, PrevBegin + Size);

  return OFFLOAD_SUCCESS;
}

bool AsyncInfoTy::isQueueEmpty() const { return AsyncInfo.Queue == nullptr; }

/* All begin addresses for partially mapped structs must be aligned, up to 16,
 * in order to ensure proper alignment of members. E.g.
 *
 * struct S {
 *   int a;   // 4-aligned
 *   int b;   // 4-aligned
 *   int *p;  // 8-aligned
 * } s1;
 * ...
 * #pragma omp target map(tofrom: s1.b, s1.p[0:N])
 * {
 *   s1.b = 5;
 *   for (int i...) s1.p[i] = ...;
 * }
 *
 * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and
 * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100,
 * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment
 * requirements for its type. Now, when we allocate memory on the device, in
 * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned.
 * This means that the chunk of the struct on the device will start at a
 * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and
 * address of p will be a misaligned 0x204 (on the host there was no need to add
 * padding between b and p, so p comes exactly 4 bytes after b). If the device
 * kernel tries to access s1.p, a misaligned address error occurs (as reported
 * by the CUDA plugin). By padding the begin address down to a multiple of 8 and
 * extending the size of the allocated chunk accordingly, the chunk on the
 * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and
 * &s1.p=0x208, as they should be to satisfy the alignment requirements.
 */
static const int64_t MaxAlignment = 16;

/// Return the alignment requirement of partially mapped structs, see
/// MaxAlignment above.
static uint64_t getPartialStructRequiredAlignment(void *HstPtrBase) {
  int LowestOneBit = __builtin_ffsl(reinterpret_cast<uintptr_t>(HstPtrBase));
  uint64_t BaseAlignment = 1 << (LowestOneBit - 1);
  return MaxAlignment < BaseAlignment ? MaxAlignment : BaseAlignment;
}
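
// Example (following the s1 mapping described above): for HstPtrBase == &s1 ==
// 0x100, __builtin_ffsl(0x100) is 9, so BaseAlignment is 1 << 8 == 256, which
// is clamped to MaxAlignment == 16. targetDataBegin/targetDataEnd then compute
// Padding = (int64_t)HstPtrBegin % Alignment == 0x104 % 16 == 4, move
// HstPtrBegin back to 0x100, and grow DataSize by those 4 bytes of padding.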

/// Map global data and execute pending ctors
static int initLibrary(DeviceTy &Device) {
  /*
   * Map global data
   */
  int32_t DeviceId = Device.DeviceID;
  int Rc = OFFLOAD_SUCCESS;
  bool SupportsEmptyImages = Device.RTL->supports_empty_images &&
                             Device.RTL->supports_empty_images() > 0;
  {
    std::lock_guard<decltype(PM->TrlTblMtx)> LG(PM->TrlTblMtx);
    for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) {
      TranslationTable *TransTable =
          &PM->HostEntriesBeginToTransTable[HostEntriesBegin];
      if (TransTable->HostTable.EntriesBegin ==
              TransTable->HostTable.EntriesEnd &&
          !SupportsEmptyImages) {
        // No host entry so no need to proceed
        continue;
      }

      if (TransTable->TargetsTable[DeviceId] != 0) {
        // Library entries have already been processed
        continue;
      }

      // 1) get image.
      assert(TransTable->TargetsImages.size() > (size_t)DeviceId &&
             "Not expecting a device ID outside the table's bounds!");
      __tgt_device_image *Img = TransTable->TargetsImages[DeviceId];
      if (!Img) {
        REPORT("No image loaded for device id %d.\n", DeviceId);
        Rc = OFFLOAD_FAIL;
        break;
      }
      // 2) load image into the target table.
      __tgt_target_table *TargetTable = TransTable->TargetsTable[DeviceId] =
          Device.loadBinary(Img);
      // Unable to get table for this image: invalidate image and fail.
      if (!TargetTable) {
        REPORT("Unable to generate entries table for device id %d.\n",
               DeviceId);
        TransTable->TargetsImages[DeviceId] = 0;
        Rc = OFFLOAD_FAIL;
        break;
      }

      // Verify whether the two table sizes match.
      size_t Hsize =
          TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
      size_t Tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;

      // Invalid image for these host entries!
      if (Hsize != Tsize) {
        REPORT(
            "Host and Target tables mismatch for device id %d [%zx != %zx].\n",
            DeviceId, Hsize, Tsize);
        TransTable->TargetsImages[DeviceId] = 0;
        TransTable->TargetsTable[DeviceId] = 0;
        Rc = OFFLOAD_FAIL;
        break;
      }

      DeviceTy::HDTTMapAccessorTy HDTTMap =
          Device.HostDataToTargetMap.getExclusiveAccessor();

      __tgt_target_table *HostTable = &TransTable->HostTable;
      for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
                               *CurrHostEntry = HostTable->EntriesBegin,
                               *EntryDeviceEnd = TargetTable->EntriesEnd;
           CurrDeviceEntry != EntryDeviceEnd;
           CurrDeviceEntry++, CurrHostEntry++) {
        if (CurrDeviceEntry->size != 0) {
          // has data.
          assert(CurrDeviceEntry->size == CurrHostEntry->size &&
                 "data size mismatch");

          // Fortran may use multiple weak declarations for the same symbol,
          // therefore we must allow for multiple weak symbols to be loaded from
          // the fat binary. Treat these mappings as any other "regular"
          // mapping. Add entry to map.
          if (Device.getTgtPtrBegin(HDTTMap, CurrHostEntry->addr,
                                    CurrHostEntry->size))
            continue;

          DP("Add mapping from host " DPxMOD " to device " DPxMOD
             " with size %zu"
             "\n",
             DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
             CurrDeviceEntry->size);
          HDTTMap->emplace(new HostDataToTargetTy(
              (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
              (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
              (uintptr_t)CurrHostEntry->addr +
                  CurrHostEntry->size /*HstPtrEnd*/,
              (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/,
              false /*UseHoldRefCount*/, CurrHostEntry->name,
              true /*IsRefCountINF*/));

          // Notify about the new mapping.
          if (Device.notifyDataMapped(CurrHostEntry->addr, CurrHostEntry->size))
            return OFFLOAD_FAIL;
        }
      }
    }
  }

  if (Rc != OFFLOAD_SUCCESS) {
    return Rc;
  }

  /*
   * Run ctors for static objects
   */
  if (!Device.PendingCtorsDtors.empty()) {
    AsyncInfoTy AsyncInfo(Device);
    // Call all ctors for all libraries registered so far
    for (auto &Lib : Device.PendingCtorsDtors) {
      if (!Lib.second.PendingCtors.empty()) {
        DP("Has pending ctors... call now\n");
        for (auto &Entry : Lib.second.PendingCtors) {
          void *Ctor = Entry;
          int Rc = target(nullptr, Device, Ctor, CTorDTorKernelArgs, AsyncInfo);
          if (Rc != OFFLOAD_SUCCESS) {
            REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(Ctor));
            return OFFLOAD_FAIL;
          }
        }
        // Clear the list to indicate that this device has been used
        Lib.second.PendingCtors.clear();
        DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(Lib.first));
      }
    }
    // All constructors have been issued, wait for them now.
    if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS)
      return OFFLOAD_FAIL;
  }
  Device.HasPendingGlobals = false;

  return OFFLOAD_SUCCESS;
}

void handleTargetOutcome(bool Success, ident_t *Loc) {
  switch (PM->TargetOffloadPolicy) {
  case tgt_disabled:
    if (Success) {
      FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled");
    }
    break;
  case tgt_default:
    FATAL_MESSAGE0(1, "default offloading policy must be switched to "
                      "mandatory or disabled");
    break;
  case tgt_mandatory:
    if (!Success) {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, *Device);
      else
        FAILURE_MESSAGE("Consult https://openmp.llvm.org/design/Runtimes.html "
                        "for debugging options.\n");

      if (PM->RTLs.UsedRTLs.empty()) {
        llvm::SmallVector<llvm::StringRef> Archs;
        llvm::transform(PM->Images, std::back_inserter(Archs),
                        [](const auto &x) {
                          return !x.second.Arch ? "empty" : x.second.Arch;
                        });
        FAILURE_MESSAGE(
            "No images found compatible with the installed hardware. ");
        fprintf(stderr, "Found (%s)\n", llvm::join(Archs, ",").c_str());
      }

      SourceInfo Info(Loc);
      if (Info.isAvailible())
        fprintf(stderr, "%s:%d:%d: ", Info.getFilename(), Info.getLine(),
                Info.getColumn());
      else
        FAILURE_MESSAGE("Source location information not present. Compile with "
                        "-g or -gline-tables-only.\n");
      FATAL_MESSAGE0(
          1, "failure of target construct while offloading is mandatory");
    } else {
      if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE)
        for (auto &Device : PM->Devices)
          dumpTargetPointerMappings(Loc, *Device);
    }
    break;
  }
}

static void handleDefaultTargetOffload() {
  std::lock_guard<decltype(PM->TargetOffloadMtx)> LG(PM->TargetOffloadMtx);
  if (PM->TargetOffloadPolicy == tgt_default) {
    if (omp_get_num_devices() > 0) {
      DP("Default TARGET OFFLOAD policy is now mandatory "
         "(devices were found)\n");
      PM->TargetOffloadPolicy = tgt_mandatory;
    } else {
      DP("Default TARGET OFFLOAD policy is now disabled "
         "(no devices were found)\n");
      PM->TargetOffloadPolicy = tgt_disabled;
    }
  }
}

static bool isOffloadDisabled() {
  if (PM->TargetOffloadPolicy == tgt_default)
    handleDefaultTargetOffload();
  return PM->TargetOffloadPolicy == tgt_disabled;
}

// If offload is enabled, ensure that device DeviceID has been initialized,
// global ctors have been executed, and global data has been mapped.
//
// The return bool indicates if the offload is to the host device.
// There are three possible results:
// - Return false if the target device is ready for offload.
// - Return true without reporting a runtime error if offload is
//   disabled, perhaps because the initial device was specified.
// - Report a runtime error and return true.
//
// If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device.
// This step might be skipped if offload is disabled.
bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) {
  if (isOffloadDisabled()) {
    DP("Offload is disabled\n");
    return true;
  }

  if (DeviceID == OFFLOAD_DEVICE_DEFAULT) {
    DeviceID = omp_get_default_device();
    DP("Use default device id %" PRId64 "\n", DeviceID);
  }

  // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669.
  if (omp_get_num_devices() == 0) {
    DP("omp_get_num_devices() == 0 but offload is mandatory\n");
    handleTargetOutcome(false, Loc);
    return true;
  }

  if (DeviceID == omp_get_initial_device()) {
    DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",
       DeviceID);
    return true;
  }

  // Is device ready?
  if (!deviceIsReady(DeviceID)) {
    REPORT("Device %" PRId64 " is not ready.\n", DeviceID);
    handleTargetOutcome(false, Loc);
    return true;
  }

  // Get device info.
  DeviceTy &Device = *PM->Devices[DeviceID];

  // Check whether global data has been mapped for this device
  {
    std::lock_guard<decltype(Device.PendingGlobalsMtx)> LG(
        Device.PendingGlobalsMtx);
    if (Device.HasPendingGlobals && initLibrary(Device) != OFFLOAD_SUCCESS) {
      REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID);
      handleTargetOutcome(false, Loc);
      return true;
    }
  }

  return false;
}

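// The MEMBER_OF field occupies the bits from position 48 upward of the map
// type and encodes the 1-based index of the parent ("combined") map entry in
// the argument list; getParentIndex() therefore yields the 0-based parent
// index, or -1 when no MEMBER_OF bits are set.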
static int32_t getParentIndex(int64_t Type) {
  return ((Type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
}

void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind,
                          const char *Name) {
  TIMESCOPE();
  DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size);

  if (Size <= 0) {
    DP("Call to %s with non-positive length\n", Name);
    return NULL;
  }

  void *Rc = NULL;

  if (DeviceNum == omp_get_initial_device()) {
    Rc = malloc(Size);
    DP("%s returns host ptr " DPxMOD "\n", Name, DPxPTR(Rc));
    return Rc;
  }

  if (!deviceIsReady(DeviceNum)) {
    DP("%s returns NULL ptr\n", Name);
    return NULL;
  }

  DeviceTy &Device = *PM->Devices[DeviceNum];
  Rc = Device.allocData(Size, nullptr, Kind);
  DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(Rc));
  return Rc;
}

void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind,
                        const char *Name) {
  TIMESCOPE();
  DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,
     DPxPTR(DevicePtr));

  if (!DevicePtr) {
    DP("Call to %s with NULL ptr\n", Name);
    return;
  }

  if (DeviceNum == omp_get_initial_device()) {
    free(DevicePtr);
    DP("%s deallocated host ptr\n", Name);
    return;
  }

  if (!deviceIsReady(DeviceNum)) {
    DP("%s returns, nothing to do\n", Name);
    return;
  }

  PM->Devices[DeviceNum]->deleteData(DevicePtr, Kind);
  DP("omp_target_free deallocated device ptr\n");
}

void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum,
                         const char *Name) {
  TIMESCOPE();
  DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size);

  if (Size <= 0) {
    DP("Call to %s with non-positive length\n", Name);
    return NULL;
  }

  void *rc = NULL;

  if (!deviceIsReady(DeviceNum)) {
    DP("%s returns NULL ptr\n", Name);
    return NULL;
  }

  DeviceTy *DevicePtr = nullptr;
  {
    std::lock_guard<decltype(PM->RTLsMtx)> LG(PM->RTLsMtx);

    if (!PM->Devices[DeviceNum]) {
      DP("%s returns, device %d not available\n", Name, DeviceNum);
      return nullptr;
    }

    DevicePtr = PM->Devices[DeviceNum].get();
  }

  int32_t err = 0;
  if (DevicePtr->RTL->data_lock) {
    err = DevicePtr->RTL->data_lock(DeviceNum, HostPtr, Size, &rc);
    if (err) {
      DP("Could not lock ptr %p\n", HostPtr);
      return nullptr;
    }
  }
  DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(rc));
  return rc;
}

void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) {
  TIMESCOPE();
  DP("Call to %s for device %d unlocking\n", Name, DeviceNum);

  DeviceTy *DevicePtr = nullptr;
  {
    std::lock_guard<decltype(PM->RTLsMtx)> LG(PM->RTLsMtx);

    // Don't check deviceIsReady as it can initialize the device if needed.
    // Just check if DeviceNum exists as targetUnlockExplicit can be called
    // during process exit/free (and it may have been already destroyed) and
    // targetAllocExplicit will have already checked deviceIsReady anyway.
    size_t DevicesSize = PM->Devices.size();

    if (DevicesSize <= (size_t)DeviceNum) {
      DP("Device ID %d does not have a matching RTL\n", DeviceNum);
      return;
    }

    if (!PM->Devices[DeviceNum]) {
      DP("%s returns, device %d not available\n", Name, DeviceNum);
      return;
    }

    DevicePtr = PM->Devices[DeviceNum].get();
  } // unlock RTLsMtx

  if (DevicePtr->RTL->data_unlock)
    DevicePtr->RTL->data_unlock(DeviceNum, HostPtr);

  DP("%s returns\n", Name);
}

/// Call the user-defined mapper function followed by the appropriate
/// targetData* function (targetData{Begin,End,Update}).
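/// targetDataBegin and targetDataEnd pass themselves as \p TargetDataFunction,
/// so the component list produced by the mapper is processed by the same
/// routine that encountered the mapper argument (with FromMapper set to true).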
int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg,
                     int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames,
                     void *ArgMapper, AsyncInfoTy &AsyncInfo,
                     TargetDataFuncPtrTy TargetDataFunction) {
  TIMESCOPE_WITH_IDENT(Loc);
  DP("Calling the mapper function " DPxMOD "\n", DPxPTR(ArgMapper));

  // The mapper function fills up Components.
  MapperComponentsTy MapperComponents;
  MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(ArgMapper);
  (*MapperFuncPtr)((void *)&MapperComponents, ArgBase, Arg, ArgSize, ArgType,
                   ArgNames);

  // Construct new arrays for args_base, args, arg_sizes and arg_types
  // using the information in MapperComponents and call the corresponding
  // targetData* function using these new arrays.
  SmallVector<void *> MapperArgsBase(MapperComponents.Components.size());
  SmallVector<void *> MapperArgs(MapperComponents.Components.size());
  SmallVector<int64_t> MapperArgSizes(MapperComponents.Components.size());
  SmallVector<int64_t> MapperArgTypes(MapperComponents.Components.size());
  SmallVector<void *> MapperArgNames(MapperComponents.Components.size());

  for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) {
    auto &C = MapperComponents.Components[I];
    MapperArgsBase[I] = C.Base;
    MapperArgs[I] = C.Begin;
    MapperArgSizes[I] = C.Size;
    MapperArgTypes[I] = C.Type;
    MapperArgNames[I] = C.Name;
  }

  int Rc = TargetDataFunction(Loc, Device, MapperComponents.Components.size(),
                              MapperArgsBase.data(), MapperArgs.data(),
                              MapperArgSizes.data(), MapperArgTypes.data(),
                              MapperArgNames.data(), /*arg_mappers*/ nullptr,
                              AsyncInfo, /*FromMapper=*/true);

  return Rc;
}

/// Internal function to do the mapping and transfer the data to the device
int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                    void **ArgsBase, void **Args, int64_t *ArgSizes,
                    int64_t *ArgTypes, map_var_info_t *ArgNames,
                    void **ArgMappers, AsyncInfoTy &AsyncInfo,
                    bool FromMapper) {
  // process each input.
  for (int32_t I = 0; I < ArgNum; ++I) {
    // Ignore private variables and arrays - there is no mapping for them.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataBegin, call the
      // targetDataMapper variant which will call targetDataBegin again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      int Rc = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I],
                                ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                                targetDataBegin);

      if (Rc != OFFLOAD_SUCCESS) {
        REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"
               " failed.\n");
        return OFFLOAD_FAIL;
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    void *HstPtrBase = ArgsBase[I];
    int64_t DataSize = ArgSizes[I];
    map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I];

    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    int64_t Padding = 0;
    const int NextI = I + 1;
    if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
        getParentIndex(ArgTypes[NextI]) == I) {
      int64_t Alignment = getPartialStructRequiredAlignment(HstPtrBase);
      Padding = (int64_t)HstPtrBegin % Alignment;
      if (Padding) {
        DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD
           "\n",
           Padding, DPxPTR(HstPtrBegin));
        HstPtrBegin = (char *)HstPtrBegin - Padding;
        DataSize += Padding;
      }
    }

    // Address of pointer on the host and device, respectively.
    void *PointerHstPtrBegin, *PointerTgtPtrBegin;
    TargetPointerResultTy PointerTpr;
    bool IsHostPtr = false;
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    // Force the creation of a device side copy of the data when:
    // a close map modifier was associated with a map that contained a to.
    bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;
    // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
    // have reached this point via __tgt_target_data_begin and not __tgt_target
    // then no argument is marked as TARGET_PARAM ("omp target data map" is not
    // associated with a target region, so there are no target parameters). This
    // may be considered a hack, we could revise the scheme in the future.
    bool UpdateRef =
        !(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && I == 0);

    DeviceTy::HDTTMapAccessorTy HDTTMap =
        Device.HostDataToTargetMap.getExclusiveAccessor();
    if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
      DP("Has a pointer entry: \n");
      // Base is address of pointer.
      //
      // Usually, the pointer is already allocated by this time. For example:
      //
      // #pragma omp target map(s.p[0:N])
      //
      // The map entry for s comes first, and the PTR_AND_OBJ entry comes
      // afterward, so the pointer is already allocated by the time the
      // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus
      // non-null. However, "declare target link" can produce a PTR_AND_OBJ
      // entry for a global that might not already be allocated by the time the
      // PTR_AND_OBJ entry is handled below, and so the allocation might fail
      // when HasPresentModifier.
      PointerTpr = Device.getTargetPointer(
          HDTTMap, HstPtrBase, HstPtrBase, sizeof(void *),
          /*HstPtrName=*/nullptr,
          /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef,
          HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo,
          /* OwnedTPR */ nullptr, /* ReleaseHDTTMap */ false);
      PointerTgtPtrBegin = PointerTpr.TargetPointer;
      IsHostPtr = PointerTpr.Flags.IsHostPointer;
      if (!PointerTgtPtrBegin) {
        REPORT("Call to getTargetPointer returned null pointer (%s).\n",
               HasPresentModifier ? "'present' map type modifier"
                                  : "device failure or illegal mapping");
        return OFFLOAD_FAIL;
      }
      DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
         "\n",
         sizeof(void *), DPxPTR(PointerTgtPtrBegin),
         (PointerTpr.Flags.IsNewEntry ? "" : " not"));
      PointerHstPtrBegin = HstPtrBase;
      // modify current entry.
      HstPtrBase = *(void **)HstPtrBase;
      // No need to update pointee ref count for the first element of the
      // subelement that comes from mapper.
      UpdateRef =
          (!FromMapper || I != 0); // subsequently update ref count of pointee
    }

    const bool HasFlagTo = ArgTypes[I] & OMP_TGT_MAPTYPE_TO;
    const bool HasFlagAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
    // Note that HDTTMap will be released in getTargetPointer.
    auto TPR = Device.getTargetPointer(
        HDTTMap, HstPtrBegin, HstPtrBase, DataSize, HstPtrName, HasFlagTo,
        HasFlagAlways, IsImplicit, UpdateRef, HasCloseModifier,
        HasPresentModifier, HasHoldModifier, AsyncInfo, PointerTpr.getEntry());
    void *TgtPtrBegin = TPR.TargetPointer;
    IsHostPtr = TPR.Flags.IsHostPointer;
    // If data_size==0, then the argument could be a zero-length pointer to
    // NULL, so getOrAlloc() returning NULL is not an error.
    if (!TgtPtrBegin && (DataSize || HasPresentModifier)) {
      REPORT("Call to getTargetPointer returned null pointer (%s).\n",
             HasPresentModifier ? "'present' map type modifier"
                                : "device failure or illegal mapping");
      return OFFLOAD_FAIL;
    }
    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
       " - is%s new\n",
       DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not"));

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
      uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase;
      void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta);
      DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase));
      ArgsBase[I] = TgtPtrBase;
    }

    if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {

      uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
      void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);

      if (PointerTpr.getEntry()->addShadowPointer(ShadowPtrInfoTy{
              (void **)PointerHstPtrBegin, HstPtrBase,
              (void **)PointerTgtPtrBegin, ExpectedTgtPtrBase})) {
        DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
           DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin));

        void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation();
        TgtPtrBase = ExpectedTgtPtrBase;

        int Ret =
            Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, sizeof(void *),
                              AsyncInfo, PointerTpr.getEntry());
        if (Ret != OFFLOAD_SUCCESS) {
          REPORT("Copying data to device failed.\n");
          return OFFLOAD_FAIL;
        }
        if (PointerTpr.getEntry()->addEventIfNecessary(Device, AsyncInfo) !=
            OFFLOAD_SUCCESS)
          return OFFLOAD_FAIL;
      }
    }

    // Check if variable can be used on the device:
    bool IsStructMember = ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF;
    if (getInfoLevel() & OMP_INFOTYPE_EMPTY_MAPPING && ArgTypes[I] != 0 &&
        !IsStructMember && !IsImplicit && !TPR.isPresent() &&
        !TPR.isContained() && !TPR.isHostPointer())
      INFO(OMP_INFOTYPE_EMPTY_MAPPING, Device.DeviceID,
           "variable %s does not have a valid device counterpart\n",
           (HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown");
  }

  return OFFLOAD_SUCCESS;
}

namespace {
/// This structure contains information to deallocate a target pointer, aka.
/// used to fix up the shadow map and potentially delete the entry from the
/// mapping table via \p DeviceTy::deallocTgtPtr.
struct PostProcessingInfo {
  /// Host pointer used to look up into the map table
  void *HstPtrBegin;

  /// Size of the data
  int64_t DataSize;

  /// The mapping type (bitfield).
  int64_t ArgType;

  /// The target pointer information.
  TargetPointerResultTy TPR;

  PostProcessingInfo(void *HstPtr, int64_t Size, int64_t ArgType,
                     TargetPointerResultTy &&TPR)
      : HstPtrBegin(HstPtr), DataSize(Size), ArgType(ArgType),
        TPR(std::move(TPR)) {}
};

} // namespace

/// Applies the necessary post-processing procedures to entries listed in \p
/// EntriesInfo after the execution of all device side operations from a target
/// data end. This includes the update of pointers at the host and removal of
/// device buffer when needed. It returns OFFLOAD_FAIL or OFFLOAD_SUCCESS
/// according to the success of the operations.
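/// Note that the heap-allocated \p EntriesInfo vector is deleted at the end of
/// this function, so callers must not use it afterwards.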
[[nodiscard]] static int
postProcessingTargetDataEnd(DeviceTy *Device,
                            SmallVector<PostProcessingInfo> &EntriesInfo) {
  int Ret = OFFLOAD_SUCCESS;

  for (auto &[HstPtrBegin, DataSize, ArgType, TPR] : EntriesInfo) {
    bool DelEntry = !TPR.isHostPointer();

    // If the last element from the mapper (for end transfer args comes in
    // reverse order), do not remove the partial entry, the parent struct still
    // exists.
    if ((ArgType & OMP_TGT_MAPTYPE_MEMBER_OF) &&
        !(ArgType & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
      DelEntry = false; // protect parent struct from being deallocated
    }

    // If we marked the entry to be deleted we need to verify no other
    // thread reused it by now. If deletion is still supposed to happen by
    // this thread LR will be set and exclusive access to the HDTT map
    // will avoid another thread reusing the entry now. Note that we do
    // not request (exclusive) access to the HDTT map if DelEntry is
    // not set.
    DeviceTy::HDTTMapAccessorTy HDTTMap =
        Device->HostDataToTargetMap.getExclusiveAccessor();

    // We cannot use a lock guard because we may end up deleting the mutex.
    // We also explicitly unlocked the entry after it was put in the EntriesInfo
    // so it can be reused.
    TPR.getEntry()->lock();
    auto *Entry = TPR.getEntry();

    const bool IsNotLastUser = Entry->decDataEndThreadCount() != 0;
    if (DelEntry && (Entry->getTotalRefCount() != 0 || IsNotLastUser)) {
      // The thread is not in charge of deletion anymore. Give up access
      // to the HDTT map and unset the deletion flag.
      HDTTMap.destroy();
      DelEntry = false;
    }

    // If we copied back to the host a struct/array containing pointers,
    // we need to restore the original host pointer values from their
    // shadow copies. If the struct is going to be deallocated, remove any
    // remaining shadow pointer entries for this struct.
    const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM;
    if (HasFrom) {
      Entry->foreachShadowPointerInfo(
          [&](const ShadowPtrInfoTy &ShadowPtr) {
            *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal;
            DP("Restoring original host pointer value " DPxMOD " for host "
               "pointer " DPxMOD "\n",
               DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr));
            return OFFLOAD_SUCCESS;
          });
    }

    // Give up the lock as we either don't need it anymore (e.g., done with
    // TPR), or erase TPR.
    TPR.setEntry(nullptr);

    if (!DelEntry)
      continue;

    Ret = Device->eraseMapEntry(HDTTMap, Entry, DataSize);
    // Entry is already removed from the map, we can unlock it now.
    HDTTMap.destroy();
    Ret |= Device->deallocTgtPtrAndEntry(Entry, DataSize);
    if (Ret != OFFLOAD_SUCCESS) {
      REPORT("Deallocating data from device failed.\n");
      break;
    }
  }

  delete &EntriesInfo;
  return Ret;
}

/// Internal function to undo the mapping and retrieve the data from the device.
int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                  void **ArgBases, void **Args, int64_t *ArgSizes,
                  int64_t *ArgTypes, map_var_info_t *ArgNames,
                  void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
  int Ret = OFFLOAD_SUCCESS;
  auto *PostProcessingPtrs = new SmallVector<PostProcessingInfo>();
  // process each input.
  for (int32_t I = ArgNum - 1; I >= 0; --I) {
    // Ignore private variables and arrays - there is no mapping for them.
    // Also, ignore the use_device_ptr directive, it has no effect here.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataEnd, call the
      // targetDataMapper variant which will call targetDataEnd again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I);

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      Ret = targetDataMapper(Loc, Device, ArgBases[I], Args[I], ArgSizes[I],
                             ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                             targetDataEnd);

      if (Ret != OFFLOAD_SUCCESS) {
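        // The leak reported in the warning above (line 883, column 9): this
        // error path returns without freeing 'PostProcessingPtrs', which was
        // heap-allocated at the top of targetDataEnd and is otherwise released
        // inside postProcessingTargetDataEnd() via `delete &EntriesInfo`.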
| 883 | REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Call to targetDataEnd via targetDataMapper for custom mapper" " failed.\n"); } while (0); | ||||||
| |||||||
| 884 | " failed.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Call to targetDataEnd via targetDataMapper for custom mapper" " failed.\n"); } while (0);; | ||||||
| 885 | return OFFLOAD_FAIL(~0); | ||||||
| 886 | } | ||||||
| 887 | |||||||
| 888 | // Skip the rest of this function, continue to the next argument. | ||||||
| 889 | continue; | ||||||
| 890 | } | ||||||
| 891 | |||||||
| 892 | void *HstPtrBegin = Args[I]; | ||||||
| 893 | void *HstPtrBase = ArgBases[I]; | ||||||
| 894 | int64_t DataSize = ArgSizes[I]; | ||||||
| 895 | // Adjust for proper alignment if this is a combined entry (for structs). | ||||||
| 896 | // Look at the next argument - if that is MEMBER_OF this one, then this one | ||||||
| 897 | // is a combined entry. | ||||||
| 898 | const int NextI = I + 1; | ||||||
| 899 | if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum && | ||||||
| 900 | getParentIndex(ArgTypes[NextI]) == I) { | ||||||
| 901 | int64_t Alignment = getPartialStructRequiredAlignment(HstPtrBase); | ||||||
| 902 | int64_t Padding = (int64_t)HstPtrBegin % Alignment; | ||||||
| 903 | if (Padding) { | ||||||
| 904 | DP("Using a Padding of %" PRId64 " bytes for begin address " DPxMOD{} | ||||||
| 905 | "\n",{} | ||||||
| 906 | Padding, DPxPTR(HstPtrBegin)){}; | ||||||
| 907 | HstPtrBegin = (char *)HstPtrBegin - Padding; | ||||||
| 908 | DataSize += Padding; | ||||||
| 909 | } | ||||||
| 910 | } | ||||||
| 911 | |||||||
| 912 | bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT; | ||||||
| 913 | bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) || | ||||||
| 914 | (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) && | ||||||
| 915 | !(FromMapper && I == 0); | ||||||
| 916 | bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE; | ||||||
| 917 | bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT; | ||||||
| 918 | bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD; | ||||||
| 919 | |||||||
| 920 | // If PTR_AND_OBJ, HstPtrBegin is address of pointee | ||||||
| 921 | TargetPointerResultTy TPR = | ||||||
| 922 | Device.getTgtPtrBegin(HstPtrBegin, DataSize, UpdateRef, HasHoldModifier, | ||||||
| 923 | !IsImplicit, ForceDelete, /*FromDataEnd=*/true); | ||||||
| 924 | void *TgtPtrBegin = TPR.TargetPointer; | ||||||
| 925 | if (!TPR.isPresent() && !TPR.isHostPointer() && | ||||||
| 926 | (DataSize || HasPresentModifier)) { | ||||||
| 927 | DP("Mapping does not exist (%s)\n",{} | ||||||
| 928 | (HasPresentModifier ? "'present' map type modifier" : "ignored")){}; | ||||||
| 929 | if (HasPresentModifier) { | ||||||
| 930 | // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13: | ||||||
| 931 | // "If a map clause appears on a target, target data, target enter data | ||||||
| 932 | // or target exit data construct with a present map-type-modifier then | ||||||
| 933 | // on entry to the region if the corresponding list item does not appear | ||||||
| 934 | // in the device data environment then an error occurs and the program | ||||||
| 935 | // terminates." | ||||||
| 936 | // | ||||||
| 937 | // This should be an error upon entering an "omp target exit data". It | ||||||
| 938 | // should not be an error upon exiting an "omp target data" or "omp | ||||||
| 939 | // target". For "omp target data", Clang thus doesn't include present | ||||||
| 940 | // modifiers for end calls. For "omp target", we have not found a valid | ||||||
| 941 | // OpenMP program for which the error matters: it appears that, if a | ||||||
| 942 | // program can guarantee that data is present at the beginning of an | ||||||
| 943 | // "omp target" region so that there's no error there, that data is also | ||||||
| 944 | // guaranteed to be present at the end. | ||||||
| 945 | MESSAGE("device mapping required by 'present' map type modifier does "do { fprintf(stderr, "Libomptarget" " message: " "device mapping required by 'present' map type modifier does " "not exist for host address " "0x%0*" "l" "x" " (%" "l" "d" " bytes)" "\n", ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(HstPtrBegin )), DataSize); } while (0) | ||||||
| 946 | "not exist for host address " DPxMOD " (%" PRId64 " bytes)",do { fprintf(stderr, "Libomptarget" " message: " "device mapping required by 'present' map type modifier does " "not exist for host address " "0x%0*" "l" "x" " (%" "l" "d" " bytes)" "\n", ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(HstPtrBegin )), DataSize); } while (0) | ||||||
| 947 | DPxPTR(HstPtrBegin), DataSize)do { fprintf(stderr, "Libomptarget" " message: " "device mapping required by 'present' map type modifier does " "not exist for host address " "0x%0*" "l" "x" " (%" "l" "d" " bytes)" "\n", ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(HstPtrBegin )), DataSize); } while (0); | ||||||
| 948 | return OFFLOAD_FAIL(~0); | ||||||
| 949 | } | ||||||
| 950 | } else { | ||||||
| 951 | DP("There are %" PRId64 " bytes allocated at target address " DPxMOD{} | ||||||
| 952 | " - is%s last\n",{} | ||||||
| 953 | DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsLast ? "" : " not")){}; | ||||||
| 954 | } | ||||||
| 955 | |||||||
| 956 | // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16: | ||||||
| 957 | // "If the map clause appears on a target, target data, or target exit data | ||||||
| 958 | // construct and a corresponding list item of the original list item is not | ||||||
| 959 | // present in the device data environment on exit from the region then the | ||||||
| 960 | // list item is ignored." | ||||||
| 961 | if (!TPR.isPresent()) | ||||||
| 962 | continue; | ||||||
| 963 | |||||||
| 964 | // Move data back to the host | ||||||
| 965 | const bool HasAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS; | ||||||
| 966 | const bool HasFrom = ArgTypes[I] & OMP_TGT_MAPTYPE_FROM; | ||||||
| 967 | if (HasFrom && (HasAlways || TPR.Flags.IsLast) && | ||||||
| 968 | !TPR.Flags.IsHostPointer && DataSize != 0) { | ||||||
| 969 | DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",{} | ||||||
| 970 | DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)){}; | ||||||
| 971 | |||||||
| 972 | // Wait for any previous transfer if an event is present. | ||||||
| 973 | if (void *Event = TPR.getEntry()->getEvent()) { | ||||||
| 974 | if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS(0)) { | ||||||
| 975 | REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event))do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Failed to wait for event " "0x%0*" "l" "x" ".\n", ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(Event))); } while (0);; | ||||||
| 976 | return OFFLOAD_FAIL(~0); | ||||||
| 977 | } | ||||||
| 978 | } | ||||||
| 979 | |||||||
| 980 | Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, AsyncInfo, | ||||||
| 981 | TPR.getEntry()); | ||||||
| 982 | if (Ret != OFFLOAD_SUCCESS(0)) { | ||||||
| 983 | REPORT("Copying data from device failed.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Copying data from device failed.\n"); } while (0);; | ||||||
| 984 | return OFFLOAD_FAIL(~0); | ||||||
| 985 | } | ||||||
| 986 | |||||||
| 987 | // As we expect to delete the entry, the d2h copy might race | ||||||
| 988 | // with another one that also tries to delete the entry. This happens | ||||||
| 989 | // because the entry can be reused, and the reuse might happen after the | ||||||
| 990 | // copy-back was issued but before it completed. Since the reuse might | ||||||
| 991 | // also copy back a value, we would race. | ||||||
| 992 | if (TPR.Flags.IsLast) { | ||||||
| 993 | if (TPR.getEntry()->addEventIfNecessary(Device, AsyncInfo) != | ||||||
| 994 | OFFLOAD_SUCCESS(0)) | ||||||
| 995 | return OFFLOAD_FAIL(~0); | ||||||
| 996 | } | ||||||
| 997 | } | ||||||
| 998 | |||||||
| 999 | // Add pointer to the buffer for post-synchronize processing. | ||||||
| 1000 | PostProcessingPtrs->emplace_back(HstPtrBegin, DataSize, ArgTypes[I], | ||||||
| 1001 | std::move(TPR)); | ||||||
| 1002 | PostProcessingPtrs->back().TPR.getEntry()->unlock(); | ||||||
| 1003 | } | ||||||
| 1004 | |||||||
| 1005 | // Add post-processing functions | ||||||
| 1006 | // TODO: We might want to remove `mutable` in the future by not changing the | ||||||
| 1007 | // captured variables somehow. | ||||||
| 1008 | AsyncInfo.addPostProcessingFunction([=, Device = &Device]() mutable -> int { | ||||||
| 1009 | return postProcessingTargetDataEnd(Device, *PostProcessingPtrs); | ||||||
| 1010 | }); | ||||||
| 1011 | |||||||
| 1012 | return Ret; | ||||||
| 1013 | } | ||||||
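Several of the early returns in this function (the mapper failure at source line 885, the missing 'present' mapping, a failed event wait, a failed retrieve) leave targetDataEnd before the post-processing function at the end is ever registered. If PostProcessingPtrs is heap-allocated earlier in the function (that part is outside this excerpt), none of those paths release it. A minimal, self-contained sketch of one way to tie such a buffer's lifetime to the deferred callback; PostProcessingInfo and MiniAsyncInfo are illustrative stand-ins, not the libomptarget types, and this is not necessarily how the real code resolves it.

#include <functional>
#include <memory>
#include <vector>

struct PostProcessingInfo { void *HstPtrBegin; long Size; };

struct MiniAsyncInfo {
  std::vector<std::function<int()>> PostProcessing;
  void addPostProcessingFunction(std::function<int()> F) {
    PostProcessing.emplace_back(std::move(F));
  }
};

// The buffer is owned by a unique_ptr from the moment it is created, so every
// early 'return' releases it automatically; only when the callback is actually
// registered does ownership move into the (copyable) closure.
int sketchTargetDataEnd(MiniAsyncInfo &AsyncInfo, bool FailEarly) {
  auto Ptrs = std::make_unique<std::vector<PostProcessingInfo>>();
  if (FailEarly)
    return ~0; // early error path: the buffer is freed right here

  Ptrs->push_back({nullptr, 0});

  // std::function needs a copyable callable, so hand the buffer to a
  // shared_ptr before capturing it in the deferred function.
  std::shared_ptr<std::vector<PostProcessingInfo>> Shared(std::move(Ptrs));
  AsyncInfo.addPostProcessingFunction([Shared]() -> int {
    return static_cast<int>(Shared->size()); // placeholder for the real work
  });
  return 0;
}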
| 1014 | |||||||
| 1015 | static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase, | ||||||
| 1016 | void *HstPtrBegin, int64_t ArgSize, | ||||||
| 1017 | int64_t ArgType, AsyncInfoTy &AsyncInfo) { | ||||||
| 1018 | TIMESCOPE_WITH_IDENT(Loc)SourceInfo SI(Loc); llvm::TimeTraceScope TimeScope(__FUNCTION__ , SI.getProfileLocation()); | ||||||
| 1019 | TargetPointerResultTy TPR = | ||||||
| 1020 | Device.getTgtPtrBegin(HstPtrBegin, ArgSize, /*UpdateRefCount=*/false, | ||||||
| 1021 | /*UseHoldRefCount=*/false, /*MustContain=*/true); | ||||||
| 1022 | void *TgtPtrBegin = TPR.TargetPointer; | ||||||
| 1023 | if (!TPR.isPresent()) { | ||||||
| 1024 | DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin)){}; | ||||||
| 1025 | if (ArgType & OMP_TGT_MAPTYPE_PRESENT) { | ||||||
| 1026 | MESSAGE("device mapping required by 'present' motion modifier does not "do { fprintf(stderr, "Libomptarget" " message: " "device mapping required by 'present' motion modifier does not " "exist for host address " "0x%0*" "l" "x" " (%" "l" "d" " bytes)" "\n", ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(HstPtrBegin )), ArgSize); } while (0) | ||||||
| 1027 | "exist for host address " DPxMOD " (%" PRId64 " bytes)",do { fprintf(stderr, "Libomptarget" " message: " "device mapping required by 'present' motion modifier does not " "exist for host address " "0x%0*" "l" "x" " (%" "l" "d" " bytes)" "\n", ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(HstPtrBegin )), ArgSize); } while (0) | ||||||
| 1028 | DPxPTR(HstPtrBegin), ArgSize)do { fprintf(stderr, "Libomptarget" " message: " "device mapping required by 'present' motion modifier does not " "exist for host address " "0x%0*" "l" "x" " (%" "l" "d" " bytes)" "\n", ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(HstPtrBegin )), ArgSize); } while (0); | ||||||
| 1029 | return OFFLOAD_FAIL(~0); | ||||||
| 1030 | } | ||||||
| 1031 | return OFFLOAD_SUCCESS(0); | ||||||
| 1032 | } | ||||||
| 1033 | |||||||
| 1034 | if (TPR.Flags.IsHostPointer) { | ||||||
| 1035 | DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",{} | ||||||
| 1036 | DPxPTR(HstPtrBegin)){}; | ||||||
| 1037 | return OFFLOAD_SUCCESS(0); | ||||||
| 1038 | } | ||||||
| 1039 | |||||||
| 1040 | if (ArgType & OMP_TGT_MAPTYPE_TO) { | ||||||
| 1041 | DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",{} | ||||||
| 1042 | ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)){}; | ||||||
| 1043 | int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo, | ||||||
| 1044 | TPR.getEntry()); | ||||||
| 1045 | if (Ret != OFFLOAD_SUCCESS(0)) { | ||||||
| 1046 | REPORT("Copying data to device failed.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Copying data to device failed.\n"); } while (0);; | ||||||
| 1047 | return OFFLOAD_FAIL(~0); | ||||||
| 1048 | } | ||||||
| 1049 | if (TPR.getEntry()) { | ||||||
| 1050 | int Ret = TPR.getEntry()->foreachShadowPointerInfo( | ||||||
| 1051 | [&](ShadowPtrInfoTy &ShadowPtr) { | ||||||
| 1052 | DP("Restoring original target pointer value " DPxMOD " for target "{} | ||||||
| 1053 | "pointer " DPxMOD "\n",{} | ||||||
| 1054 | DPxPTR(ShadowPtr.TgtPtrVal), DPxPTR(ShadowPtr.TgtPtrAddr)){}; | ||||||
| 1055 | Ret = Device.submitData(ShadowPtr.TgtPtrAddr, | ||||||
| 1056 | (void *)&ShadowPtr.TgtPtrVal, | ||||||
| 1057 | sizeof(void *), AsyncInfo); | ||||||
| 1058 | if (Ret != OFFLOAD_SUCCESS(0)) { | ||||||
| 1059 | REPORT("Copying data to device failed.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Copying data to device failed.\n"); } while (0);; | ||||||
| 1060 | return OFFLOAD_FAIL(~0); | ||||||
| 1061 | } | ||||||
| 1062 | return OFFLOAD_SUCCESS(0); | ||||||
| 1063 | }); | ||||||
| 1064 | if (Ret != OFFLOAD_SUCCESS(0)) { | ||||||
| 1065 | DP("Updating shadow map failed\n"){}; | ||||||
| 1066 | return Ret; | ||||||
| 1067 | } | ||||||
| 1068 | } | ||||||
| 1069 | } | ||||||
| 1070 | |||||||
| 1071 | if (ArgType & OMP_TGT_MAPTYPE_FROM) { | ||||||
| 1072 | DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",{} | ||||||
| 1073 | ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)){}; | ||||||
| 1074 | int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo, | ||||||
| 1075 | TPR.getEntry()); | ||||||
| 1076 | if (Ret != OFFLOAD_SUCCESS(0)) { | ||||||
| 1077 | REPORT("Copying data from device failed.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Copying data from device failed.\n"); } while (0);; | ||||||
| 1078 | return OFFLOAD_FAIL(~0); | ||||||
| 1079 | } | ||||||
| 1080 | |||||||
| 1081 | // Wait for device-to-host memcopies for whole struct to complete, | ||||||
| 1082 | // before restoring the correct host pointer. | ||||||
| 1083 | if (auto *Entry = TPR.getEntry()) { | ||||||
| 1084 | AsyncInfo.addPostProcessingFunction([=]() -> int { | ||||||
| 1085 | int Ret = Entry->foreachShadowPointerInfo( | ||||||
| 1086 | [&](const ShadowPtrInfoTy &ShadowPtr) { | ||||||
| 1087 | *ShadowPtr.HstPtrAddr = ShadowPtr.HstPtrVal; | ||||||
| 1088 | DP("Restoring original host pointer value " DPxMOD{} | ||||||
| 1089 | " for host pointer " DPxMOD "\n",{} | ||||||
| 1090 | DPxPTR(ShadowPtr.HstPtrVal), DPxPTR(ShadowPtr.HstPtrAddr)){}; | ||||||
| 1091 | return OFFLOAD_SUCCESS(0); | ||||||
| 1092 | }); | ||||||
| 1093 | Entry->unlock(); | ||||||
| 1094 | if (Ret != OFFLOAD_SUCCESS(0)) { | ||||||
| 1095 | DP("Updating shadow map failed\n"){}; | ||||||
| 1096 | return Ret; | ||||||
| 1097 | } | ||||||
| 1098 | return OFFLOAD_SUCCESS(0); | ||||||
| 1099 | }); | ||||||
| 1100 | } | ||||||
| 1101 | } | ||||||
| 1102 | |||||||
| 1103 | return OFFLOAD_SUCCESS(0); | ||||||
| 1104 | } | ||||||
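The post-processing function registered just above defers the host-side pointer restore until the queued device-to-host copies have been synchronized, so the fixup cannot race with the in-flight transfer. A compact, self-contained sketch of that deferral pattern; ShadowPtr and MiniAsyncInfo are simplified stand-ins for the real entry and queue types.

#include <cstdio>
#include <functional>
#include <vector>

// A shadow-pointer record: where on the host a pointer lives and what its
// original host value was before it was rewritten for the device.
struct ShadowPtr { void **HstPtrAddr; void *HstPtrVal; };

struct MiniAsyncInfo {
  std::vector<std::function<int()>> PostProcessing;
  void addPostProcessingFunction(std::function<int()> F) {
    PostProcessing.emplace_back(std::move(F));
  }
  int synchronize() { // drain the (here: empty) queue, then run the fixups
    int Result = 0;
    for (auto &F : PostProcessing)
      if ((Result = F()) != 0)
        break;
    PostProcessing.clear();
    return Result;
  }
};

int main() {
  int Object = 42;
  void *OriginalValue = &Object;
  void *Slot = nullptr; // pretend this was overwritten with a device pointer
  ShadowPtr SP{&Slot, OriginalValue};

  MiniAsyncInfo AsyncInfo;
  // Defer the restore: it must not run before the d2h copy has completed.
  AsyncInfo.addPostProcessingFunction([SP]() -> int {
    *SP.HstPtrAddr = SP.HstPtrVal;
    return 0;
  });

  AsyncInfo.synchronize();
  std::printf("restored: %d\n", Slot == OriginalValue);
  return 0;
}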
| 1105 | |||||||
| 1106 | static int targetDataNonContiguous(ident_t *Loc, DeviceTy &Device, | ||||||
| 1107 | void *ArgsBase, | ||||||
| 1108 | __tgt_target_non_contig *NonContig, | ||||||
| 1109 | uint64_t Size, int64_t ArgType, | ||||||
| 1110 | int CurrentDim, int DimSize, uint64_t Offset, | ||||||
| 1111 | AsyncInfoTy &AsyncInfo) { | ||||||
| 1112 | TIMESCOPE_WITH_IDENT(Loc)SourceInfo SI(Loc); llvm::TimeTraceScope TimeScope(__FUNCTION__ , SI.getProfileLocation()); | ||||||
| 1113 | int Ret = OFFLOAD_SUCCESS(0); | ||||||
| 1114 | if (CurrentDim < DimSize) { | ||||||
| 1115 | for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) { | ||||||
| 1116 | uint64_t CurOffset = | ||||||
| 1117 | (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride; | ||||||
| 1118 | // We only need to transfer the first element for the last dimension | ||||||
| 1119 | // since we've already got a contiguous piece. | ||||||
| 1120 | if (CurrentDim != DimSize - 1 || I == 0) { | ||||||
| 1121 | Ret = targetDataNonContiguous(Loc, Device, ArgsBase, NonContig, Size, | ||||||
| 1122 | ArgType, CurrentDim + 1, DimSize, | ||||||
| 1123 | Offset + CurOffset, AsyncInfo); | ||||||
| 1124 | // Stop the whole process if any contiguous piece returns anything | ||||||
| 1125 | // other than OFFLOAD_SUCCESS. | ||||||
| 1126 | if (Ret != OFFLOAD_SUCCESS(0)) | ||||||
| 1127 | return Ret; | ||||||
| 1128 | } | ||||||
| 1129 | } | ||||||
| 1130 | } else { | ||||||
| 1131 | char *Ptr = (char *)ArgsBase + Offset; | ||||||
| 1132 | DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64{} | ||||||
| 1133 | " len %" PRIu64 "\n",{} | ||||||
| 1134 | DPxPTR(Ptr), Offset, Size){}; | ||||||
| 1135 | Ret = targetDataContiguous(Loc, Device, ArgsBase, Ptr, Size, ArgType, | ||||||
| 1136 | AsyncInfo); | ||||||
| 1137 | } | ||||||
| 1138 | return Ret; | ||||||
| 1139 | } | ||||||
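The recursion above peels off one dimension per call, folding (Offset + I) * Stride into the running byte offset, and hands the innermost, already contiguous run to targetDataContiguous. A self-contained sketch that just enumerates those chunk offsets for a hypothetical 2-D section; Dim is a stand-in for __tgt_target_non_contig and the array shape is made up.

#include <cstdint>
#include <cstdio>
#include <vector>

struct Dim { uint64_t Offset, Count, Stride; }; // one record per dimension

// Enumerate the byte offsets of the contiguous pieces the same way the
// recursion walks CurrentDim, treating the last dimension as one run.
static void walk(const std::vector<Dim> &Dims, unsigned CurrentDim,
                 uint64_t Offset, std::vector<uint64_t> &Chunks) {
  if (CurrentDim + 1 == Dims.size()) {
    Chunks.push_back(Offset +
                     Dims[CurrentDim].Offset * Dims[CurrentDim].Stride);
    return;
  }
  for (uint64_t I = 0; I < Dims[CurrentDim].Count; ++I)
    walk(Dims, CurrentDim + 1,
         Offset + (Dims[CurrentDim].Offset + I) * Dims[CurrentDim].Stride,
         Chunks);
}

int main() {
  // Hypothetical section A[1:2][2:3] of a 4x8 array of 8-byte elements: the
  // outer dimension strides a full 64-byte row, the inner one an element.
  std::vector<Dim> Dims = {{1, 2, 64}, {2, 3, 8}};
  std::vector<uint64_t> Chunks;
  walk(Dims, 0, 0, Chunks);
  for (uint64_t C : Chunks) // expect byte offsets 80 and 144, 24 bytes each
    std::printf("chunk at byte offset %llu\n", (unsigned long long)C);
  return 0;
}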
| 1140 | |||||||
| 1141 | static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig, | ||||||
| 1142 | int32_t DimSize) { | ||||||
| 1143 | int RemovedDim = 0; | ||||||
| 1144 | for (int I = DimSize - 1; I > 0; --I) { | ||||||
| 1145 | if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride) | ||||||
| 1146 | RemovedDim++; | ||||||
| 1147 | } | ||||||
| 1148 | return RemovedDim; | ||||||
| 1149 | } | ||||||
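In words: dimension I folds into dimension I-1 exactly when its Count * Stride spans one outer stride, i.e. the inner rows sit back to back in memory. A small worked sketch of the same rule with a made-up 4x8 layout.

#include <cstdint>
#include <cstdio>

struct Dim { uint64_t Offset, Count, Stride; };

// Same merging rule as above, counting how many inner dimensions collapse.
static int mergedDims(const Dim *D, int DimSize) {
  int Removed = 0;
  for (int I = DimSize - 1; I > 0; --I)
    if (D[I].Count * D[I].Stride == D[I - 1].Stride)
      ++Removed;
  return Removed;
}

int main() {
  // Full rows of a 4x8 array of 8-byte elements: 8 * 8 == 64 matches the
  // outer stride, so the section is one contiguous transfer.
  Dim Full[2] = {{0, 4, 64}, {0, 8, 8}};
  // A partial row (3 of 8 columns) does not merge: 3 * 8 != 64.
  Dim Partial[2] = {{0, 4, 64}, {0, 3, 8}};
  std::printf("full: %d merged, partial: %d merged\n", mergedDims(Full, 2),
              mergedDims(Partial, 2));
  return 0;
}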
| 1150 | |||||||
| 1151 | /// Internal function to pass data to/from the target. | ||||||
| 1152 | int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, | ||||||
| 1153 | void **ArgsBase, void **Args, int64_t *ArgSizes, | ||||||
| 1154 | int64_t *ArgTypes, map_var_info_t *ArgNames, | ||||||
| 1155 | void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) { | ||||||
| 1156 | // process each input. | ||||||
| 1157 | for (int32_t I = 0; I < ArgNum; ++I) { | ||||||
| 1158 | if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || | ||||||
| 1159 | (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE)) | ||||||
| 1160 | continue; | ||||||
| 1161 | |||||||
| 1162 | if (ArgMappers && ArgMappers[I]) { | ||||||
| 1163 | // Instead of executing the regular path of targetDataUpdate, call the | ||||||
| 1164 | // targetDataMapper variant which will call targetDataUpdate again | ||||||
| 1165 | // with new arguments. | ||||||
| 1166 | DP("Calling targetDataMapper for the %dth argument\n", I){}; | ||||||
| 1167 | |||||||
| 1168 | map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I]; | ||||||
| 1169 | int Ret = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I], | ||||||
| 1170 | ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo, | ||||||
| 1171 | targetDataUpdate); | ||||||
| 1172 | |||||||
| 1173 | if (Ret != OFFLOAD_SUCCESS(0)) { | ||||||
| 1174 | REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Call to targetDataUpdate via targetDataMapper for custom mapper" " failed.\n"); } while (0); | ||||||
| 1175 | " failed.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Call to targetDataUpdate via targetDataMapper for custom mapper" " failed.\n"); } while (0);; | ||||||
| 1176 | return OFFLOAD_FAIL(~0); | ||||||
| 1177 | } | ||||||
| 1178 | |||||||
| 1179 | // Skip the rest of this iteration, continue to the next argument. | ||||||
| 1180 | continue; | ||||||
| 1181 | } | ||||||
| 1182 | |||||||
| 1183 | int Ret = OFFLOAD_SUCCESS(0); | ||||||
| 1184 | |||||||
| 1185 | if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) { | ||||||
| 1186 | __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I]; | ||||||
| 1187 | int32_t DimSize = ArgSizes[I]; | ||||||
| 1188 | uint64_t Size = | ||||||
| 1189 | NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride; | ||||||
| 1190 | int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize); | ||||||
| 1191 | Ret = targetDataNonContiguous( | ||||||
| 1192 | Loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I], | ||||||
| 1193 | /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo); | ||||||
| 1194 | } else { | ||||||
| 1195 | Ret = targetDataContiguous(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I], | ||||||
| 1196 | ArgTypes[I], AsyncInfo); | ||||||
| 1197 | } | ||||||
| 1198 | if (Ret == OFFLOAD_FAIL(~0)) | ||||||
| 1199 | return OFFLOAD_FAIL(~0); | ||||||
| 1200 | } | ||||||
| 1201 | return OFFLOAD_SUCCESS(0); | ||||||
| 1202 | } | ||||||
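For orientation, a hedged user-level example of the kind of directives that reach this entry point: a contiguous target update motion goes straight through targetDataContiguous, while a strided section is a candidate for the non-contiguous descriptor handled above. The pragmas are standard OpenMP 5.x; whether a particular compiler actually emits the non-contiguous encoding for the second update is an assumption here.

// Build with an offloading-enabled compiler, e.g. clang++ -fopenmp.
#include <cstdio>

int main() {
  double A[4][8] = {};
#pragma omp target enter data map(to: A)

#pragma omp target
  for (int i = 0; i < 4; ++i)
    for (int j = 0; j < 8; ++j)
      A[i][j] = i * 8 + j;

  // Contiguous motion: two whole rows.
#pragma omp target update from(A[0:2][0:8])
  // Strided motion: a 3-column block of every row (OpenMP 5.0 or later).
#pragma omp target update from(A[0:4][2:3])

  std::printf("A[3][4] = %g\n", A[3][4]);
#pragma omp target exit data map(delete: A)
  return 0;
}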
| 1203 | |||||||
| 1204 | static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ | | ||||||
| 1205 | OMP_TGT_MAPTYPE_LITERAL | | ||||||
| 1206 | OMP_TGT_MAPTYPE_IMPLICIT; | ||||||
| 1207 | static bool isLambdaMapping(int64_t Mapping) { | ||||||
| 1208 | return (Mapping & LambdaMapping) == LambdaMapping; | ||||||
| 1209 | } | ||||||
| 1210 | |||||||
| 1211 | namespace { | ||||||
| 1212 | /// Find the table information in the map or look it up in the translation | ||||||
| 1213 | /// tables. | ||||||
| 1214 | TableMap *getTableMap(void *HostPtr) { | ||||||
| 1215 | std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx); | ||||||
| 1216 | HostPtrToTableMapTy::iterator TableMapIt = | ||||||
| 1217 | PM->HostPtrToTableMap.find(HostPtr); | ||||||
| 1218 | |||||||
| 1219 | if (TableMapIt != PM->HostPtrToTableMap.end()) | ||||||
| 1220 | return &TableMapIt->second; | ||||||
| 1221 | |||||||
| 1222 | // We don't have a map. So search all the registered libraries. | ||||||
| 1223 | TableMap *TM = nullptr; | ||||||
| 1224 | std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx); | ||||||
| 1225 | for (HostEntriesBeginToTransTableTy::iterator Itr = | ||||||
| 1226 | PM->HostEntriesBeginToTransTable.begin(); | ||||||
| 1227 | Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) { | ||||||
| 1228 | // get the translation table (which contains all the good info). | ||||||
| 1229 | TranslationTable *TransTable = &Itr->second; | ||||||
| 1230 | // iterate over all the host table entries to see if we can locate the | ||||||
| 1231 | // host_ptr. | ||||||
| 1232 | __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin; | ||||||
| 1233 | for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) { | ||||||
| 1234 | if (Cur->addr != HostPtr) | ||||||
| 1235 | continue; | ||||||
| 1236 | // we got a match, now fill the HostPtrToTableMap so that we | ||||||
| 1237 | // may avoid this search next time. | ||||||
| 1238 | TM = &(PM->HostPtrToTableMap)[HostPtr]; | ||||||
| 1239 | TM->Table = TransTable; | ||||||
| 1240 | TM->Index = I; | ||||||
| 1241 | return TM; | ||||||
| 1242 | } | ||||||
| 1243 | } | ||||||
| 1244 | |||||||
| 1245 | return nullptr; | ||||||
| 1246 | } | ||||||
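getTableMap is a memoized lookup: a mutex-guarded cache consulted first, a linear scan of the registered tables as the fallback, and the hit written back so the scan runs at most once per host pointer. A stripped-down sketch of the same pattern with illustrative types.

#include <cstdio>
#include <map>
#include <mutex>
#include <vector>

struct Entry { const void *Addr; int Index; };

class EntryCache {
  std::mutex Mtx;
  std::map<const void *, int> Cache;
  std::vector<Entry> Table;

public:
  explicit EntryCache(std::vector<Entry> T) : Table(std::move(T)) {}

  const int *lookup(const void *Key) {
    std::lock_guard<std::mutex> L(Mtx);
    auto It = Cache.find(Key);
    if (It != Cache.end())
      return &It->second;               // fast path: already memoized
    for (const Entry &E : Table)
      if (E.Addr == Key)
        return &(Cache[Key] = E.Index); // memoize the scan result
    return nullptr;                     // unknown host pointer
  }
};

int main() {
  int A = 0, B = 0;
  EntryCache C({{&A, 0}, {&B, 1}});
  const int *Idx = C.lookup(&B);
  std::printf("index: %d\n", Idx ? *Idx : -1);
  return 0;
}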
| 1247 | |||||||
| 1248 | /// A class that manages private arguments in a target region. | ||||||
| 1249 | class PrivateArgumentManagerTy { | ||||||
| 1250 | /// A data structure for the information of first-private arguments. We can | ||||||
| 1251 | /// use this information to optimize data transfer by packing all | ||||||
| 1252 | /// first-private arguments and transfer them all at once. | ||||||
| 1253 | struct FirstPrivateArgInfoTy { | ||||||
| 1254 | /// Host pointer begin | ||||||
| 1255 | char *HstPtrBegin; | ||||||
| 1256 | /// Host pointer end | ||||||
| 1257 | char *HstPtrEnd; | ||||||
| 1258 | /// The index of the element in \p TgtArgs corresponding to the argument | ||||||
| 1259 | int Index; | ||||||
| 1260 | /// Alignment of the entry (base of the entry, not after the entry). | ||||||
| 1261 | uint32_t Alignment; | ||||||
| 1262 | /// Size (without alignment, see padding) | ||||||
| 1263 | uint32_t Size; | ||||||
| 1264 | /// Padding used to align this argument entry, if necessary. | ||||||
| 1265 | uint32_t Padding; | ||||||
| 1266 | /// Host pointer name | ||||||
| 1267 | map_var_info_t HstPtrName = nullptr; | ||||||
| 1268 | |||||||
| 1269 | FirstPrivateArgInfoTy(int Index, void *HstPtr, uint32_t Size, | ||||||
| 1270 | uint32_t Alignment, uint32_t Padding, | ||||||
| 1271 | const map_var_info_t HstPtrName = nullptr) | ||||||
| 1272 | : HstPtrBegin(reinterpret_cast<char *>(HstPtr)), | ||||||
| 1273 | HstPtrEnd(HstPtrBegin + Size), Index(Index), Alignment(Alignment), | ||||||
| 1274 | Size(Size), Padding(Padding), HstPtrName(HstPtrName) {} | ||||||
| 1275 | }; | ||||||
| 1276 | |||||||
| 1277 | /// A vector of target pointers for all private arguments | ||||||
| 1278 | SmallVector<void *> TgtPtrs; | ||||||
| 1279 | |||||||
| 1280 | /// A vector of information of all first-private arguments to be packed | ||||||
| 1281 | SmallVector<FirstPrivateArgInfoTy> FirstPrivateArgInfo; | ||||||
| 1282 | /// Host buffer for all arguments to be packed | ||||||
| 1283 | SmallVector<char> FirstPrivateArgBuffer; | ||||||
| 1284 | /// The total size of all arguments to be packed | ||||||
| 1285 | int64_t FirstPrivateArgSize = 0; | ||||||
| 1286 | |||||||
| 1287 | /// A reference to the \p DeviceTy object | ||||||
| 1288 | DeviceTy &Device; | ||||||
| 1289 | /// A pointer to a \p AsyncInfoTy object | ||||||
| 1290 | AsyncInfoTy &AsyncInfo; | ||||||
| 1291 | |||||||
| 1292 | // TODO: What would be the best value here? Should we make it configurable? | ||||||
| 1293 | // If the size is larger than this threshold, we will allocate and transfer it | ||||||
| 1294 | // immediately instead of packing it. | ||||||
| 1295 | static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024; | ||||||
| 1296 | |||||||
| 1297 | public: | ||||||
| 1298 | /// Constructor | ||||||
| 1299 | PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo) | ||||||
| 1300 | : Device(Dev), AsyncInfo(AsyncInfo) {} | ||||||
| 1301 | |||||||
| 1302 | /// Add a private argument | ||||||
| 1303 | int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset, | ||||||
| 1304 | bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex, | ||||||
| 1305 | const map_var_info_t HstPtrName = nullptr, | ||||||
| 1306 | const bool AllocImmediately = false) { | ||||||
| 1307 | // If the argument is not first-private, or its size is greater than a | ||||||
| 1308 | // predefined threshold, we will allocate memory and issue the transfer | ||||||
| 1309 | // immediately. | ||||||
| 1310 | if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate || | ||||||
| 1311 | AllocImmediately) { | ||||||
| 1312 | TgtPtr = Device.allocData(ArgSize, HstPtr); | ||||||
| 1313 | if (!TgtPtr) { | ||||||
| 1314 | DP("Data allocation for %sprivate array " DPxMOD " failed.\n",{} | ||||||
| 1315 | (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr)){}; | ||||||
| 1316 | return OFFLOAD_FAIL(~0); | ||||||
| 1317 | } | ||||||
| 1318 | #ifdef OMPTARGET_DEBUG | ||||||
| 1319 | void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset); | ||||||
| 1320 | DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD{} | ||||||
| 1321 | " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD{} | ||||||
| 1322 | "\n",{} | ||||||
| 1323 | ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),{} | ||||||
| 1324 | DPxPTR(HstPtr), DPxPTR(TgtPtrBase)){}; | ||||||
| 1325 | #endif | ||||||
| 1326 | // If first-private, copy data from host | ||||||
| 1327 | if (IsFirstPrivate) { | ||||||
| 1328 | DP("Submitting firstprivate data to the device.\n"){}; | ||||||
| 1329 | int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo); | ||||||
| 1330 | if (Ret != OFFLOAD_SUCCESS(0)) { | ||||||
| 1331 | DP("Copying data to device failed, failed.\n"){}; | ||||||
| 1332 | return OFFLOAD_FAIL(~0); | ||||||
| 1333 | } | ||||||
| 1334 | } | ||||||
| 1335 | TgtPtrs.push_back(TgtPtr); | ||||||
| 1336 | } else { | ||||||
| 1337 | DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",{} | ||||||
| 1338 | DPxPTR(HstPtr), ArgSize){}; | ||||||
| 1339 | // When we reach this point, the argument must meet all of the following | ||||||
| 1340 | // requirements: | ||||||
| 1341 | // 1. Its size does not exceed the threshold (see the comment for | ||||||
| 1342 | // FirstPrivateArgSizeThreshold); | ||||||
| 1343 | // 2. It must be first-private (needs to be mapped to target device). | ||||||
| 1344 | // We pack all arguments of this kind to transfer them all at once | ||||||
| 1345 | // to reduce the number of data transfers. We do not take | ||||||
| 1346 | // non-first-private arguments, i.e. private arguments that do not need | ||||||
| 1347 | // to be mapped to the target device, into account because their | ||||||
| 1348 | // allocation can be handled very efficiently by the memory manager. | ||||||
| 1349 | |||||||
| 1350 | // Placeholder value | ||||||
| 1351 | TgtPtr = nullptr; | ||||||
| 1352 | auto *LastFPArgInfo = | ||||||
| 1353 | FirstPrivateArgInfo.empty() ? nullptr : &FirstPrivateArgInfo.back(); | ||||||
| 1354 | |||||||
| 1355 | // Compute the start alignment of this entry, add padding if necessary. | ||||||
| 1356 | // TODO: Consider sorting instead. | ||||||
| 1357 | uint32_t Padding = 0; | ||||||
| 1358 | uint32_t StartAlignment = | ||||||
| 1359 | LastFPArgInfo ? LastFPArgInfo->Alignment : MaxAlignment; | ||||||
| 1360 | if (LastFPArgInfo) { | ||||||
| 1361 | // Check if we keep the start alignment or if it is shrunk due to the | ||||||
| 1362 | // size of the last element. | ||||||
| 1363 | uint32_t Offset = LastFPArgInfo->Size % StartAlignment; | ||||||
| 1364 | if (Offset) | ||||||
| 1365 | StartAlignment = Offset; | ||||||
| 1366 | // We only need as much alignment as the host pointer had (since we | ||||||
| 1367 | // don't know the alignment information from the source we might end up | ||||||
| 1368 | // overaligning accesses but not too much). | ||||||
| 1369 | uint32_t RequiredAlignment = | ||||||
| 1370 | llvm::bit_floor(getPartialStructRequiredAlignment(HstPtr)); | ||||||
| 1371 | if (RequiredAlignment > StartAlignment) { | ||||||
| 1372 | Padding = RequiredAlignment - StartAlignment; | ||||||
| 1373 | StartAlignment = RequiredAlignment; | ||||||
| 1374 | } | ||||||
| 1375 | } | ||||||
| 1376 | |||||||
| 1377 | FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize, | ||||||
| 1378 | StartAlignment, Padding, HstPtrName); | ||||||
| 1379 | FirstPrivateArgSize += Padding + ArgSize; | ||||||
| 1380 | } | ||||||
| 1381 | |||||||
| 1382 | return OFFLOAD_SUCCESS(0); | ||||||
| 1383 | } | ||||||
| 1384 | |||||||
| 1385 | /// Pack first-private arguments, replace place holder pointers in \p TgtArgs, | ||||||
| 1386 | /// and start the transfer. | ||||||
| 1387 | int packAndTransfer(SmallVector<void *> &TgtArgs) { | ||||||
| 1388 | if (!FirstPrivateArgInfo.empty()) { | ||||||
| 1389 | assert(FirstPrivateArgSize != 0 &&(static_cast <bool> (FirstPrivateArgSize != 0 && "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty") ? void (0) : __assert_fail ("FirstPrivateArgSize != 0 && \"FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty\"" , "openmp/libomptarget/src/omptarget.cpp", 1390, __extension__ __PRETTY_FUNCTION__)) | ||||||
| 1390 | "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty")(static_cast <bool> (FirstPrivateArgSize != 0 && "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty") ? void (0) : __assert_fail ("FirstPrivateArgSize != 0 && \"FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty\"" , "openmp/libomptarget/src/omptarget.cpp", 1390, __extension__ __PRETTY_FUNCTION__)); | ||||||
| 1391 | FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0); | ||||||
| 1392 | auto Itr = FirstPrivateArgBuffer.begin(); | ||||||
| 1393 | // Copy all host data to this buffer | ||||||
| 1394 | for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) { | ||||||
| 1395 | // First pad the pointer as we (have to) pad it on the device too. | ||||||
| 1396 | Itr = std::next(Itr, Info.Padding); | ||||||
| 1397 | std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr); | ||||||
| 1398 | Itr = std::next(Itr, Info.Size); | ||||||
| 1399 | } | ||||||
| 1400 | // Allocate target memory | ||||||
| 1401 | void *TgtPtr = | ||||||
| 1402 | Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data()); | ||||||
| 1403 | if (TgtPtr == nullptr) { | ||||||
| 1404 | DP("Failed to allocate target memory for private arguments.\n"){}; | ||||||
| 1405 | return OFFLOAD_FAIL(~0); | ||||||
| 1406 | } | ||||||
| 1407 | TgtPtrs.push_back(TgtPtr); | ||||||
| 1408 | DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",{} | ||||||
| 1409 | FirstPrivateArgSize, DPxPTR(TgtPtr)){}; | ||||||
| 1410 | // Transfer data to target device | ||||||
| 1411 | int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(), | ||||||
| 1412 | FirstPrivateArgSize, AsyncInfo); | ||||||
| 1413 | if (Ret != OFFLOAD_SUCCESS(0)) { | ||||||
| 1414 | DP("Failed to submit data of private arguments.\n"){}; | ||||||
| 1415 | return OFFLOAD_FAIL(~0); | ||||||
| 1416 | } | ||||||
| 1417 | // Fill in all placeholder pointers | ||||||
| 1418 | auto TP = reinterpret_cast<uintptr_t>(TgtPtr); | ||||||
| 1419 | for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) { | ||||||
| 1420 | void *&Ptr = TgtArgs[Info.Index]; | ||||||
| 1421 | assert(Ptr == nullptr && "Target pointer is already set by mistake")(static_cast <bool> (Ptr == nullptr && "Target pointer is already set by mistake" ) ? void (0) : __assert_fail ("Ptr == nullptr && \"Target pointer is already set by mistake\"" , "openmp/libomptarget/src/omptarget.cpp", 1421, __extension__ __PRETTY_FUNCTION__)); | ||||||
| 1422 | // Pad the device pointer to get the right alignment. | ||||||
| 1423 | TP += Info.Padding; | ||||||
| 1424 | Ptr = reinterpret_cast<void *>(TP); | ||||||
| 1425 | TP += Info.Size; | ||||||
| 1426 | DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD{} | ||||||
| 1427 | "\n",{} | ||||||
| 1428 | DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,{} | ||||||
| 1429 | DPxPTR(Ptr)){}; | ||||||
| 1430 | } | ||||||
| 1431 | } | ||||||
| 1432 | |||||||
| 1433 | return OFFLOAD_SUCCESS(0); | ||||||
| 1434 | } | ||||||
| 1435 | |||||||
| 1436 | /// Free all target memory allocated for private arguments | ||||||
| 1437 | int free() { | ||||||
| 1438 | for (void *P : TgtPtrs) { | ||||||
| 1439 | int Ret = Device.deleteData(P); | ||||||
| 1440 | if (Ret != OFFLOAD_SUCCESS(0)) { | ||||||
| 1441 | DP("Deallocation of (first-)private arrays failed.\n"){}; | ||||||
| 1442 | return OFFLOAD_FAIL(~0); | ||||||
| 1443 | } | ||||||
| 1444 | } | ||||||
| 1445 | |||||||
| 1446 | TgtPtrs.clear(); | ||||||
| 1447 | |||||||
| 1448 | return OFFLOAD_SUCCESS(0); | ||||||
| 1449 | } | ||||||
| 1450 | }; | ||||||
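packAndTransfer lays the small firstprivate arguments out back to back, inserting padding so each one lands at a suitable alignment, and then recovers the per-argument device pointers by replaying those paddings over the single allocation. The sketch below shows the arithmetic only; it pads against an absolute offset, whereas the manager above derives the padding from the previous entry's alignment and size, so treat it as the idea rather than the exact algorithm. Alignments are assumed to be powers of two, as llvm::bit_floor guarantees in the code above.

#include <cstdint>
#include <cstdio>
#include <vector>

struct Packed { uint32_t Size, Padding; };

int main() {
  struct Arg { uint32_t Size, RequiredAlignment; };
  std::vector<Arg> Args = {{6, 4}, {8, 8}, {3, 1}};

  // First pass: decide how much padding each entry needs in the packed buffer.
  std::vector<Packed> Layout;
  uint64_t Cursor = 0; // byte offset inside the packed host buffer
  for (const Arg &A : Args) {
    uint32_t Padding = static_cast<uint32_t>(
        (A.RequiredAlignment - Cursor % A.RequiredAlignment) %
        A.RequiredAlignment);
    Layout.push_back({A.Size, Padding});
    Cursor += Padding + A.Size;
  }

  // Second pass: pretend the packed buffer landed at device address 0x1000
  // and recover the per-argument pointers, exactly as TgtArgs gets filled.
  uint64_t TP = 0x1000;
  for (const Packed &P : Layout) {
    TP += P.Padding;
    std::printf("arg of %u bytes at 0x%llx (padding %u)\n", P.Size,
                (unsigned long long)TP, P.Padding);
    TP += P.Size;
  }
  return 0;
}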
| 1451 | |||||||
| 1452 | /// Process data before launching the kernel, including calling targetDataBegin | ||||||
| 1453 | /// to map and transfer data to target device, transferring (first-)private | ||||||
| 1454 | /// variables. | ||||||
| 1455 | static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, | ||||||
| 1456 | int32_t ArgNum, void **ArgBases, void **Args, | ||||||
| 1457 | int64_t *ArgSizes, int64_t *ArgTypes, | ||||||
| 1458 | map_var_info_t *ArgNames, void **ArgMappers, | ||||||
| 1459 | SmallVector<void *> &TgtArgs, | ||||||
| 1460 | SmallVector<ptrdiff_t> &TgtOffsets, | ||||||
| 1461 | PrivateArgumentManagerTy &PrivateArgumentManager, | ||||||
| 1462 | AsyncInfoTy &AsyncInfo) { | ||||||
| 1463 | TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", Loc)SourceInfo SI(Loc); llvm::TimeTraceScope TimeScope("mappingBeforeTargetRegion" , SI.getProfileLocation()); | ||||||
| 1464 | DeviceTy &Device = *PM->Devices[DeviceId]; | ||||||
| 1465 | int Ret = targetDataBegin(Loc, Device, ArgNum, ArgBases, Args, ArgSizes, | ||||||
| 1466 | ArgTypes, ArgNames, ArgMappers, AsyncInfo); | ||||||
| 1467 | if (Ret != OFFLOAD_SUCCESS(0)) { | ||||||
| 1468 | REPORT("Call to targetDataBegin failed, abort target.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Call to targetDataBegin failed, abort target.\n"); } while (0);; | ||||||
| 1469 | return OFFLOAD_FAIL(~0); | ||||||
| 1470 | } | ||||||
| 1471 | |||||||
| 1472 | // List of (first-)private arrays allocated for this target region | ||||||
| 1473 | SmallVector<int> TgtArgsPositions(ArgNum, -1); | ||||||
| 1474 | |||||||
| 1475 | for (int32_t I = 0; I < ArgNum; ++I) { | ||||||
| 1476 | if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) { | ||||||
| 1477 | // This is not a target parameter, do not push it into TgtArgs. | ||||||
| 1478 | // Check for lambda mapping. | ||||||
| 1479 | if (isLambdaMapping(ArgTypes[I])) { | ||||||
| 1480 | assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&(static_cast <bool> ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF ) && "PTR_AND_OBJ must be also MEMBER_OF.") ? void (0 ) : __assert_fail ("(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && \"PTR_AND_OBJ must be also MEMBER_OF.\"" , "openmp/libomptarget/src/omptarget.cpp", 1481, __extension__ __PRETTY_FUNCTION__)) | ||||||
| 1481 | "PTR_AND_OBJ must be also MEMBER_OF.")(static_cast <bool> ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF ) && "PTR_AND_OBJ must be also MEMBER_OF.") ? void (0 ) : __assert_fail ("(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && \"PTR_AND_OBJ must be also MEMBER_OF.\"" , "openmp/libomptarget/src/omptarget.cpp", 1481, __extension__ __PRETTY_FUNCTION__)); | ||||||
| 1482 | unsigned Idx = getParentIndex(ArgTypes[I]); | ||||||
| 1483 | int TgtIdx = TgtArgsPositions[Idx]; | ||||||
| 1484 | assert(TgtIdx != -1 && "Base address must be translated already.")(static_cast <bool> (TgtIdx != -1 && "Base address must be translated already." ) ? void (0) : __assert_fail ("TgtIdx != -1 && \"Base address must be translated already.\"" , "openmp/libomptarget/src/omptarget.cpp", 1484, __extension__ __PRETTY_FUNCTION__)); | ||||||
| 1485 | // The parent lambda must already have been processed and must be the last | ||||||
| 1486 | // entry in the TgtArgs and TgtOffsets arrays. | ||||||
| 1487 | void *HstPtrVal = Args[I]; | ||||||
| 1488 | void *HstPtrBegin = ArgBases[I]; | ||||||
| 1489 | void *HstPtrBase = Args[Idx]; | ||||||
| 1490 | void *TgtPtrBase = | ||||||
| 1491 | (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]); | ||||||
| 1492 | DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase)){}; | ||||||
| 1493 | uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; | ||||||
| 1494 | void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta); | ||||||
| 1495 | void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation(); | ||||||
| 1496 | TargetPointerResultTy TPR = Device.getTgtPtrBegin( | ||||||
| 1497 | HstPtrVal, ArgSizes[I], /*UpdateRefCount=*/false, | ||||||
| 1498 | /*UseHoldRefCount=*/false); | ||||||
| 1499 | PointerTgtPtrBegin = TPR.TargetPointer; | ||||||
| 1500 | if (!TPR.isPresent()) { | ||||||
| 1501 | DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",{} | ||||||
| 1502 | DPxPTR(HstPtrVal)){}; | ||||||
| 1503 | continue; | ||||||
| 1504 | } | ||||||
| 1505 | if (TPR.Flags.IsHostPointer) { | ||||||
| 1506 | DP("Unified memory is active, no need to map lambda captured"{} | ||||||
| 1507 | "variable (" DPxMOD ")\n",{} | ||||||
| 1508 | DPxPTR(HstPtrVal)){}; | ||||||
| 1509 | continue; | ||||||
| 1510 | } | ||||||
| 1511 | DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",{} | ||||||
| 1512 | DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)){}; | ||||||
| 1513 | Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin, | ||||||
| 1514 | sizeof(void *), AsyncInfo, TPR.getEntry()); | ||||||
| 1515 | if (Ret != OFFLOAD_SUCCESS(0)) { | ||||||
| 1516 | REPORT("Copying data to device failed.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Copying data to device failed.\n"); } while (0);; | ||||||
| 1517 | return OFFLOAD_FAIL(~0); | ||||||
| 1518 | } | ||||||
| 1519 | } | ||||||
| 1520 | continue; | ||||||
| 1521 | } | ||||||
| 1522 | void *HstPtrBegin = Args[I]; | ||||||
| 1523 | void *HstPtrBase = ArgBases[I]; | ||||||
| 1524 | void *TgtPtrBegin; | ||||||
| 1525 | map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I]; | ||||||
| 1526 | ptrdiff_t TgtBaseOffset; | ||||||
| 1527 | TargetPointerResultTy TPR; | ||||||
| 1528 | if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) { | ||||||
| 1529 | DP("Forwarding first-private value " DPxMOD " to the target construct\n",{} | ||||||
| 1530 | DPxPTR(HstPtrBase)){}; | ||||||
| 1531 | TgtPtrBegin = HstPtrBase; | ||||||
| 1532 | TgtBaseOffset = 0; | ||||||
| 1533 | } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) { | ||||||
| 1534 | TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; | ||||||
| 1535 | const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO); | ||||||
| 1536 | // If there is a next argument and it depends on the current one, we need | ||||||
| 1537 | // to allocate the private memory immediately. If this is not the case, | ||||||
| 1538 | // then the argument can be marked for optimization and packed with the | ||||||
| 1539 | // other privates. | ||||||
| 1540 | const bool AllocImmediately = | ||||||
| 1541 | (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF)); | ||||||
| 1542 | Ret = PrivateArgumentManager.addArg( | ||||||
| 1543 | HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin, | ||||||
| 1544 | TgtArgs.size(), HstPtrName, AllocImmediately); | ||||||
| 1545 | if (Ret != OFFLOAD_SUCCESS(0)) { | ||||||
| 1546 | REPORT("Failed to process %sprivate argument " DPxMOD "\n",do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Failed to process %sprivate argument " "0x%0*" "l" "x" "\n" , (IsFirstPrivate ? "first-" : ""), ((int)(2 * sizeof(uintptr_t ))), ((uintptr_t)(HstPtrBegin))); } while (0); | ||||||
| 1547 | (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin))do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Failed to process %sprivate argument " "0x%0*" "l" "x" "\n" , (IsFirstPrivate ? "first-" : ""), ((int)(2 * sizeof(uintptr_t ))), ((uintptr_t)(HstPtrBegin))); } while (0);; | ||||||
| 1548 | return OFFLOAD_FAIL(~0); | ||||||
| 1549 | } | ||||||
| 1550 | } else { | ||||||
| 1551 | if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) | ||||||
| 1552 | HstPtrBase = *reinterpret_cast<void **>(HstPtrBase); | ||||||
| 1553 | TPR = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], | ||||||
| 1554 | /*UpdateRefCount=*/false, | ||||||
| 1555 | /*UseHoldRefCount=*/false); | ||||||
| 1556 | TgtPtrBegin = TPR.TargetPointer; | ||||||
| 1557 | TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; | ||||||
| 1558 | #ifdef OMPTARGET_DEBUG | ||||||
| 1559 | void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); | ||||||
| 1560 | DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",{} | ||||||
| 1561 | DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)){}; | ||||||
| 1562 | #endif | ||||||
| 1563 | } | ||||||
| 1564 | TgtArgsPositions[I] = TgtArgs.size(); | ||||||
| 1565 | TgtArgs.push_back(TgtPtrBegin); | ||||||
| 1566 | TgtOffsets.push_back(TgtBaseOffset); | ||||||
| 1567 | } | ||||||
| 1568 | |||||||
| 1569 | assert(TgtArgs.size() == TgtOffsets.size() &&(static_cast <bool> (TgtArgs.size() == TgtOffsets.size( ) && "Size mismatch in arguments and offsets") ? void (0) : __assert_fail ("TgtArgs.size() == TgtOffsets.size() && \"Size mismatch in arguments and offsets\"" , "openmp/libomptarget/src/omptarget.cpp", 1570, __extension__ __PRETTY_FUNCTION__)) | ||||||
| 1570 | "Size mismatch in arguments and offsets")(static_cast <bool> (TgtArgs.size() == TgtOffsets.size( ) && "Size mismatch in arguments and offsets") ? void (0) : __assert_fail ("TgtArgs.size() == TgtOffsets.size() && \"Size mismatch in arguments and offsets\"" , "openmp/libomptarget/src/omptarget.cpp", 1570, __extension__ __PRETTY_FUNCTION__)); | ||||||
| 1571 | |||||||
| 1572 | // Pack and transfer first-private arguments | ||||||
| 1573 | Ret = PrivateArgumentManager.packAndTransfer(TgtArgs); | ||||||
| 1574 | if (Ret != OFFLOAD_SUCCESS(0)) { | ||||||
| 1575 | DP("Failed to pack and transfer first private arguments\n"){}; | ||||||
| 1576 | return OFFLOAD_FAIL(~0); | ||||||
| 1577 | } | ||||||
| 1578 | |||||||
| 1579 | return OFFLOAD_SUCCESS(0); | ||||||
| 1580 | } | ||||||
| 1581 | |||||||
| 1582 | /// Process data after launching the kernel, including transferring data back to | ||||||
| 1583 | /// host if needed and deallocating target memory of (first-)private variables. | ||||||
| 1584 | static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr, | ||||||
| 1585 | int32_t ArgNum, void **ArgBases, void **Args, | ||||||
| 1586 | int64_t *ArgSizes, int64_t *ArgTypes, | ||||||
| 1587 | map_var_info_t *ArgNames, void **ArgMappers, | ||||||
| 1588 | PrivateArgumentManagerTy &PrivateArgumentManager, | ||||||
| 1589 | AsyncInfoTy &AsyncInfo) { | ||||||
| 1590 | TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", Loc)SourceInfo SI(Loc); llvm::TimeTraceScope TimeScope("mappingAfterTargetRegion" , SI.getProfileLocation()); | ||||||
| 1591 | DeviceTy &Device = *PM->Devices[DeviceId]; | ||||||
| 1592 | |||||||
| 1593 | // Move data from device. | ||||||
| 1594 | int Ret = targetDataEnd(Loc, Device, ArgNum, ArgBases, Args, ArgSizes, | ||||||
| 1595 | ArgTypes, ArgNames, ArgMappers, AsyncInfo); | ||||||
| 1596 | if (Ret != OFFLOAD_SUCCESS(0)) { | ||||||
| 1597 | REPORT("Call to targetDataEnd failed, abort target.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Call to targetDataEnd failed, abort target.\n"); } while ( 0);; | ||||||
| 1598 | return OFFLOAD_FAIL(~0); | ||||||
| 1599 | } | ||||||
| 1600 | |||||||
| 1601 | // Free target memory for private arguments after synchronization. | ||||||
| 1602 | // TODO: We might want to remove `mutable` in the future by not changing the | ||||||
| 1603 | // captured variables somehow. | ||||||
| 1604 | AsyncInfo.addPostProcessingFunction( | ||||||
| 1605 | [PrivateArgumentManager = | ||||||
| 1606 | std::move(PrivateArgumentManager)]() mutable -> int { | ||||||
| 1607 | int Ret = PrivateArgumentManager.free(); | ||||||
| 1608 | if (Ret != OFFLOAD_SUCCESS(0)) { | ||||||
| 1609 | REPORT("Failed to deallocate target memory for private args\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Failed to deallocate target memory for private args\n"); } while (0);; | ||||||
| 1610 | return OFFLOAD_FAIL(~0); | ||||||
| 1611 | } | ||||||
| 1612 | return Ret; | ||||||
| 1613 | }); | ||||||
| 1614 | |||||||
| 1615 | return OFFLOAD_SUCCESS(0); | ||||||
| 1616 | } | ||||||
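As in targetDataEnd, the private-argument manager is moved into the post-processing function so its device allocations are released only after the queue has been synchronized. A minimal sketch of that move-capture-and-free-later pattern; ResourceHolder is an illustrative stand-in for the real manager.

#include <cstdio>
#include <functional>
#include <utility>
#include <vector>

struct ResourceHolder {
  std::vector<int> Handles; // pretend these are device allocations
  int freeAll() {
    std::printf("freeing %zu handles\n", Handles.size());
    Handles.clear();
    return 0;
  }
};

int main() {
  std::vector<std::function<int()>> PostProcessing;
  {
    ResourceHolder Holder{{1, 2, 3}};
    // The callback owns its own copy of the holder ('mutable' lets it modify
    // that copy); the original goes out of scope immediately afterwards.
    PostProcessing.emplace_back(
        [Holder = std::move(Holder)]() mutable -> int {
          return Holder.freeAll();
        });
  }

  // ... kernel launch and data movement would be synchronized here ...
  for (auto &F : PostProcessing)
    F();
  return 0;
}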
| 1617 | } // namespace | ||||||
| 1618 | |||||||
| 1619 | /// Performs the same actions as data_begin in case arg_num is | ||||||
| 1620 | /// non-zero and initiates a run of the offloaded region on the target platform; | ||||||
| 1621 | /// if arg_num is non-zero, after the region execution is done it also | ||||||
| 1622 | /// performs the same actions as data_update and data_end above. This function | ||||||
| 1623 | /// returns 0 if it was able to transfer the execution to a target and an | ||||||
| 1624 | /// integer different from zero otherwise. | ||||||
| 1625 | int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, | ||||||
| 1626 | KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo) { | ||||||
| 1627 | int32_t DeviceId = Device.DeviceID; | ||||||
| 1628 | TableMap *TM = getTableMap(HostPtr); | ||||||
| 1629 | // No map for this host pointer found! | ||||||
| 1630 | if (!TM) { | ||||||
| 1631 | REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Host ptr " "0x%0*" "l" "x" " does not have a matching target pointer.\n" , ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(HostPtr))); } while (0); | ||||||
| 1632 | DPxPTR(HostPtr))do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Host ptr " "0x%0*" "l" "x" " does not have a matching target pointer.\n" , ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(HostPtr))); } while (0);; | ||||||
| 1633 | return OFFLOAD_FAIL(~0); | ||||||
| 1634 | } | ||||||
| 1635 | |||||||
| 1636 | // get target table. | ||||||
| 1637 | __tgt_target_table *TargetTable = nullptr; | ||||||
| 1638 | { | ||||||
| 1639 | std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx); | ||||||
| 1640 | assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&(static_cast <bool> (TM->Table->TargetsTable.size () > (size_t)DeviceId && "Not expecting a device ID outside the table's bounds!" ) ? void (0) : __assert_fail ("TM->Table->TargetsTable.size() > (size_t)DeviceId && \"Not expecting a device ID outside the table's bounds!\"" , "openmp/libomptarget/src/omptarget.cpp", 1641, __extension__ __PRETTY_FUNCTION__)) | ||||||
| 1641 | "Not expecting a device ID outside the table's bounds!")(static_cast <bool> (TM->Table->TargetsTable.size () > (size_t)DeviceId && "Not expecting a device ID outside the table's bounds!" ) ? void (0) : __assert_fail ("TM->Table->TargetsTable.size() > (size_t)DeviceId && \"Not expecting a device ID outside the table's bounds!\"" , "openmp/libomptarget/src/omptarget.cpp", 1641, __extension__ __PRETTY_FUNCTION__)); | ||||||
| 1642 | TargetTable = TM->Table->TargetsTable[DeviceId]; | ||||||
| 1643 | } | ||||||
| 1644 | assert(TargetTable && "Global data has not been mapped\n")(static_cast <bool> (TargetTable && "Global data has not been mapped\n" ) ? void (0) : __assert_fail ("TargetTable && \"Global data has not been mapped\\n\"" , "openmp/libomptarget/src/omptarget.cpp", 1644, __extension__ __PRETTY_FUNCTION__)); | ||||||
| 1645 | |||||||
| 1646 | DP("loop trip count is %" PRIu64 ".\n", KernelArgs.Tripcount){}; | ||||||
| 1647 | |||||||
| 1648 | // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we | ||||||
| 1649 | // need to manifest base pointers prior to launching a kernel. Even if we have | ||||||
| 1650 | // mapped an object only partially, e.g. A[N:M], although the kernel is | ||||||
| 1651 | // expected to access elements starting at address &A[N] and beyond, we still | ||||||
| 1652 | // need to manifest the base of the array &A[0]. In other cases, e.g. the COI | ||||||
| 1653 | // API, we need the begin address itself, i.e. &A[N], as the API operates on | ||||||
| 1654 | // begin addresses, not bases. That's why we pass args and offsets as two | ||||||
| 1655 | // separate entities so that each plugin can do what it needs. This behavior | ||||||
| 1656 | // was introduced via https://reviews.llvm.org/D33028 and commit 1546d319244c. | ||||||
| 1657 | SmallVector<void *> TgtArgs; | ||||||
| 1658 | SmallVector<ptrdiff_t> TgtOffsets; | ||||||
| 1659 | |||||||
| 1660 | PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo); | ||||||
| 1661 | |||||||
| 1662 | int NumClangLaunchArgs = KernelArgs.NumArgs; | ||||||
| 1663 | int Ret = OFFLOAD_SUCCESS(0); | ||||||
| 1664 | if (NumClangLaunchArgs) { | ||||||
| 1665 | // Process data, such as data mapping, before launching the kernel | ||||||
| 1666 | Ret = processDataBefore(Loc, DeviceId, HostPtr, NumClangLaunchArgs, | ||||||
| 1667 | KernelArgs.ArgBasePtrs, KernelArgs.ArgPtrs, | ||||||
| 1668 | KernelArgs.ArgSizes, KernelArgs.ArgTypes, | ||||||
| 1669 | KernelArgs.ArgNames, KernelArgs.ArgMappers, TgtArgs, | ||||||
| 1670 | TgtOffsets, PrivateArgumentManager, AsyncInfo); | ||||||
| 1671 | if (Ret != OFFLOAD_SUCCESS(0)) { | ||||||
| 1672 | REPORT("Failed to process data before launching the kernel.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Failed to process data before launching the kernel.\n"); } while (0);; | ||||||
| 1673 | return OFFLOAD_FAIL(~0); | ||||||
| 1674 | } | ||||||
| 1675 | |||||||
| 1676 | // Clang might pass more values via the ArgPtrs to the runtime than we pass | ||||||
| 1677 | // on to the kernel. | ||||||
| 1678 | // TODO: Next time we adjust the KernelArgsTy we should introduce a new | ||||||
| 1679 | // NumKernelArgs field. | ||||||
| 1680 | KernelArgs.NumArgs = TgtArgs.size(); | ||||||
| 1681 | } | ||||||
| 1682 | |||||||
| 1683 | // Launch device execution. | ||||||
| 1684 | void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr; | ||||||
| 1685 | DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",{} | ||||||
| 1686 | TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index){}; | ||||||
| 1687 | |||||||
| 1688 | { | ||||||
| 1689 | assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!")(static_cast <bool> (KernelArgs.NumArgs == TgtArgs.size () && "Argument count mismatch!") ? void (0) : __assert_fail ("KernelArgs.NumArgs == TgtArgs.size() && \"Argument count mismatch!\"" , "openmp/libomptarget/src/omptarget.cpp", 1689, __extension__ __PRETTY_FUNCTION__)); | ||||||
| 1690 | TIMESCOPE_WITH_NAME_AND_IDENT("Initiate Kernel Launch", Loc)SourceInfo SI(Loc); llvm::TimeTraceScope TimeScope("Initiate Kernel Launch" , SI.getProfileLocation()); | ||||||
| 1691 | Ret = Device.launchKernel(TgtEntryPtr, TgtArgs.data(), TgtOffsets.data(), | ||||||
| 1692 | KernelArgs, AsyncInfo); | ||||||
| 1693 | } | ||||||
| 1694 | |||||||
| 1695 | if (Ret != OFFLOAD_SUCCESS(0)) { | ||||||
| 1696 | REPORT("Executing target region abort target.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Executing target region abort target.\n"); } while (0);; | ||||||
| 1697 | return OFFLOAD_FAIL(~0); | ||||||
| 1698 | } | ||||||
| 1699 | |||||||
| 1700 | if (NumClangLaunchArgs) { | ||||||
| 1701 | // Transfer data back and deallocate target memory for (first-)private | ||||||
| 1702 | // variables | ||||||
| 1703 | Ret = processDataAfter(Loc, DeviceId, HostPtr, NumClangLaunchArgs, | ||||||
| 1704 | KernelArgs.ArgBasePtrs, KernelArgs.ArgPtrs, | ||||||
| 1705 | KernelArgs.ArgSizes, KernelArgs.ArgTypes, | ||||||
| 1706 | KernelArgs.ArgNames, KernelArgs.ArgMappers, | ||||||
| 1707 | PrivateArgumentManager, AsyncInfo); | ||||||
| 1708 | if (Ret != OFFLOAD_SUCCESS(0)) { | ||||||
| 1709 | REPORT("Failed to process data after launching the kernel.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Failed to process data after launching the kernel.\n"); } while (0);; | ||||||
| 1710 | return OFFLOAD_FAIL(~0); | ||||||
| 1711 | } | ||||||
| 1712 | } | ||||||
| 1713 | |||||||
| 1714 | return OFFLOAD_SUCCESS(0); | ||||||
| 1715 | } | ||||||
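The bases-versus-begins comment inside this function reduces to simple pointer arithmetic: a begin-oriented plugin uses TgtArgs[i] directly, a base-oriented one reconstructs the base as TgtArgs[i] + TgtOffsets[i]. A tiny sketch with made-up addresses for a partially mapped section A[2:4].

#include <cstdint>
#include <cstdio>

int main() {
  // Host array A[8] of doubles; only A[2:4] was mapped, so the translated
  // begin points at the device copy of &A[2]. All addresses are illustrative.
  uintptr_t HstBase = 0x7000;                          // &A[0]
  uintptr_t HstBegin = HstBase + 2 * sizeof(double);   // &A[2]
  uintptr_t TgtBegin = 0xA010;                         // device copy of &A[2]

  // What gets recorded for the launch:
  uintptr_t TgtArg = TgtBegin;                                 // begin address
  intptr_t TgtOffset = (intptr_t)HstBase - (intptr_t)HstBegin; // -16

  // Rebuilding the (possibly unmapped) device base &A[0] when a plugin needs it:
  uintptr_t TgtBase = TgtArg + TgtOffset; // 0xA000
  std::printf("begin 0x%llx, base 0x%llx\n", (unsigned long long)TgtArg,
              (unsigned long long)TgtBase);
  return 0;
}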
| 1716 | |||||||
| 1717 | /// Executes a kernel using pre-recorded information for loading to | ||||||
| 1718 | /// device memory to launch the target kernel with the pre-recorded | ||||||
| 1719 | /// configuration. | ||||||
| 1720 | int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, | ||||||
| 1721 | void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs, | ||||||
| 1722 | ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams, | ||||||
| 1723 | int32_t ThreadLimit, uint64_t LoopTripCount, | ||||||
| 1724 | AsyncInfoTy &AsyncInfo) { | ||||||
| 1725 | int32_t DeviceId = Device.DeviceID; | ||||||
| 1726 | TableMap *TM = getTableMap(HostPtr); | ||||||
| 1727 | // Fail if the table map fails to find the target kernel pointer for the | ||||||
| 1728 | // provided host pointer. | ||||||
| 1729 | if (!TM) { | ||||||
| 1730 | REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Host ptr " "0x%0*" "l" "x" " does not have a matching target pointer.\n" , ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(HostPtr))); } while (0); | ||||||
| 1731 | DPxPTR(HostPtr))do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Host ptr " "0x%0*" "l" "x" " does not have a matching target pointer.\n" , ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(HostPtr))); } while (0);; | ||||||
| 1732 | return OFFLOAD_FAIL(~0); | ||||||
| 1733 | } | ||||||
| 1734 | |||||||
| 1735 | // Retrieve the target table of offloading entries. | ||||||
| 1736 | __tgt_target_table *TargetTable = nullptr; | ||||||
| 1737 | { | ||||||
| 1738 | std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx); | ||||||
| 1739 | assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&(static_cast <bool> (TM->Table->TargetsTable.size () > (size_t)DeviceId && "Not expecting a device ID outside the table's bounds!" ) ? void (0) : __assert_fail ("TM->Table->TargetsTable.size() > (size_t)DeviceId && \"Not expecting a device ID outside the table's bounds!\"" , "openmp/libomptarget/src/omptarget.cpp", 1740, __extension__ __PRETTY_FUNCTION__)) | ||||||
| 1740 | "Not expecting a device ID outside the table's bounds!")(static_cast <bool> (TM->Table->TargetsTable.size () > (size_t)DeviceId && "Not expecting a device ID outside the table's bounds!" ) ? void (0) : __assert_fail ("TM->Table->TargetsTable.size() > (size_t)DeviceId && \"Not expecting a device ID outside the table's bounds!\"" , "openmp/libomptarget/src/omptarget.cpp", 1740, __extension__ __PRETTY_FUNCTION__)); | ||||||
| 1741 | TargetTable = TM->Table->TargetsTable[DeviceId]; | ||||||
| 1742 | } | ||||||
| 1743 | assert(TargetTable && "Global data has not been mapped\n")(static_cast <bool> (TargetTable && "Global data has not been mapped\n" ) ? void (0) : __assert_fail ("TargetTable && \"Global data has not been mapped\\n\"" , "openmp/libomptarget/src/omptarget.cpp", 1743, __extension__ __PRETTY_FUNCTION__)); | ||||||
| 1744 | |||||||
| 1745 | // Retrieve the target kernel pointer, allocate and store the recorded device | ||||||
| 1746 | // memory data, and launch device execution. | ||||||
| 1747 | void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr; | ||||||
| 1748 | DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",{} | ||||||
| 1749 | TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index){}; | ||||||
| 1750 | |||||||
| 1751 | void *TgtPtr = Device.allocData(DeviceMemorySize, /* HstPtr */ nullptr, | ||||||
| 1752 | TARGET_ALLOC_DEFAULT); | ||||||
| 1753 | Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo); | ||||||
| 1754 | |||||||
| 1755 | KernelArgsTy KernelArgs = {0}; | ||||||
| 1756 | KernelArgs.Version = 2; | ||||||
| 1757 | KernelArgs.NumArgs = NumArgs; | ||||||
| 1758 | KernelArgs.Tripcount = LoopTripCount; | ||||||
| 1759 | KernelArgs.NumTeams[0] = NumTeams; | ||||||
| 1760 | KernelArgs.ThreadLimit[0] = ThreadLimit; | ||||||
| 1761 | |||||||
| 1762 | int Ret = Device.launchKernel(TgtEntryPtr, TgtArgs, TgtOffsets, KernelArgs, | ||||||
| 1763 | AsyncInfo); | ||||||
| 1764 | |||||||
| 1765 | if (Ret != OFFLOAD_SUCCESS(0)) { | ||||||
| 1766 | REPORT("Executing target region abort target.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Executing target region abort target.\n"); } while (0);; | ||||||
| 1767 | return OFFLOAD_FAIL(~0); | ||||||
| 1768 | } | ||||||
| 1769 | |||||||
| 1770 | return OFFLOAD_SUCCESS(0); | ||||||
| 1771 | } |