File: | build/source/openmp/libomptarget/src/omptarget.cpp |
Warning: | line 635, column 7 Value stored to 'IsHostPtr' is never read |
Press '?' to see keyboard shortcuts
Keyboard shortcuts:
1 | //===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===// |
2 | // |
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | // See https://llvm.org/LICENSE.txt for license information. |
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | // |
7 | //===----------------------------------------------------------------------===// |
8 | // |
9 | // Implementation of the interface to be used by Clang during the codegen of a |
10 | // target region. |
11 | // |
12 | //===----------------------------------------------------------------------===// |
13 | |
14 | #include "omptarget.h" |
15 | #include "device.h" |
16 | #include "private.h" |
17 | #include "rtl.h" |
18 | |
19 | #include <cassert> |
20 | #include <cstdint> |
21 | #include <vector> |
22 | |
23 | using llvm::SmallVector; |
24 | |
25 | int AsyncInfoTy::synchronize() { |
26 | int Result = OFFLOAD_SUCCESS(0); |
27 | if (!isQueueEmpty()) { |
28 | switch (SyncType) { |
29 | case SyncTy::BLOCKING: |
30 | // If we have a queue we need to synchronize it now. |
31 | Result = Device.synchronize(*this); |
32 | assert(AsyncInfo.Queue == nullptr &&(static_cast <bool> (AsyncInfo.Queue == nullptr && "The device plugin should have nulled the queue to indicate there " "are no outstanding actions!") ? void (0) : __assert_fail ("AsyncInfo.Queue == nullptr && \"The device plugin should have nulled the queue to indicate there \" \"are no outstanding actions!\"" , "openmp/libomptarget/src/omptarget.cpp", 34, __extension__ __PRETTY_FUNCTION__ )) |
33 | "The device plugin should have nulled the queue to indicate there "(static_cast <bool> (AsyncInfo.Queue == nullptr && "The device plugin should have nulled the queue to indicate there " "are no outstanding actions!") ? void (0) : __assert_fail ("AsyncInfo.Queue == nullptr && \"The device plugin should have nulled the queue to indicate there \" \"are no outstanding actions!\"" , "openmp/libomptarget/src/omptarget.cpp", 34, __extension__ __PRETTY_FUNCTION__ )) |
34 | "are no outstanding actions!")(static_cast <bool> (AsyncInfo.Queue == nullptr && "The device plugin should have nulled the queue to indicate there " "are no outstanding actions!") ? void (0) : __assert_fail ("AsyncInfo.Queue == nullptr && \"The device plugin should have nulled the queue to indicate there \" \"are no outstanding actions!\"" , "openmp/libomptarget/src/omptarget.cpp", 34, __extension__ __PRETTY_FUNCTION__ )); |
35 | break; |
36 | case SyncTy::NON_BLOCKING: |
37 | Result = Device.queryAsync(*this); |
38 | break; |
39 | } |
40 | } |
41 | |
42 | // Run any pending post-processing function registered on this async object. |
43 | if (Result == OFFLOAD_SUCCESS(0) && isQueueEmpty()) |
44 | Result = runPostProcessing(); |
45 | |
46 | return Result; |
47 | } |
48 | |
49 | void *&AsyncInfoTy::getVoidPtrLocation() { |
50 | BufferLocations.push_back(nullptr); |
51 | return BufferLocations.back(); |
52 | } |
53 | |
54 | bool AsyncInfoTy::isDone() { |
55 | synchronize(); |
56 | // The async info operations are completed when the internal queue is empty. |
57 | return isQueueEmpty(); |
58 | } |
59 | |
60 | int32_t AsyncInfoTy::runPostProcessing() { |
61 | size_t Size = PostProcessingFunctions.size(); |
62 | for (size_t I = 0; I < Size; ++I) { |
63 | const int Result = PostProcessingFunctions[I](); |
64 | if (Result != OFFLOAD_SUCCESS(0)) |
65 | return Result; |
66 | } |
67 | |
68 | // Clear the vector up until the last known function, since post-processing |
69 | // procedures might add new procedures themselves. |
70 | const auto PrevBegin = PostProcessingFunctions.begin(); |
71 | PostProcessingFunctions.erase(PrevBegin, PrevBegin + Size); |
72 | |
73 | return OFFLOAD_SUCCESS(0); |
74 | } |
75 | |
76 | bool AsyncInfoTy::isQueueEmpty() const { return AsyncInfo.Queue == nullptr; } |
77 | |
78 | /* All begin addresses for partially mapped structs must be 8-aligned in order |
79 | * to ensure proper alignment of members. E.g. |
80 | * |
81 | * struct S { |
82 | * int a; // 4-aligned |
83 | * int b; // 4-aligned |
84 | * int *p; // 8-aligned |
85 | * } s1; |
86 | * ... |
87 | * #pragma omp target map(tofrom: s1.b, s1.p[0:N]) |
88 | * { |
89 | * s1.b = 5; |
90 | * for (int i...) s1.p[i] = ...; |
91 | * } |
92 | * |
93 | * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and |
94 | * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100, |
95 | * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment |
96 | * requirements for its type. Now, when we allocate memory on the device, in |
97 | * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned. |
98 | * This means that the chunk of the struct on the device will start at a |
99 | * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and |
100 | * address of p will be a misaligned 0x204 (on the host there was no need to add |
101 | * padding between b and p, so p comes exactly 4 bytes after b). If the device |
102 | * kernel tries to access s1.p, a misaligned address error occurs (as reported |
103 | * by the CUDA plugin). By padding the begin address down to a multiple of 8 and |
104 | * extending the size of the allocated chunk accordingly, the chunk on the |
105 | * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and |
106 | * &s1.p=0x208, as they should be to satisfy the alignment requirements. |
107 | */ |
// Minimum alignment (bytes) forced onto the begin address of partially mapped
// structs; rationale is in the explanatory comment above.
static const int64_t Alignment = 8;
109 | |
110 | /// Map global data and execute pending ctors |
111 | static int initLibrary(DeviceTy &Device) { |
112 | /* |
113 | * Map global data |
114 | */ |
115 | int32_t DeviceId = Device.DeviceID; |
116 | int Rc = OFFLOAD_SUCCESS(0); |
117 | bool SupportsEmptyImages = Device.RTL->supports_empty_images && |
118 | Device.RTL->supports_empty_images() > 0; |
119 | { |
120 | std::lock_guard<decltype(PM->TrlTblMtx)> LG(PM->TrlTblMtx); |
121 | for (auto *HostEntriesBegin : PM->HostEntriesBeginRegistrationOrder) { |
122 | TranslationTable *TransTable = |
123 | &PM->HostEntriesBeginToTransTable[HostEntriesBegin]; |
124 | if (TransTable->HostTable.EntriesBegin == |
125 | TransTable->HostTable.EntriesEnd && |
126 | !SupportsEmptyImages) { |
127 | // No host entry so no need to proceed |
128 | continue; |
129 | } |
130 | |
131 | if (TransTable->TargetsTable[DeviceId] != 0) { |
132 | // Library entries have already been processed |
133 | continue; |
134 | } |
135 | |
136 | // 1) get image. |
137 | assert(TransTable->TargetsImages.size() > (size_t)DeviceId &&(static_cast <bool> (TransTable->TargetsImages.size( ) > (size_t)DeviceId && "Not expecting a device ID outside the table's bounds!" ) ? void (0) : __assert_fail ("TransTable->TargetsImages.size() > (size_t)DeviceId && \"Not expecting a device ID outside the table's bounds!\"" , "openmp/libomptarget/src/omptarget.cpp", 138, __extension__ __PRETTY_FUNCTION__)) |
138 | "Not expecting a device ID outside the table's bounds!")(static_cast <bool> (TransTable->TargetsImages.size( ) > (size_t)DeviceId && "Not expecting a device ID outside the table's bounds!" ) ? void (0) : __assert_fail ("TransTable->TargetsImages.size() > (size_t)DeviceId && \"Not expecting a device ID outside the table's bounds!\"" , "openmp/libomptarget/src/omptarget.cpp", 138, __extension__ __PRETTY_FUNCTION__)); |
139 | __tgt_device_image *Img = TransTable->TargetsImages[DeviceId]; |
140 | if (!Img) { |
141 | REPORT("No image loaded for device id %d.\n", DeviceId)do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "No image loaded for device id %d.\n", DeviceId); } while ( 0);; |
142 | Rc = OFFLOAD_FAIL(~0); |
143 | break; |
144 | } |
145 | // 2) load image into the target table. |
146 | __tgt_target_table *TargetTable = TransTable->TargetsTable[DeviceId] = |
147 | Device.loadBinary(Img); |
148 | // Unable to get table for this image: invalidate image and fail. |
149 | if (!TargetTable) { |
150 | REPORT("Unable to generate entries table for device id %d.\n",do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Unable to generate entries table for device id %d.\n", DeviceId ); } while (0); |
151 | DeviceId)do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Unable to generate entries table for device id %d.\n", DeviceId ); } while (0);; |
152 | TransTable->TargetsImages[DeviceId] = 0; |
153 | Rc = OFFLOAD_FAIL(~0); |
154 | break; |
155 | } |
156 | |
157 | // Verify whether the two table sizes match. |
158 | size_t Hsize = |
159 | TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin; |
160 | size_t Tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin; |
161 | |
162 | // Invalid image for these host entries! |
163 | if (Hsize != Tsize) { |
164 | REPORT(do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Host and Target tables mismatch for device id %d [%zx != %zx].\n" , DeviceId, Hsize, Tsize); } while (0); |
165 | "Host and Target tables mismatch for device id %d [%zx != %zx].\n",do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Host and Target tables mismatch for device id %d [%zx != %zx].\n" , DeviceId, Hsize, Tsize); } while (0); |
166 | DeviceId, Hsize, Tsize)do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Host and Target tables mismatch for device id %d [%zx != %zx].\n" , DeviceId, Hsize, Tsize); } while (0);; |
167 | TransTable->TargetsImages[DeviceId] = 0; |
168 | TransTable->TargetsTable[DeviceId] = 0; |
169 | Rc = OFFLOAD_FAIL(~0); |
170 | break; |
171 | } |
172 | |
173 | DeviceTy::HDTTMapAccessorTy HDTTMap = |
174 | Device.HostDataToTargetMap.getExclusiveAccessor(); |
175 | |
176 | __tgt_target_table *HostTable = &TransTable->HostTable; |
177 | for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin, |
178 | *CurrHostEntry = HostTable->EntriesBegin, |
179 | *EntryDeviceEnd = TargetTable->EntriesEnd; |
180 | CurrDeviceEntry != EntryDeviceEnd; |
181 | CurrDeviceEntry++, CurrHostEntry++) { |
182 | if (CurrDeviceEntry->size != 0) { |
183 | // has data. |
184 | assert(CurrDeviceEntry->size == CurrHostEntry->size &&(static_cast <bool> (CurrDeviceEntry->size == CurrHostEntry ->size && "data size mismatch") ? void (0) : __assert_fail ("CurrDeviceEntry->size == CurrHostEntry->size && \"data size mismatch\"" , "openmp/libomptarget/src/omptarget.cpp", 185, __extension__ __PRETTY_FUNCTION__)) |
185 | "data size mismatch")(static_cast <bool> (CurrDeviceEntry->size == CurrHostEntry ->size && "data size mismatch") ? void (0) : __assert_fail ("CurrDeviceEntry->size == CurrHostEntry->size && \"data size mismatch\"" , "openmp/libomptarget/src/omptarget.cpp", 185, __extension__ __PRETTY_FUNCTION__)); |
186 | |
187 | // Fortran may use multiple weak declarations for the same symbol, |
188 | // therefore we must allow for multiple weak symbols to be loaded from |
189 | // the fat binary. Treat these mappings as any other "regular" |
190 | // mapping. Add entry to map. |
191 | if (Device.getTgtPtrBegin(HDTTMap, CurrHostEntry->addr, |
192 | CurrHostEntry->size)) |
193 | continue; |
194 | |
195 | DP("Add mapping from host " DPxMOD " to device " DPxMOD{} |
196 | " with size %zu"{} |
197 | "\n",{} |
198 | DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),{} |
199 | CurrDeviceEntry->size){}; |
200 | HDTTMap->emplace(new HostDataToTargetTy( |
201 | (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/, |
202 | (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/, |
203 | (uintptr_t)CurrHostEntry->addr + |
204 | CurrHostEntry->size /*HstPtrEnd*/, |
205 | (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/, |
206 | false /*UseHoldRefCount*/, CurrHostEntry->name, |
207 | true /*IsRefCountINF*/)); |
208 | } |
209 | } |
210 | } |
211 | } |
212 | |
213 | if (Rc != OFFLOAD_SUCCESS(0)) { |
214 | return Rc; |
215 | } |
216 | |
217 | /* |
218 | * Run ctors for static objects |
219 | */ |
220 | if (!Device.PendingCtorsDtors.empty()) { |
221 | AsyncInfoTy AsyncInfo(Device); |
222 | // Call all ctors for all libraries registered so far |
223 | for (auto &Lib : Device.PendingCtorsDtors) { |
224 | if (!Lib.second.PendingCtors.empty()) { |
225 | DP("Has pending ctors... call now\n"){}; |
226 | for (auto &Entry : Lib.second.PendingCtors) { |
227 | void *Ctor = Entry; |
228 | int Rc = target(nullptr, Device, Ctor, CTorDTorKernelArgs, AsyncInfo); |
229 | if (Rc != OFFLOAD_SUCCESS(0)) { |
230 | REPORT("Running ctor " DPxMOD " failed.\n", DPxPTR(Ctor))do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Running ctor " "0x%0*" "l" "x" " failed.\n", ((int)(2 * sizeof (uintptr_t))), ((uintptr_t)(Ctor))); } while (0);; |
231 | return OFFLOAD_FAIL(~0); |
232 | } |
233 | } |
234 | // Clear the list to indicate that this device has been used |
235 | Lib.second.PendingCtors.clear(); |
236 | DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(Lib.first)){}; |
237 | } |
238 | } |
239 | // All constructors have been issued, wait for them now. |
240 | if (AsyncInfo.synchronize() != OFFLOAD_SUCCESS(0)) |
241 | return OFFLOAD_FAIL(~0); |
242 | } |
243 | Device.HasPendingGlobals = false; |
244 | |
245 | return OFFLOAD_SUCCESS(0); |
246 | } |
247 | |
248 | void handleTargetOutcome(bool Success, ident_t *Loc) { |
249 | switch (PM->TargetOffloadPolicy) { |
250 | case tgt_disabled: |
251 | if (Success) { |
252 | FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled")do { fprintf(stderr, "Libomptarget" " fatal error %d: %s\n", 1 , "expected no offloading while offloading is disabled"); abort (); } while (0); |
253 | } |
254 | break; |
255 | case tgt_default: |
256 | FATAL_MESSAGE0(1, "default offloading policy must be switched to "do { fprintf(stderr, "Libomptarget" " fatal error %d: %s\n", 1 , "default offloading policy must be switched to " "mandatory or disabled" ); abort(); } while (0) |
257 | "mandatory or disabled")do { fprintf(stderr, "Libomptarget" " fatal error %d: %s\n", 1 , "default offloading policy must be switched to " "mandatory or disabled" ); abort(); } while (0); |
258 | break; |
259 | case tgt_mandatory: |
260 | if (!Success) { |
261 | if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) |
262 | for (auto &Device : PM->Devices) |
263 | dumpTargetPointerMappings(Loc, *Device); |
264 | else |
265 | FAILURE_MESSAGE("Consult https://openmp.llvm.org/design/Runtimes.html "do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Consult https://openmp.llvm.org/design/Runtimes.html " "for debugging options.\n" ); } while (0) |
266 | "for debugging options.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Consult https://openmp.llvm.org/design/Runtimes.html " "for debugging options.\n" ); } while (0); |
267 | |
268 | SourceInfo Info(Loc); |
269 | if (Info.isAvailible()) |
270 | fprintf(stderrstderr, "%s:%d:%d: ", Info.getFilename(), Info.getLine(), |
271 | Info.getColumn()); |
272 | else |
273 | FAILURE_MESSAGE("Source location information not present. Compile with "do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Source location information not present. Compile with " "-g or -gline-tables-only.\n" ); } while (0) |
274 | "-g or -gline-tables-only.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Source location information not present. Compile with " "-g or -gline-tables-only.\n" ); } while (0); |
275 | FATAL_MESSAGE0(do { fprintf(stderr, "Libomptarget" " fatal error %d: %s\n", 1 , "failure of target construct while offloading is mandatory" ); abort(); } while (0) |
276 | 1, "failure of target construct while offloading is mandatory")do { fprintf(stderr, "Libomptarget" " fatal error %d: %s\n", 1 , "failure of target construct while offloading is mandatory" ); abort(); } while (0); |
277 | } else { |
278 | if (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE) |
279 | for (auto &Device : PM->Devices) |
280 | dumpTargetPointerMappings(Loc, *Device); |
281 | } |
282 | break; |
283 | } |
284 | } |
285 | |
286 | static void handleDefaultTargetOffload() { |
287 | std::lock_guard<decltype(PM->TargetOffloadMtx)> LG(PM->TargetOffloadMtx); |
288 | if (PM->TargetOffloadPolicy == tgt_default) { |
289 | if (omp_get_num_devices() > 0) { |
290 | DP("Default TARGET OFFLOAD policy is now mandatory "{} |
291 | "(devices were found)\n"){}; |
292 | PM->TargetOffloadPolicy = tgt_mandatory; |
293 | } else { |
294 | DP("Default TARGET OFFLOAD policy is now disabled "{} |
295 | "(no devices were found)\n"){}; |
296 | PM->TargetOffloadPolicy = tgt_disabled; |
297 | } |
298 | } |
299 | } |
300 | |
301 | static bool isOffloadDisabled() { |
302 | if (PM->TargetOffloadPolicy == tgt_default) |
303 | handleDefaultTargetOffload(); |
304 | return PM->TargetOffloadPolicy == tgt_disabled; |
305 | } |
306 | |
307 | // If offload is enabled, ensure that device DeviceID has been initialized, |
308 | // global ctors have been executed, and global data has been mapped. |
309 | // |
310 | // The return bool indicates if the offload is to the host device |
311 | // There are three possible results: |
312 | // - Return false if the taregt device is ready for offload |
313 | // - Return true without reporting a runtime error if offload is |
314 | // disabled, perhaps because the initial device was specified. |
315 | // - Report a runtime error and return true. |
316 | // |
317 | // If DeviceID == OFFLOAD_DEVICE_DEFAULT, set DeviceID to the default device. |
318 | // This step might be skipped if offload is disabled. |
319 | bool checkDeviceAndCtors(int64_t &DeviceID, ident_t *Loc) { |
320 | if (isOffloadDisabled()) { |
321 | DP("Offload is disabled\n"){}; |
322 | return true; |
323 | } |
324 | |
325 | if (DeviceID == OFFLOAD_DEVICE_DEFAULT-1) { |
326 | DeviceID = omp_get_default_device(); |
327 | DP("Use default device id %" PRId64 "\n", DeviceID){}; |
328 | } |
329 | |
330 | // Proposed behavior for OpenMP 5.2 in OpenMP spec github issue 2669. |
331 | if (omp_get_num_devices() == 0) { |
332 | DP("omp_get_num_devices() == 0 but offload is manadatory\n"){}; |
333 | handleTargetOutcome(false, Loc); |
334 | return true; |
335 | } |
336 | |
337 | if (DeviceID == omp_get_initial_device()) { |
338 | DP("Device is host (%" PRId64 "), returning as if offload is disabled\n",{} |
339 | DeviceID){}; |
340 | return true; |
341 | } |
342 | |
343 | // Is device ready? |
344 | if (!deviceIsReady(DeviceID)) { |
345 | REPORT("Device %" PRId64 " is not ready.\n", DeviceID)do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Device %" "l" "d" " is not ready.\n", DeviceID); } while ( 0);; |
346 | handleTargetOutcome(false, Loc); |
347 | return true; |
348 | } |
349 | |
350 | // Get device info. |
351 | DeviceTy &Device = *PM->Devices[DeviceID]; |
352 | |
353 | // Check whether global data has been mapped for this device |
354 | { |
355 | std::lock_guard<decltype(Device.PendingGlobalsMtx)> LG( |
356 | Device.PendingGlobalsMtx); |
357 | if (Device.HasPendingGlobals && initLibrary(Device) != OFFLOAD_SUCCESS(0)) { |
358 | REPORT("Failed to init globals on device %" PRId64 "\n", DeviceID)do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Failed to init globals on device %" "l" "d" "\n", DeviceID ); } while (0);; |
359 | handleTargetOutcome(false, Loc); |
360 | return true; |
361 | } |
362 | } |
363 | |
364 | return false; |
365 | } |
366 | |
367 | static int32_t getParentIndex(int64_t Type) { |
368 | return ((Type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1; |
369 | } |
370 | |
371 | void *targetAllocExplicit(size_t Size, int DeviceNum, int Kind, |
372 | const char *Name) { |
373 | TIMESCOPE()llvm::TimeTraceScope TimeScope(__FUNCTION__); |
374 | DP("Call to %s for device %d requesting %zu bytes\n", Name, DeviceNum, Size){}; |
375 | |
376 | if (Size <= 0) { |
377 | DP("Call to %s with non-positive length\n", Name){}; |
378 | return NULL__null; |
379 | } |
380 | |
381 | void *Rc = NULL__null; |
382 | |
383 | if (DeviceNum == omp_get_initial_device()) { |
384 | Rc = malloc(Size); |
385 | DP("%s returns host ptr " DPxMOD "\n", Name, DPxPTR(Rc)){}; |
386 | return Rc; |
387 | } |
388 | |
389 | if (!deviceIsReady(DeviceNum)) { |
390 | DP("%s returns NULL ptr\n", Name){}; |
391 | return NULL__null; |
392 | } |
393 | |
394 | DeviceTy &Device = *PM->Devices[DeviceNum]; |
395 | Rc = Device.allocData(Size, nullptr, Kind); |
396 | DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(Rc)){}; |
397 | return Rc; |
398 | } |
399 | |
400 | void targetFreeExplicit(void *DevicePtr, int DeviceNum, int Kind, |
401 | const char *Name) { |
402 | TIMESCOPE()llvm::TimeTraceScope TimeScope(__FUNCTION__); |
403 | DP("Call to %s for device %d and address " DPxMOD "\n", Name, DeviceNum,{} |
404 | DPxPTR(DevicePtr)){}; |
405 | |
406 | if (!DevicePtr) { |
407 | DP("Call to %s with NULL ptr\n", Name){}; |
408 | return; |
409 | } |
410 | |
411 | if (DeviceNum == omp_get_initial_device()) { |
412 | free(DevicePtr); |
413 | DP("%s deallocated host ptr\n", Name){}; |
414 | return; |
415 | } |
416 | |
417 | if (!deviceIsReady(DeviceNum)) { |
418 | DP("%s returns, nothing to do\n", Name){}; |
419 | return; |
420 | } |
421 | |
422 | PM->Devices[DeviceNum]->deleteData(DevicePtr, Kind); |
423 | DP("omp_target_free deallocated device ptr\n"){}; |
424 | } |
425 | |
426 | void *targetLockExplicit(void *HostPtr, size_t Size, int DeviceNum, |
427 | const char *Name) { |
428 | TIMESCOPE()llvm::TimeTraceScope TimeScope(__FUNCTION__); |
429 | DP("Call to %s for device %d locking %zu bytes\n", Name, DeviceNum, Size){}; |
430 | |
431 | if (Size <= 0) { |
432 | DP("Call to %s with non-positive length\n", Name){}; |
433 | return NULL__null; |
434 | } |
435 | |
436 | void *rc = NULL__null; |
437 | |
438 | if (!deviceIsReady(DeviceNum)) { |
439 | DP("%s returns NULL ptr\n", Name){}; |
440 | return NULL__null; |
441 | } |
442 | |
443 | DeviceTy *DevicePtr = nullptr; |
444 | { |
445 | std::lock_guard<decltype(PM->RTLsMtx)> LG(PM->RTLsMtx); |
446 | |
447 | if (!PM->Devices[DeviceNum]) { |
448 | DP("%s returns, device %d not available\n", Name, DeviceNum){}; |
449 | return nullptr; |
450 | } |
451 | |
452 | DevicePtr = PM->Devices[DeviceNum].get(); |
453 | } |
454 | |
455 | int32_t err = 0; |
456 | if (DevicePtr->RTL->data_lock) { |
457 | err = DevicePtr->RTL->data_lock(DeviceNum, HostPtr, Size, &rc); |
458 | if (err) { |
459 | DP("Could not lock ptr %p\n", HostPtr){}; |
460 | return nullptr; |
461 | } |
462 | } |
463 | DP("%s returns device ptr " DPxMOD "\n", Name, DPxPTR(rc)){}; |
464 | return rc; |
465 | } |
466 | |
467 | void targetUnlockExplicit(void *HostPtr, int DeviceNum, const char *Name) { |
468 | TIMESCOPE()llvm::TimeTraceScope TimeScope(__FUNCTION__); |
469 | DP("Call to %s for device %d unlocking\n", Name, DeviceNum){}; |
470 | |
471 | DeviceTy *DevicePtr = nullptr; |
472 | { |
473 | std::lock_guard<decltype(PM->RTLsMtx)> LG(PM->RTLsMtx); |
474 | |
475 | // Don't check deviceIsReady as it can initialize the device if needed. |
476 | // Just check if DeviceNum exists as targetUnlockExplicit can be called |
477 | // during process exit/free (and it may have been already destroyed) and |
478 | // targetAllocExplicit will have already checked deviceIsReady anyway. |
479 | size_t DevicesSize = PM->Devices.size(); |
480 | |
481 | if (DevicesSize <= (size_t)DeviceNum) { |
482 | DP("Device ID %d does not have a matching RTL\n", DeviceNum){}; |
483 | return; |
484 | } |
485 | |
486 | if (!PM->Devices[DeviceNum]) { |
487 | DP("%s returns, device %d not available\n", Name, DeviceNum){}; |
488 | return; |
489 | } |
490 | |
491 | DevicePtr = PM->Devices[DeviceNum].get(); |
492 | } // unlock RTLsMtx |
493 | |
494 | if (DevicePtr->RTL->data_unlock) |
495 | DevicePtr->RTL->data_unlock(DeviceNum, HostPtr); |
496 | |
497 | DP("%s returns\n", Name){}; |
498 | } |
499 | |
500 | /// Call the user-defined mapper function followed by the appropriate |
501 | // targetData* function (targetData{Begin,End,Update}). |
502 | int targetDataMapper(ident_t *Loc, DeviceTy &Device, void *ArgBase, void *Arg, |
503 | int64_t ArgSize, int64_t ArgType, map_var_info_t ArgNames, |
504 | void *ArgMapper, AsyncInfoTy &AsyncInfo, |
505 | TargetDataFuncPtrTy TargetDataFunction) { |
506 | TIMESCOPE_WITH_IDENT(Loc)SourceInfo SI(Loc); llvm::TimeTraceScope TimeScope(__FUNCTION__ , SI.getProfileLocation()); |
507 | DP("Calling the mapper function " DPxMOD "\n", DPxPTR(ArgMapper)){}; |
508 | |
509 | // The mapper function fills up Components. |
510 | MapperComponentsTy MapperComponents; |
511 | MapperFuncPtrTy MapperFuncPtr = (MapperFuncPtrTy)(ArgMapper); |
512 | (*MapperFuncPtr)((void *)&MapperComponents, ArgBase, Arg, ArgSize, ArgType, |
513 | ArgNames); |
514 | |
515 | // Construct new arrays for args_base, args, arg_sizes and arg_types |
516 | // using the information in MapperComponents and call the corresponding |
517 | // targetData* function using these new arrays. |
518 | SmallVector<void *> MapperArgsBase(MapperComponents.Components.size()); |
519 | SmallVector<void *> MapperArgs(MapperComponents.Components.size()); |
520 | SmallVector<int64_t> MapperArgSizes(MapperComponents.Components.size()); |
521 | SmallVector<int64_t> MapperArgTypes(MapperComponents.Components.size()); |
522 | SmallVector<void *> MapperArgNames(MapperComponents.Components.size()); |
523 | |
524 | for (unsigned I = 0, E = MapperComponents.Components.size(); I < E; ++I) { |
525 | auto &C = MapperComponents.Components[I]; |
526 | MapperArgsBase[I] = C.Base; |
527 | MapperArgs[I] = C.Begin; |
528 | MapperArgSizes[I] = C.Size; |
529 | MapperArgTypes[I] = C.Type; |
530 | MapperArgNames[I] = C.Name; |
531 | } |
532 | |
533 | int Rc = TargetDataFunction(Loc, Device, MapperComponents.Components.size(), |
534 | MapperArgsBase.data(), MapperArgs.data(), |
535 | MapperArgSizes.data(), MapperArgTypes.data(), |
536 | MapperArgNames.data(), /*arg_mappers*/ nullptr, |
537 | AsyncInfo, /*FromMapper=*/true); |
538 | |
539 | return Rc; |
540 | } |
541 | |
542 | /// Internal function to do the mapping and transfer the data to the device |
543 | int targetDataBegin(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, |
544 | void **ArgsBase, void **Args, int64_t *ArgSizes, |
545 | int64_t *ArgTypes, map_var_info_t *ArgNames, |
546 | void **ArgMappers, AsyncInfoTy &AsyncInfo, |
547 | bool FromMapper) { |
548 | // process each input. |
549 | for (int32_t I = 0; I < ArgNum; ++I) { |
550 | // Ignore private variables and arrays - there is no mapping for them. |
551 | if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || |
552 | (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE)) |
553 | continue; |
554 | |
555 | if (ArgMappers && ArgMappers[I]) { |
556 | // Instead of executing the regular path of targetDataBegin, call the |
557 | // targetDataMapper variant which will call targetDataBegin again |
558 | // with new arguments. |
559 | DP("Calling targetDataMapper for the %dth argument\n", I){}; |
560 | |
561 | map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I]; |
562 | int Rc = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I], |
563 | ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo, |
564 | targetDataBegin); |
565 | |
566 | if (Rc != OFFLOAD_SUCCESS(0)) { |
567 | REPORT("Call to targetDataBegin via targetDataMapper for custom mapper"do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Call to targetDataBegin via targetDataMapper for custom mapper" " failed.\n"); } while (0); |
568 | " failed.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Call to targetDataBegin via targetDataMapper for custom mapper" " failed.\n"); } while (0);; |
569 | return OFFLOAD_FAIL(~0); |
570 | } |
571 | |
572 | // Skip the rest of this function, continue to the next argument. |
573 | continue; |
574 | } |
575 | |
576 | void *HstPtrBegin = Args[I]; |
577 | void *HstPtrBase = ArgsBase[I]; |
578 | int64_t DataSize = ArgSizes[I]; |
579 | map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I]; |
580 | |
581 | // Adjust for proper alignment if this is a combined entry (for structs). |
582 | // Look at the next argument - if that is MEMBER_OF this one, then this one |
583 | // is a combined entry. |
584 | int64_t Padding = 0; |
585 | const int NextI = I + 1; |
586 | if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum && |
587 | getParentIndex(ArgTypes[NextI]) == I) { |
588 | Padding = (int64_t)HstPtrBegin % Alignment; |
589 | if (Padding) { |
590 | DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD{} |
591 | "\n",{} |
592 | Padding, DPxPTR(HstPtrBegin)){}; |
593 | HstPtrBegin = (char *)HstPtrBegin - Padding; |
594 | DataSize += Padding; |
595 | } |
596 | } |
597 | |
598 | // Address of pointer on the host and device, respectively. |
599 | void *PointerHstPtrBegin, *PointerTgtPtrBegin; |
600 | TargetPointerResultTy PointerTpr; |
601 | bool IsHostPtr = false; |
602 | bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT; |
603 | // Force the creation of a device side copy of the data when: |
604 | // a close map modifier was associated with a map that contained a to. |
605 | bool HasCloseModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_CLOSE; |
606 | bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT; |
607 | bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD; |
608 | // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we |
609 | // have reached this point via __tgt_target_data_begin and not __tgt_target |
610 | // then no argument is marked as TARGET_PARAM ("omp target data map" is not |
611 | // associated with a target region, so there are no target parameters). This |
612 | // may be considered a hack, we could revise the scheme in the future. |
613 | bool UpdateRef = |
614 | !(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && !(FromMapper && I == 0); |
615 | if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { |
616 | DP("Has a pointer entry: \n"){}; |
617 | // Base is address of pointer. |
618 | // |
619 | // Usually, the pointer is already allocated by this time. For example: |
620 | // |
621 | // #pragma omp target map(s.p[0:N]) |
622 | // |
623 | // The map entry for s comes first, and the PTR_AND_OBJ entry comes |
624 | // afterward, so the pointer is already allocated by the time the |
625 | // PTR_AND_OBJ entry is handled below, and PointerTgtPtrBegin is thus |
626 | // non-null. However, "declare target link" can produce a PTR_AND_OBJ |
627 | // entry for a global that might not already be allocated by the time the |
628 | // PTR_AND_OBJ entry is handled below, and so the allocation might fail |
629 | // when HasPresentModifier. |
630 | PointerTpr = Device.getTargetPointer( |
631 | HstPtrBase, HstPtrBase, sizeof(void *), /*HstPtrName=*/nullptr, |
632 | /*HasFlagTo=*/false, /*HasFlagAlways=*/false, IsImplicit, UpdateRef, |
633 | HasCloseModifier, HasPresentModifier, HasHoldModifier, AsyncInfo); |
634 | PointerTgtPtrBegin = PointerTpr.TargetPointer; |
635 | IsHostPtr = PointerTpr.Flags.IsHostPointer; |
Value stored to 'IsHostPtr' is never read | |
636 | if (!PointerTgtPtrBegin) { |
637 | REPORT("Call to getTargetPointer returned null pointer (%s).\n",do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Call to getTargetPointer returned null pointer (%s).\n", HasPresentModifier ? "'present' map type modifier" : "device failure or illegal mapping" ); } while (0); |
638 | HasPresentModifier ? "'present' map type modifier"do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Call to getTargetPointer returned null pointer (%s).\n", HasPresentModifier ? "'present' map type modifier" : "device failure or illegal mapping" ); } while (0); |
639 | : "device failure or illegal mapping")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Call to getTargetPointer returned null pointer (%s).\n", HasPresentModifier ? "'present' map type modifier" : "device failure or illegal mapping" ); } while (0);; |
640 | return OFFLOAD_FAIL(~0); |
641 | } |
642 | DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"{} |
643 | "\n",{} |
644 | sizeof(void *), DPxPTR(PointerTgtPtrBegin),{} |
645 | (PointerTpr.Flags.IsNewEntry ? "" : " not")){}; |
646 | PointerHstPtrBegin = HstPtrBase; |
647 | // modify current entry. |
648 | HstPtrBase = *(void **)HstPtrBase; |
649 | // No need to update pointee ref count for the first element of the |
650 | // subelement that comes from mapper. |
651 | UpdateRef = |
652 | (!FromMapper || I != 0); // subsequently update ref count of pointee |
653 | } |
654 | |
655 | const bool HasFlagTo = ArgTypes[I] & OMP_TGT_MAPTYPE_TO; |
656 | const bool HasFlagAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS; |
657 | auto TPR = Device.getTargetPointer( |
658 | HstPtrBegin, HstPtrBase, DataSize, HstPtrName, HasFlagTo, HasFlagAlways, |
659 | IsImplicit, UpdateRef, HasCloseModifier, HasPresentModifier, |
660 | HasHoldModifier, AsyncInfo); |
661 | void *TgtPtrBegin = TPR.TargetPointer; |
662 | IsHostPtr = TPR.Flags.IsHostPointer; |
663 | // If data_size==0, then the argument could be a zero-length pointer to |
664 | // NULL, so getOrAlloc() returning NULL is not an error. |
665 | if (!TgtPtrBegin && (DataSize || HasPresentModifier)) { |
666 | REPORT("Call to getTargetPointer returned null pointer (%s).\n",do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Call to getTargetPointer returned null pointer (%s).\n", HasPresentModifier ? "'present' map type modifier" : "device failure or illegal mapping" ); } while (0); |
667 | HasPresentModifier ? "'present' map type modifier"do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Call to getTargetPointer returned null pointer (%s).\n", HasPresentModifier ? "'present' map type modifier" : "device failure or illegal mapping" ); } while (0); |
668 | : "device failure or illegal mapping")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Call to getTargetPointer returned null pointer (%s).\n", HasPresentModifier ? "'present' map type modifier" : "device failure or illegal mapping" ); } while (0);; |
669 | return OFFLOAD_FAIL(~0); |
670 | } |
671 | DP("There are %" PRId64 " bytes allocated at target address " DPxMOD{} |
672 | " - is%s new\n",{} |
673 | DataSize, DPxPTR(TgtPtrBegin), (TPR.Flags.IsNewEntry ? "" : " not")){}; |
674 | |
675 | if (ArgTypes[I] & OMP_TGT_MAPTYPE_RETURN_PARAM) { |
676 | uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase; |
677 | void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta); |
678 | DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase)){}; |
679 | ArgsBase[I] = TgtPtrBase; |
680 | } |
681 | |
682 | if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) { |
683 | // Check whether we need to update the pointer on the device |
684 | bool UpdateDevPtr = false; |
685 | |
686 | uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; |
687 | void *ExpectedTgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); |
688 | |
689 | Device.ShadowMtx.lock(); |
690 | auto Entry = Device.ShadowPtrMap.find(PointerHstPtrBegin); |
691 | // If this pointer is not in the map we need to insert it. If the map |
692 | // contains a stale entry, we need to update it (e.g. if the pointee was |
693 | // deallocated and later on is reallocated at another device address). The |
694 | // latter scenario is the subject of LIT test env/base_ptr_ref_count.c. An |
695 | // entry is removed from ShadowPtrMap only when the PTR of a PTR_AND_OBJ |
696 | // pair is deallocated, not when the OBJ is deallocated. In |
697 | // env/base_ptr_ref_count.c the PTR is a global "declare target" pointer, |
698 | // so it stays in the map for the lifetime of the application. When the |
699 | // OBJ is deallocated and later on allocated again (at a different device |
700 | // address), ShadowPtrMap still contains an entry for Pointer_HstPtrBegin |
701 | // which is stale, pointing to the old ExpectedTgtPtrBase of the OBJ. |
702 | if (Entry == Device.ShadowPtrMap.end() || |
703 | Entry->second.TgtPtrVal != ExpectedTgtPtrBase) { |
704 | // create or update shadow pointers for this entry |
705 | Device.ShadowPtrMap[PointerHstPtrBegin] = { |
706 | HstPtrBase, PointerTgtPtrBegin, ExpectedTgtPtrBase}; |
707 | PointerTpr.Entry->setMayContainAttachedPointers(); |
708 | UpdateDevPtr = true; |
709 | } |
710 | |
711 | if (UpdateDevPtr) { |
712 | std::lock_guard<decltype(*PointerTpr.Entry)> LG(*PointerTpr.Entry); |
713 | Device.ShadowMtx.unlock(); |
714 | |
715 | DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",{} |
716 | DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)){}; |
717 | |
718 | void *&TgtPtrBase = AsyncInfo.getVoidPtrLocation(); |
719 | TgtPtrBase = ExpectedTgtPtrBase; |
720 | |
721 | int Ret = Device.submitData(PointerTgtPtrBegin, &TgtPtrBase, |
722 | sizeof(void *), AsyncInfo); |
723 | if (Ret != OFFLOAD_SUCCESS(0)) { |
724 | REPORT("Copying data to device failed.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Copying data to device failed.\n"); } while (0);; |
725 | return OFFLOAD_FAIL(~0); |
726 | } |
727 | if (PointerTpr.Entry->addEventIfNecessary(Device, AsyncInfo) != |
728 | OFFLOAD_SUCCESS(0)) |
729 | return OFFLOAD_FAIL(~0); |
730 | } else |
731 | Device.ShadowMtx.unlock(); |
732 | } |
733 | } |
734 | |
735 | return OFFLOAD_SUCCESS(0); |
736 | } |
737 | |
738 | namespace { |
739 | /// This structure contains information to deallocate a target pointer, aka. |
740 | /// used to fix up the shadow map and potentially delete the entry from the |
741 | /// mapping table via \p DeviceTy::deallocTgtPtr. |
742 | struct PostProcessingInfo { |
743 | /// Host pointer used to look up into the map table |
744 | void *HstPtrBegin; |
745 | |
746 | /// Size of the data |
747 | int64_t DataSize; |
748 | |
749 | /// The mapping type (bitfield). |
750 | int64_t ArgType; |
751 | |
752 | /// Index of the argument in the data mapping scheme. |
753 | int32_t ArgIndex; |
754 | |
755 | /// The target pointer information. |
756 | TargetPointerResultTy TPR; |
757 | |
758 | PostProcessingInfo(void *HstPtr, int64_t Size, int64_t ArgType, |
759 | int32_t ArgIndex, TargetPointerResultTy TPR) |
760 | : HstPtrBegin(HstPtr), DataSize(Size), ArgType(ArgType), |
761 | ArgIndex(ArgIndex), TPR(TPR) {} |
762 | }; |
763 | |
/// Apply \p CB to the shadow map pointer entries in the range \p Begin, to
/// \p Begin + \p Size. \p CB is called with a locked shadow pointer map and the
/// passed iterator can be updated. If the callback returns OFFLOAD_FAIL the
/// rest of the map is not checked anymore.
///
/// Entries are visited in ascending host-address order. \p CB itself is
/// responsible for advancing (or erasing and then advancing) the iterator,
/// which is why the for-loop below has no increment expression.
template <typename CBTy>
static void applyToShadowMapEntries(DeviceTy &Device, CBTy CB, void *Begin,
                                    uintptr_t Size,
                                    const TargetPointerResultTy &TPR) {
  // If we have an object that is too small to hold a pointer subobject, no need
  // to do any checking.
  if (Size < sizeof(void *))
    return;

  // If the map entry for the object was never marked as containing attached
  // pointers, no need to do any checking.
  if (!TPR.Entry || !TPR.Entry->getMayContainAttachedPointers())
    return;

  // Host-address range [LB, UB) covered by the object.
  uintptr_t LB = (uintptr_t)Begin;
  uintptr_t UB = LB + Size;
  // Now we are looking into the shadow map so we need to lock it.
  std::lock_guard<decltype(Device.ShadowMtx)> LG(Device.ShadowMtx);
  for (ShadowPtrListTy::iterator Itr = Device.ShadowPtrMap.begin();
       Itr != Device.ShadowPtrMap.end();) {
    uintptr_t ShadowHstPtrAddr = (uintptr_t)Itr->first;

    // An STL map is sorted on its keys; use this property
    // to quickly determine when to break out of the loop.
    if (ShadowHstPtrAddr < LB) {
      ++Itr;
      continue;
    }
    if (ShadowHstPtrAddr >= UB)
      break;

    // Entry lies inside [LB, UB): hand it to the callback, which may stop the
    // traversal early by returning OFFLOAD_FAIL.
    if (CB(Itr) == OFFLOAD_FAIL(~0))
      break;
  }
}
803 | |
804 | } // namespace |
805 | |
/// Applies the necessary post-processing procedures to entries listed in \p
/// EntriesInfo after the execution of all device side operations from a target
/// data end. This includes the update of pointers at the host and removal of
/// device buffer when needed. It returns OFFLOAD_FAIL or OFFLOAD_SUCCESS
/// according to the successfulness of the operations.
///
/// \param Device      Device the entries were mapped on.
/// \param EntriesInfo Entries recorded by targetDataEnd (taken by value; the
///                    structured bindings below reference its elements).
/// \param FromMapper  True when invoked on behalf of a custom mapper.
[[nodiscard]] static int
postProcessingTargetDataEnd(DeviceTy *Device,
                            SmallVector<PostProcessingInfo> EntriesInfo,
                            bool FromMapper) {
  int Ret = OFFLOAD_SUCCESS(0);
  void *FromMapperBase = nullptr;

  for (auto &[HstPtrBegin, DataSize, ArgType, ArgIndex, TPR] : EntriesInfo) {
    // Only candidates for deletion are entries that did not degenerate to a
    // plain host pointer.
    bool DelEntry = !TPR.isHostPointer();

    // If the last element from the mapper (for end transfer args comes in
    // reverse order), do not remove the partial entry, the parent struct still
    // exists.
    if ((ArgType & OMP_TGT_MAPTYPE_MEMBER_OF) &&
        !(ArgType & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
      DelEntry = false; // protect parent struct from being deallocated
    }

    if (DelEntry && FromMapper && ArgIndex == 0) {
      DelEntry = false;
      FromMapperBase = HstPtrBegin;
    }

    // If we marked the entry to be deleted we need to verify no other
    // thread reused it by now. If deletion is still supposed to happen by
    // this thread LR will be set and exclusive access to the HDTT map
    // will avoid another thread reusing the entry now. Note that we do
    // not request (exclusive) access to the HDTT map if DelEntry is
    // not set.
    DeviceTy::HDTTMapAccessorTy HDTTMap =
        Device->HostDataToTargetMap.getExclusiveAccessor(!DelEntry);

    const bool IsNotLastUser = TPR.Entry->decDataEndThreadCount() != 0;
    if (DelEntry && (TPR.Entry->getTotalRefCount() != 0 || IsNotLastUser)) {
      // The thread is not in charge of deletion anymore. Give up access
      // to the HDTT map and unset the deletion flag.
      HDTTMap.destroy();
      DelEntry = false;
    }

    // If we copied back to the host a struct/array containing pointers,
    // we need to restore the original host pointer values from their
    // shadow copies. If the struct is going to be deallocated, remove any
    // remaining shadow pointer entries for this struct.
    const bool HasFrom = ArgType & OMP_TGT_MAPTYPE_FROM;
    auto CB = [&](ShadowPtrListTy::iterator &Itr) {
      // If we copied the struct to the host, we need to restore the
      // pointer.
      if (HasFrom) {
        void **ShadowHstPtrAddr = (void **)Itr->first;
        *ShadowHstPtrAddr = Itr->second.HstPtrVal;
        DP("Restoring original host pointer value " DPxMOD " for host "{}
           "pointer " DPxMOD "\n",{}
           DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr)){};
      }
      // If the struct is to be deallocated, remove the shadow entry.
      if (DelEntry) {
        DP("Removing shadow pointer " DPxMOD "\n", DPxPTR((void **)Itr->first)){};
        auto OldItr = Itr;
        Itr++;
        Device->ShadowPtrMap.erase(OldItr);
      } else {
        ++Itr;
      }
      return OFFLOAD_SUCCESS(0);
    };
    applyToShadowMapEntries(*Device, CB, HstPtrBegin, DataSize, TPR);

    if (!DelEntry || (FromMapperBase && FromMapperBase == HstPtrBegin))
      continue;

    // If we are deleting the entry the DataMapMtx is locked and we own
    // the entry.
    Ret = Device->eraseMapEntry(HDTTMap, TPR.Entry, DataSize);
    // Entry is already removed from the map, we can unlock it now.
    HDTTMap.destroy();
    // Accumulate failures from both the erase and the deallocation.
    Ret |= Device->deallocTgtPtrAndEntry(TPR.Entry, DataSize);
    if (Ret != OFFLOAD_SUCCESS(0)) {
      REPORT("Deallocating data from device failed.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Deallocating data from device failed.\n"); } while (0);;
      break;
    }
  }

  return Ret;
}
896 | |
/// Internal function to undo the mapping and retrieve the data from the device.
/// Issues device-to-host copies and records post-processing work (shadow
/// pointer restoration, entry removal) to run after synchronization.
int targetDataEnd(ident_t *Loc, DeviceTy &Device, int32_t ArgNum,
                  void **ArgBases, void **Args, int64_t *ArgSizes,
                  int64_t *ArgTypes, map_var_info_t *ArgNames,
                  void **ArgMappers, AsyncInfoTy &AsyncInfo, bool FromMapper) {
  int Ret = OFFLOAD_SUCCESS(0);
  SmallVector<PostProcessingInfo> PostProcessingPtrs;
  void *FromMapperBase = nullptr;
  // process each input; arguments are walked in reverse order.
  for (int32_t I = ArgNum - 1; I >= 0; --I) {
    // Ignore private variables and arrays - there is no mapping for them.
    // Also, ignore the use_device_ptr directive, it has no effect here.
    if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) ||
        (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    if (ArgMappers && ArgMappers[I]) {
      // Instead of executing the regular path of targetDataEnd, call the
      // targetDataMapper variant which will call targetDataEnd again
      // with new arguments.
      DP("Calling targetDataMapper for the %dth argument\n", I){};

      map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I];
      Ret = targetDataMapper(Loc, Device, ArgBases[I], Args[I], ArgSizes[I],
                             ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo,
                             targetDataEnd);

      if (Ret != OFFLOAD_SUCCESS(0)) {
        REPORT("Call to targetDataEnd via targetDataMapper for custom mapper"do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Call to targetDataEnd via targetDataMapper for custom mapper" " failed.\n"); } while (0);
               " failed.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Call to targetDataEnd via targetDataMapper for custom mapper" " failed.\n"); } while (0);;
        return OFFLOAD_FAIL(~0);
      }

      // Skip the rest of this function, continue to the next argument.
      continue;
    }

    void *HstPtrBegin = Args[I];
    int64_t DataSize = ArgSizes[I];
    // Adjust for proper alignment if this is a combined entry (for structs).
    // Look at the next argument - if that is MEMBER_OF this one, then this one
    // is a combined entry.
    const int NextI = I + 1;
    if (getParentIndex(ArgTypes[I]) < 0 && NextI < ArgNum &&
        getParentIndex(ArgTypes[NextI]) == I) {
      int64_t Padding = (int64_t)HstPtrBegin % Alignment;
      if (Padding) {
        DP("Using a Padding of %" PRId64 " bytes for begin address " DPxMOD{}
           "\n",{}
           Padding, DPxPTR(HstPtrBegin)){};
        // Widen the region so it matches the padded mapping done on begin.
        HstPtrBegin = (char *)HstPtrBegin - Padding;
        DataSize += Padding;
      }
    }

    // IsLast/IsHostPtr are out-parameters filled in by getTgtPtrBegin below.
    bool IsLast, IsHostPtr;
    bool IsImplicit = ArgTypes[I] & OMP_TGT_MAPTYPE_IMPLICIT;
    bool UpdateRef = (!(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
                      (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) &&
                     !(FromMapper && I == 0);
    bool ForceDelete = ArgTypes[I] & OMP_TGT_MAPTYPE_DELETE;
    bool HasPresentModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_PRESENT;
    bool HasHoldModifier = ArgTypes[I] & OMP_TGT_MAPTYPE_OMPX_HOLD;

    // If PTR_AND_OBJ, HstPtrBegin is address of pointee
    TargetPointerResultTy TPR = Device.getTgtPtrBegin(
        HstPtrBegin, DataSize, IsLast, UpdateRef, HasHoldModifier, IsHostPtr,
        !IsImplicit, ForceDelete, /*FromDataEnd=*/true);
    void *TgtPtrBegin = TPR.TargetPointer;
    if (!TPR.isPresent() && !TPR.isHostPointer() &&
        (DataSize || HasPresentModifier)) {
      DP("Mapping does not exist (%s)\n",{}
         (HasPresentModifier ? "'present' map type modifier" : "ignored")){};
      if (HasPresentModifier) {
        // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 350 L10-13:
        // "If a map clause appears on a target, target data, target enter data
        // or target exit data construct with a present map-type-modifier then
        // on entry to the region if the corresponding list item does not appear
        // in the device data environment then an error occurs and the program
        // terminates."
        //
        // This should be an error upon entering an "omp target exit data". It
        // should not be an error upon exiting an "omp target data" or "omp
        // target". For "omp target data", Clang thus doesn't include present
        // modifiers for end calls. For "omp target", we have not found a valid
        // OpenMP program for which the error matters: it appears that, if a
        // program can guarantee that data is present at the beginning of an
        // "omp target" region so that there's no error there, that data is also
        // guaranteed to be present at the end.
        MESSAGE("device mapping required by 'present' map type modifier does "do { fprintf(stderr, "Libomptarget" " message: " "device mapping required by 'present' map type modifier does " "not exist for host address " "0x%0*" "l" "x" " (%" "l" "d" " bytes)" "\n", ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(HstPtrBegin )), DataSize); } while (0)
                "not exist for host address " DPxMOD " (%" PRId64 " bytes)",do { fprintf(stderr, "Libomptarget" " message: " "device mapping required by 'present' map type modifier does " "not exist for host address " "0x%0*" "l" "x" " (%" "l" "d" " bytes)" "\n", ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(HstPtrBegin )), DataSize); } while (0)
                DPxPTR(HstPtrBegin), DataSize)do { fprintf(stderr, "Libomptarget" " message: " "device mapping required by 'present' map type modifier does " "not exist for host address " "0x%0*" "l" "x" " (%" "l" "d" " bytes)" "\n", ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(HstPtrBegin )), DataSize); } while (0);
        return OFFLOAD_FAIL(~0);
      }
    } else {
      DP("There are %" PRId64 " bytes allocated at target address " DPxMOD{}
         " - is%s last\n",{}
         DataSize, DPxPTR(TgtPtrBegin), (IsLast ? "" : " not")){};
    }

    // OpenMP 5.1, sec. 2.21.7.1 "map Clause", p. 351 L14-16:
    // "If the map clause appears on a target, target data, or target exit data
    // construct and a corresponding list item of the original list item is not
    // present in the device data environment on exit from the region then the
    // list item is ignored."
    if (!TPR.isPresent())
      continue;

    // Move data back to the host
    const bool HasAlways = ArgTypes[I] & OMP_TGT_MAPTYPE_ALWAYS;
    const bool HasFrom = ArgTypes[I] & OMP_TGT_MAPTYPE_FROM;
    if (HasFrom && (HasAlways || IsLast) && !IsHostPtr) {
      DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",{}
         DataSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)){};

      // Serialize transfers on this entry for the rest of the scope.
      std::lock_guard<decltype(*TPR.Entry)> LG(*TPR.Entry);
      // Wait for any previous transfer if an event is present.
      if (void *Event = TPR.Entry->getEvent()) {
        if (Device.waitEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS(0)) {
          REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event))do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Failed to wait for event " "0x%0*" "l" "x" ".\n", ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(Event))); } while (0);;
          return OFFLOAD_FAIL(~0);
        }
      }

      Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, DataSize, AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS(0)) {
        REPORT("Copying data from device failed.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Copying data from device failed.\n"); } while (0);;
        return OFFLOAD_FAIL(~0);
      }

      // As we are expecting to delete the entry the d2h copy might race
      // with another one that also tries to delete the entry. This happens
      // as the entry can be reused and the reuse might happen after the
      // copy-back was issued but before it completed. Since the reuse might
      // also copy-back a value we would race.
      if (IsLast) {
        if (TPR.Entry->addEventIfNecessary(Device, AsyncInfo) !=
            OFFLOAD_SUCCESS(0))
          return OFFLOAD_FAIL(~0);
      }
    }

    // Add pointer to the buffer for post-synchronize processing.
    PostProcessingPtrs.emplace_back(HstPtrBegin, DataSize, ArgTypes[I], I, TPR);
  }

  // Add post-processing functions
  // TODO: We might want to remove `mutable` in the future by not changing the
  // captured variables somehow.
  // NOTE(review): FromMapperBase is initialized to nullptr above and never
  // reassigned in this function, so postProcessingTargetDataEnd always runs
  // with FromMapper == false here — confirm this is intended.
  AsyncInfo.addPostProcessingFunction(
      [=, Device = &Device,
       PostProcessingPtrs = std::move(PostProcessingPtrs)]() mutable -> int {
        return postProcessingTargetDataEnd(Device, PostProcessingPtrs,
                                           FromMapperBase);
      });

  return Ret;
}
1055 | |
/// Update (motion) of a single contiguous region: copies data from the device
/// for FROM entries and to the device for TO entries, then fixes up shadow
/// pointers on the respective side. Returns OFFLOAD_SUCCESS/OFFLOAD_FAIL.
static int targetDataContiguous(ident_t *Loc, DeviceTy &Device, void *ArgsBase,
                                void *HstPtrBegin, int64_t ArgSize,
                                int64_t ArgType, AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_IDENT(Loc)SourceInfo SI(Loc); llvm::TimeTraceScope TimeScope(__FUNCTION__ , SI.getProfileLocation());
  // NOTE(review): IsLast is an out-parameter required by the getTgtPtrBegin
  // interface; its value is not used in this function.
  bool IsLast, IsHostPtr;
  TargetPointerResultTy TPR = Device.getTgtPtrBegin(
      HstPtrBegin, ArgSize, IsLast, /*UpdateRefCount=*/false,
      /*UseHoldRefCount=*/false, IsHostPtr, /*MustContain=*/true);
  void *TgtPtrBegin = TPR.TargetPointer;
  if (!TPR.isPresent()) {
    DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin)){};
    if (ArgType & OMP_TGT_MAPTYPE_PRESENT) {
      MESSAGE("device mapping required by 'present' motion modifier does not "do { fprintf(stderr, "Libomptarget" " message: " "device mapping required by 'present' motion modifier does not " "exist for host address " "0x%0*" "l" "x" " (%" "l" "d" " bytes)" "\n", ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(HstPtrBegin )), ArgSize); } while (0)
              "exist for host address " DPxMOD " (%" PRId64 " bytes)",do { fprintf(stderr, "Libomptarget" " message: " "device mapping required by 'present' motion modifier does not " "exist for host address " "0x%0*" "l" "x" " (%" "l" "d" " bytes)" "\n", ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(HstPtrBegin )), ArgSize); } while (0)
              DPxPTR(HstPtrBegin), ArgSize)do { fprintf(stderr, "Libomptarget" " message: " "device mapping required by 'present' motion modifier does not " "exist for host address " "0x%0*" "l" "x" " (%" "l" "d" " bytes)" "\n", ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(HstPtrBegin )), ArgSize); } while (0);
      return OFFLOAD_FAIL(~0);
    }
    return OFFLOAD_SUCCESS(0);
  }

  if (IsHostPtr) {
    DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",{}
       DPxPTR(HstPtrBegin)){};
    return OFFLOAD_SUCCESS(0);
  }

  if (ArgType & OMP_TGT_MAPTYPE_FROM) {
    DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",{}
       ArgSize, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)){};
    int Ret = Device.retrieveData(HstPtrBegin, TgtPtrBegin, ArgSize, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS(0)) {
      REPORT("Copying data from device failed.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Copying data from device failed.\n"); } while (0);;
      return OFFLOAD_FAIL(~0);
    }

    // Wait for device-to-host memcopies for whole struct to complete,
    // before restoring the correct host pointer.
    AsyncInfo.addPostProcessingFunction([=, Device = &Device]() -> int {
      auto CB = [&](ShadowPtrListTy::iterator &Itr) {
        void **ShadowHstPtrAddr = (void **)Itr->first;
        *ShadowHstPtrAddr = Itr->second.HstPtrVal;
        DP("Restoring original host pointer value " DPxMOD{}
           " for host pointer " DPxMOD "\n",{}
           DPxPTR(Itr->second.HstPtrVal), DPxPTR(ShadowHstPtrAddr)){};
        ++Itr;
        return OFFLOAD_SUCCESS(0);
      };
      applyToShadowMapEntries(*Device, CB, HstPtrBegin, ArgSize, TPR);

      return OFFLOAD_SUCCESS(0);
    });
  }

  if (ArgType & OMP_TGT_MAPTYPE_TO) {
    DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",{}
       ArgSize, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)){};
    int Ret = Device.submitData(TgtPtrBegin, HstPtrBegin, ArgSize, AsyncInfo);
    if (Ret != OFFLOAD_SUCCESS(0)) {
      REPORT("Copying data to device failed.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Copying data to device failed.\n"); } while (0);;
      return OFFLOAD_FAIL(~0);
    }

    // Re-submit device-side pointer values so the just-copied struct's
    // embedded pointers again point at device memory.
    auto CB = [&](ShadowPtrListTy::iterator &Itr) {
      DP("Restoring original target pointer value " DPxMOD " for target "{}
         "pointer " DPxMOD "\n",{}
         DPxPTR(Itr->second.TgtPtrVal), DPxPTR(Itr->second.TgtPtrAddr)){};
      Ret = Device.submitData(Itr->second.TgtPtrAddr, &Itr->second.TgtPtrVal,
                              sizeof(void *), AsyncInfo);
      if (Ret != OFFLOAD_SUCCESS(0))
        REPORT("Copying data to device failed.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Copying data to device failed.\n"); } while (0);;
      ++Itr;
      return Ret;
    };
    applyToShadowMapEntries(Device, CB, HstPtrBegin, ArgSize, TPR);
  }
  return OFFLOAD_SUCCESS(0);
}
1133 | |
/// Recursively walk the dimension descriptors in \p NonContig, accumulating
/// per-dimension offsets, and issue one targetDataContiguous transfer of
/// \p Size bytes for each innermost contiguous piece.
static int targetDataNonContiguous(ident_t *Loc, DeviceTy &Device,
                                   void *ArgsBase,
                                   __tgt_target_non_contig *NonContig,
                                   uint64_t Size, int64_t ArgType,
                                   int CurrentDim, int DimSize, uint64_t Offset,
                                   AsyncInfoTy &AsyncInfo) {
  TIMESCOPE_WITH_IDENT(Loc)SourceInfo SI(Loc); llvm::TimeTraceScope TimeScope(__FUNCTION__ , SI.getProfileLocation());
  int Ret = OFFLOAD_SUCCESS(0);
  if (CurrentDim < DimSize) {
    for (unsigned int I = 0; I < NonContig[CurrentDim].Count; ++I) {
      uint64_t CurOffset =
          (NonContig[CurrentDim].Offset + I) * NonContig[CurrentDim].Stride;
      // we only need to transfer the first element for the last dimension
      // since we've already got a contiguous piece.
      if (CurrentDim != DimSize - 1 || I == 0) {
        Ret = targetDataNonContiguous(Loc, Device, ArgsBase, NonContig, Size,
                                      ArgType, CurrentDim + 1, DimSize,
                                      Offset + CurOffset, AsyncInfo);
        // Stop the whole process if any contiguous piece returns anything
        // other than OFFLOAD_SUCCESS.
        if (Ret != OFFLOAD_SUCCESS(0))
          return Ret;
      }
    }
  } else {
    // Base case: all dimensions consumed; transfer one contiguous piece.
    char *Ptr = (char *)ArgsBase + Offset;
    DP("Transfer of non-contiguous : host ptr " DPxMOD " offset %" PRIu64{}
       " len %" PRIu64 "\n",{}
       DPxPTR(Ptr), Offset, Size){};
    Ret = targetDataContiguous(Loc, Device, ArgsBase, Ptr, Size, ArgType,
                               AsyncInfo);
  }
  return Ret;
}
1168 | |
1169 | static int getNonContigMergedDimension(__tgt_target_non_contig *NonContig, |
1170 | int32_t DimSize) { |
1171 | int RemovedDim = 0; |
1172 | for (int I = DimSize - 1; I > 0; --I) { |
1173 | if (NonContig[I].Count * NonContig[I].Stride == NonContig[I - 1].Stride) |
1174 | RemovedDim++; |
1175 | } |
1176 | return RemovedDim; |
1177 | } |
1178 | |
1179 | /// Internal function to pass data to/from the target. |
1180 | int targetDataUpdate(ident_t *Loc, DeviceTy &Device, int32_t ArgNum, |
1181 | void **ArgsBase, void **Args, int64_t *ArgSizes, |
1182 | int64_t *ArgTypes, map_var_info_t *ArgNames, |
1183 | void **ArgMappers, AsyncInfoTy &AsyncInfo, bool) { |
1184 | // process each input. |
1185 | for (int32_t I = 0; I < ArgNum; ++I) { |
1186 | if ((ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) || |
1187 | (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE)) |
1188 | continue; |
1189 | |
1190 | if (ArgMappers && ArgMappers[I]) { |
1191 | // Instead of executing the regular path of targetDataUpdate, call the |
1192 | // targetDataMapper variant which will call targetDataUpdate again |
1193 | // with new arguments. |
1194 | DP("Calling targetDataMapper for the %dth argument\n", I){}; |
1195 | |
1196 | map_var_info_t ArgName = (!ArgNames) ? nullptr : ArgNames[I]; |
1197 | int Ret = targetDataMapper(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I], |
1198 | ArgTypes[I], ArgName, ArgMappers[I], AsyncInfo, |
1199 | targetDataUpdate); |
1200 | |
1201 | if (Ret != OFFLOAD_SUCCESS(0)) { |
1202 | REPORT("Call to targetDataUpdate via targetDataMapper for custom mapper"do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Call to targetDataUpdate via targetDataMapper for custom mapper" " failed.\n"); } while (0); |
1203 | " failed.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Call to targetDataUpdate via targetDataMapper for custom mapper" " failed.\n"); } while (0);; |
1204 | return OFFLOAD_FAIL(~0); |
1205 | } |
1206 | |
1207 | // Skip the rest of this function, continue to the next argument. |
1208 | continue; |
1209 | } |
1210 | |
1211 | int Ret = OFFLOAD_SUCCESS(0); |
1212 | |
1213 | if (ArgTypes[I] & OMP_TGT_MAPTYPE_NON_CONTIG) { |
1214 | __tgt_target_non_contig *NonContig = (__tgt_target_non_contig *)Args[I]; |
1215 | int32_t DimSize = ArgSizes[I]; |
1216 | uint64_t Size = |
1217 | NonContig[DimSize - 1].Count * NonContig[DimSize - 1].Stride; |
1218 | int32_t MergedDim = getNonContigMergedDimension(NonContig, DimSize); |
1219 | Ret = targetDataNonContiguous( |
1220 | Loc, Device, ArgsBase[I], NonContig, Size, ArgTypes[I], |
1221 | /*current_dim=*/0, DimSize - MergedDim, /*offset=*/0, AsyncInfo); |
1222 | } else { |
1223 | Ret = targetDataContiguous(Loc, Device, ArgsBase[I], Args[I], ArgSizes[I], |
1224 | ArgTypes[I], AsyncInfo); |
1225 | } |
1226 | if (Ret == OFFLOAD_FAIL(~0)) |
1227 | return OFFLOAD_FAIL(~0); |
1228 | } |
1229 | return OFFLOAD_SUCCESS(0); |
1230 | } |
1231 | |
1232 | static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ | |
1233 | OMP_TGT_MAPTYPE_LITERAL | |
1234 | OMP_TGT_MAPTYPE_IMPLICIT; |
1235 | static bool isLambdaMapping(int64_t Mapping) { |
1236 | return (Mapping & LambdaMapping) == LambdaMapping; |
1237 | } |
1238 | |
1239 | namespace { |
1240 | /// Find the table information in the map or look it up in the translation |
1241 | /// tables. |
1242 | TableMap *getTableMap(void *HostPtr) { |
1243 | std::lock_guard<std::mutex> TblMapLock(PM->TblMapMtx); |
1244 | HostPtrToTableMapTy::iterator TableMapIt = |
1245 | PM->HostPtrToTableMap.find(HostPtr); |
1246 | |
1247 | if (TableMapIt != PM->HostPtrToTableMap.end()) |
1248 | return &TableMapIt->second; |
1249 | |
1250 | // We don't have a map. So search all the registered libraries. |
1251 | TableMap *TM = nullptr; |
1252 | std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx); |
1253 | for (HostEntriesBeginToTransTableTy::iterator Itr = |
1254 | PM->HostEntriesBeginToTransTable.begin(); |
1255 | Itr != PM->HostEntriesBeginToTransTable.end(); ++Itr) { |
1256 | // get the translation table (which contains all the good info). |
1257 | TranslationTable *TransTable = &Itr->second; |
1258 | // iterate over all the host table entries to see if we can locate the |
1259 | // host_ptr. |
1260 | __tgt_offload_entry *Cur = TransTable->HostTable.EntriesBegin; |
1261 | for (uint32_t I = 0; Cur < TransTable->HostTable.EntriesEnd; ++Cur, ++I) { |
1262 | if (Cur->addr != HostPtr) |
1263 | continue; |
1264 | // we got a match, now fill the HostPtrToTableMap so that we |
1265 | // may avoid this search next time. |
1266 | TM = &(PM->HostPtrToTableMap)[HostPtr]; |
1267 | TM->Table = TransTable; |
1268 | TM->Index = I; |
1269 | return TM; |
1270 | } |
1271 | } |
1272 | |
1273 | return nullptr; |
1274 | } |
1275 | |
1276 | /// A class manages private arguments in a target region. |
1277 | class PrivateArgumentManagerTy { |
1278 | /// A data structure for the information of first-private arguments. We can |
1279 | /// use this information to optimize data transfer by packing all |
1280 | /// first-private arguments and transfer them all at once. |
1281 | struct FirstPrivateArgInfoTy { |
1282 | /// The index of the element in \p TgtArgs corresponding to the argument |
1283 | int Index; |
1284 | /// Host pointer begin |
1285 | char *HstPtrBegin; |
1286 | /// Host pointer end |
1287 | char *HstPtrEnd; |
1288 | /// Aligned size |
1289 | int64_t AlignedSize; |
1290 | /// Host pointer name |
1291 | map_var_info_t HstPtrName = nullptr; |
1292 | |
1293 | FirstPrivateArgInfoTy(int Index, void *HstPtr, int64_t Size, |
1294 | const map_var_info_t HstPtrName = nullptr) |
1295 | : Index(Index), HstPtrBegin(reinterpret_cast<char *>(HstPtr)), |
1296 | HstPtrEnd(HstPtrBegin + Size), AlignedSize(Size + Size % Alignment), |
1297 | HstPtrName(HstPtrName) {} |
1298 | }; |
1299 | |
1300 | /// A vector of target pointers for all private arguments |
1301 | SmallVector<void *> TgtPtrs; |
1302 | |
1303 | /// A vector of information of all first-private arguments to be packed |
1304 | SmallVector<FirstPrivateArgInfoTy> FirstPrivateArgInfo; |
1305 | /// Host buffer for all arguments to be packed |
1306 | SmallVector<char> FirstPrivateArgBuffer; |
1307 | /// The total size of all arguments to be packed |
1308 | int64_t FirstPrivateArgSize = 0; |
1309 | |
1310 | /// A reference to the \p DeviceTy object |
1311 | DeviceTy &Device; |
1312 | /// A pointer to a \p AsyncInfoTy object |
1313 | AsyncInfoTy &AsyncInfo; |
1314 | |
1315 | // TODO: What would be the best value here? Should we make it configurable? |
1316 | // If the size is larger than this threshold, we will allocate and transfer it |
1317 | // immediately instead of packing it. |
1318 | static constexpr const int64_t FirstPrivateArgSizeThreshold = 1024; |
1319 | |
1320 | public: |
1321 | /// Constructor |
1322 | PrivateArgumentManagerTy(DeviceTy &Dev, AsyncInfoTy &AsyncInfo) |
1323 | : Device(Dev), AsyncInfo(AsyncInfo) {} |
1324 | |
1325 | /// Add a private argument |
1326 | int addArg(void *HstPtr, int64_t ArgSize, int64_t ArgOffset, |
1327 | bool IsFirstPrivate, void *&TgtPtr, int TgtArgsIndex, |
1328 | const map_var_info_t HstPtrName = nullptr, |
1329 | const bool AllocImmediately = false) { |
1330 | // If the argument is not first-private, or its size is greater than a |
1331 | // predefined threshold, we will allocate memory and issue the transfer |
1332 | // immediately. |
1333 | if (ArgSize > FirstPrivateArgSizeThreshold || !IsFirstPrivate || |
1334 | AllocImmediately) { |
1335 | TgtPtr = Device.allocData(ArgSize, HstPtr); |
1336 | if (!TgtPtr) { |
1337 | DP("Data allocation for %sprivate array " DPxMOD " failed.\n",{} |
1338 | (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtr)){}; |
1339 | return OFFLOAD_FAIL(~0); |
1340 | } |
1341 | #ifdef OMPTARGET_DEBUG |
1342 | void *TgtPtrBase = (void *)((intptr_t)TgtPtr + ArgOffset); |
1343 | DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD{} |
1344 | " for %sprivate array " DPxMOD " - pushing target argument " DPxMOD{} |
1345 | "\n",{} |
1346 | ArgSize, DPxPTR(TgtPtr), (IsFirstPrivate ? "first-" : ""),{} |
1347 | DPxPTR(HstPtr), DPxPTR(TgtPtrBase)){}; |
1348 | #endif |
1349 | // If first-private, copy data from host |
1350 | if (IsFirstPrivate) { |
1351 | DP("Submitting firstprivate data to the device.\n"){}; |
1352 | int Ret = Device.submitData(TgtPtr, HstPtr, ArgSize, AsyncInfo); |
1353 | if (Ret != OFFLOAD_SUCCESS(0)) { |
1354 | DP("Copying data to device failed, failed.\n"){}; |
1355 | return OFFLOAD_FAIL(~0); |
1356 | } |
1357 | } |
1358 | TgtPtrs.push_back(TgtPtr); |
1359 | } else { |
1360 | DP("Firstprivate array " DPxMOD " of size %" PRId64 " will be packed\n",{} |
1361 | DPxPTR(HstPtr), ArgSize){}; |
1362 | // When reach this point, the argument must meet all following |
1363 | // requirements: |
1364 | // 1. Its size does not exceed the threshold (see the comment for |
1365 | // FirstPrivateArgSizeThreshold); |
1366 | // 2. It must be first-private (needs to be mapped to target device). |
1367 | // We will pack all this kind of arguments to transfer them all at once |
1368 | // to reduce the number of data transfer. We will not take |
1369 | // non-first-private arguments, aka. private arguments that doesn't need |
1370 | // to be mapped to target device, into account because data allocation |
1371 | // can be very efficient with memory manager. |
1372 | |
1373 | // Placeholder value |
1374 | TgtPtr = nullptr; |
1375 | FirstPrivateArgInfo.emplace_back(TgtArgsIndex, HstPtr, ArgSize, |
1376 | HstPtrName); |
1377 | FirstPrivateArgSize += FirstPrivateArgInfo.back().AlignedSize; |
1378 | } |
1379 | |
1380 | return OFFLOAD_SUCCESS(0); |
1381 | } |
1382 | |
1383 | /// Pack first-private arguments, replace place holder pointers in \p TgtArgs, |
1384 | /// and start the transfer. |
1385 | int packAndTransfer(SmallVector<void *> &TgtArgs) { |
1386 | if (!FirstPrivateArgInfo.empty()) { |
1387 | assert(FirstPrivateArgSize != 0 &&(static_cast <bool> (FirstPrivateArgSize != 0 && "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty") ? void (0) : __assert_fail ("FirstPrivateArgSize != 0 && \"FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty\"" , "openmp/libomptarget/src/omptarget.cpp", 1388, __extension__ __PRETTY_FUNCTION__)) |
1388 | "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty")(static_cast <bool> (FirstPrivateArgSize != 0 && "FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty") ? void (0) : __assert_fail ("FirstPrivateArgSize != 0 && \"FirstPrivateArgSize is 0 but FirstPrivateArgInfo is empty\"" , "openmp/libomptarget/src/omptarget.cpp", 1388, __extension__ __PRETTY_FUNCTION__)); |
1389 | FirstPrivateArgBuffer.resize(FirstPrivateArgSize, 0); |
1390 | auto Itr = FirstPrivateArgBuffer.begin(); |
1391 | // Copy all host data to this buffer |
1392 | for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) { |
1393 | std::copy(Info.HstPtrBegin, Info.HstPtrEnd, Itr); |
1394 | Itr = std::next(Itr, Info.AlignedSize); |
1395 | } |
1396 | // Allocate target memory |
1397 | void *TgtPtr = |
1398 | Device.allocData(FirstPrivateArgSize, FirstPrivateArgBuffer.data()); |
1399 | if (TgtPtr == nullptr) { |
1400 | DP("Failed to allocate target memory for private arguments.\n"){}; |
1401 | return OFFLOAD_FAIL(~0); |
1402 | } |
1403 | TgtPtrs.push_back(TgtPtr); |
1404 | DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD "\n",{} |
1405 | FirstPrivateArgSize, DPxPTR(TgtPtr)){}; |
1406 | // Transfer data to target device |
1407 | int Ret = Device.submitData(TgtPtr, FirstPrivateArgBuffer.data(), |
1408 | FirstPrivateArgSize, AsyncInfo); |
1409 | if (Ret != OFFLOAD_SUCCESS(0)) { |
1410 | DP("Failed to submit data of private arguments.\n"){}; |
1411 | return OFFLOAD_FAIL(~0); |
1412 | } |
1413 | // Fill in all placeholder pointers |
1414 | auto TP = reinterpret_cast<uintptr_t>(TgtPtr); |
1415 | for (FirstPrivateArgInfoTy &Info : FirstPrivateArgInfo) { |
1416 | void *&Ptr = TgtArgs[Info.Index]; |
1417 | assert(Ptr == nullptr && "Target pointer is already set by mistaken")(static_cast <bool> (Ptr == nullptr && "Target pointer is already set by mistaken" ) ? void (0) : __assert_fail ("Ptr == nullptr && \"Target pointer is already set by mistaken\"" , "openmp/libomptarget/src/omptarget.cpp", 1417, __extension__ __PRETTY_FUNCTION__)); |
1418 | Ptr = reinterpret_cast<void *>(TP); |
1419 | TP += Info.AlignedSize; |
1420 | DP("Firstprivate array " DPxMOD " of size %" PRId64 " mapped to " DPxMOD{} |
1421 | "\n",{} |
1422 | DPxPTR(Info.HstPtrBegin), Info.HstPtrEnd - Info.HstPtrBegin,{} |
1423 | DPxPTR(Ptr)){}; |
1424 | } |
1425 | } |
1426 | |
1427 | return OFFLOAD_SUCCESS(0); |
1428 | } |
1429 | |
1430 | /// Free all target memory allocated for private arguments |
1431 | int free() { |
1432 | for (void *P : TgtPtrs) { |
1433 | int Ret = Device.deleteData(P); |
1434 | if (Ret != OFFLOAD_SUCCESS(0)) { |
1435 | DP("Deallocation of (first-)private arrays failed.\n"){}; |
1436 | return OFFLOAD_FAIL(~0); |
1437 | } |
1438 | } |
1439 | |
1440 | TgtPtrs.clear(); |
1441 | |
1442 | return OFFLOAD_SUCCESS(0); |
1443 | } |
1444 | }; |
1445 | |
1446 | /// Process data before launching the kernel, including calling targetDataBegin |
1447 | /// to map and transfer data to target device, transferring (first-)private |
1448 | /// variables. |
1449 | static int processDataBefore(ident_t *Loc, int64_t DeviceId, void *HostPtr, |
1450 | int32_t ArgNum, void **ArgBases, void **Args, |
1451 | int64_t *ArgSizes, int64_t *ArgTypes, |
1452 | map_var_info_t *ArgNames, void **ArgMappers, |
1453 | SmallVector<void *> &TgtArgs, |
1454 | SmallVector<ptrdiff_t> &TgtOffsets, |
1455 | PrivateArgumentManagerTy &PrivateArgumentManager, |
1456 | AsyncInfoTy &AsyncInfo) { |
1457 | TIMESCOPE_WITH_NAME_AND_IDENT("mappingBeforeTargetRegion", Loc)SourceInfo SI(Loc); llvm::TimeTraceScope TimeScope("mappingBeforeTargetRegion" , SI.getProfileLocation()); |
1458 | DeviceTy &Device = *PM->Devices[DeviceId]; |
1459 | int Ret = targetDataBegin(Loc, Device, ArgNum, ArgBases, Args, ArgSizes, |
1460 | ArgTypes, ArgNames, ArgMappers, AsyncInfo); |
1461 | if (Ret != OFFLOAD_SUCCESS(0)) { |
1462 | REPORT("Call to targetDataBegin failed, abort target.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Call to targetDataBegin failed, abort target.\n"); } while (0);; |
1463 | return OFFLOAD_FAIL(~0); |
1464 | } |
1465 | |
1466 | // List of (first-)private arrays allocated for this target region |
1467 | SmallVector<int> TgtArgsPositions(ArgNum, -1); |
1468 | |
1469 | for (int32_t I = 0; I < ArgNum; ++I) { |
1470 | if (!(ArgTypes[I] & OMP_TGT_MAPTYPE_TARGET_PARAM)) { |
1471 | // This is not a target parameter, do not push it into TgtArgs. |
1472 | // Check for lambda mapping. |
1473 | if (isLambdaMapping(ArgTypes[I])) { |
1474 | assert((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) &&(static_cast <bool> ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF ) && "PTR_AND_OBJ must be also MEMBER_OF.") ? void (0 ) : __assert_fail ("(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && \"PTR_AND_OBJ must be also MEMBER_OF.\"" , "openmp/libomptarget/src/omptarget.cpp", 1475, __extension__ __PRETTY_FUNCTION__)) |
1475 | "PTR_AND_OBJ must be also MEMBER_OF.")(static_cast <bool> ((ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF ) && "PTR_AND_OBJ must be also MEMBER_OF.") ? void (0 ) : __assert_fail ("(ArgTypes[I] & OMP_TGT_MAPTYPE_MEMBER_OF) && \"PTR_AND_OBJ must be also MEMBER_OF.\"" , "openmp/libomptarget/src/omptarget.cpp", 1475, __extension__ __PRETTY_FUNCTION__)); |
1476 | unsigned Idx = getParentIndex(ArgTypes[I]); |
1477 | int TgtIdx = TgtArgsPositions[Idx]; |
1478 | assert(TgtIdx != -1 && "Base address must be translated already.")(static_cast <bool> (TgtIdx != -1 && "Base address must be translated already." ) ? void (0) : __assert_fail ("TgtIdx != -1 && \"Base address must be translated already.\"" , "openmp/libomptarget/src/omptarget.cpp", 1478, __extension__ __PRETTY_FUNCTION__)); |
1479 | // The parent lambda must be processed already and it must be the last |
1480 | // in TgtArgs and TgtOffsets arrays. |
1481 | void *HstPtrVal = Args[I]; |
1482 | void *HstPtrBegin = ArgBases[I]; |
1483 | void *HstPtrBase = Args[Idx]; |
1484 | bool IsLast, IsHostPtr; // IsLast is unused. |
1485 | void *TgtPtrBase = |
1486 | (void *)((intptr_t)TgtArgs[TgtIdx] + TgtOffsets[TgtIdx]); |
1487 | DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase)){}; |
1488 | uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; |
1489 | void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta); |
1490 | void *&PointerTgtPtrBegin = AsyncInfo.getVoidPtrLocation(); |
1491 | TargetPointerResultTy TPR = Device.getTgtPtrBegin( |
1492 | HstPtrVal, ArgSizes[I], IsLast, /*UpdateRefCount=*/false, |
1493 | /*UseHoldRefCount=*/false, IsHostPtr); |
1494 | PointerTgtPtrBegin = TPR.TargetPointer; |
1495 | if (!TPR.isPresent()) { |
1496 | DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",{} |
1497 | DPxPTR(HstPtrVal)){}; |
1498 | continue; |
1499 | } |
1500 | if (IsHostPtr) { |
1501 | DP("Unified memory is active, no need to map lambda captured"{} |
1502 | "variable (" DPxMOD ")\n",{} |
1503 | DPxPTR(HstPtrVal)){}; |
1504 | continue; |
1505 | } |
1506 | DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",{} |
1507 | DPxPTR(PointerTgtPtrBegin), DPxPTR(TgtPtrBegin)){}; |
1508 | Ret = Device.submitData(TgtPtrBegin, &PointerTgtPtrBegin, |
1509 | sizeof(void *), AsyncInfo); |
1510 | if (Ret != OFFLOAD_SUCCESS(0)) { |
1511 | REPORT("Copying data to device failed.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Copying data to device failed.\n"); } while (0);; |
1512 | return OFFLOAD_FAIL(~0); |
1513 | } |
1514 | } |
1515 | continue; |
1516 | } |
1517 | void *HstPtrBegin = Args[I]; |
1518 | void *HstPtrBase = ArgBases[I]; |
1519 | void *TgtPtrBegin; |
1520 | map_var_info_t HstPtrName = (!ArgNames) ? nullptr : ArgNames[I]; |
1521 | ptrdiff_t TgtBaseOffset; |
1522 | bool IsLast, IsHostPtr; // unused. |
1523 | TargetPointerResultTy TPR; |
1524 | if (ArgTypes[I] & OMP_TGT_MAPTYPE_LITERAL) { |
1525 | DP("Forwarding first-private value " DPxMOD " to the target construct\n",{} |
1526 | DPxPTR(HstPtrBase)){}; |
1527 | TgtPtrBegin = HstPtrBase; |
1528 | TgtBaseOffset = 0; |
1529 | } else if (ArgTypes[I] & OMP_TGT_MAPTYPE_PRIVATE) { |
1530 | TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; |
1531 | const bool IsFirstPrivate = (ArgTypes[I] & OMP_TGT_MAPTYPE_TO); |
1532 | // If there is a next argument and it depends on the current one, we need |
1533 | // to allocate the private memory immediately. If this is not the case, |
1534 | // then the argument can be marked for optimization and packed with the |
1535 | // other privates. |
1536 | const bool AllocImmediately = |
1537 | (I < ArgNum - 1 && (ArgTypes[I + 1] & OMP_TGT_MAPTYPE_MEMBER_OF)); |
1538 | Ret = PrivateArgumentManager.addArg( |
1539 | HstPtrBegin, ArgSizes[I], TgtBaseOffset, IsFirstPrivate, TgtPtrBegin, |
1540 | TgtArgs.size(), HstPtrName, AllocImmediately); |
1541 | if (Ret != OFFLOAD_SUCCESS(0)) { |
1542 | REPORT("Failed to process %sprivate argument " DPxMOD "\n",do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Failed to process %sprivate argument " "0x%0*" "l" "x" "\n" , (IsFirstPrivate ? "first-" : ""), ((int)(2 * sizeof(uintptr_t ))), ((uintptr_t)(HstPtrBegin))); } while (0); |
1543 | (IsFirstPrivate ? "first-" : ""), DPxPTR(HstPtrBegin))do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Failed to process %sprivate argument " "0x%0*" "l" "x" "\n" , (IsFirstPrivate ? "first-" : ""), ((int)(2 * sizeof(uintptr_t ))), ((uintptr_t)(HstPtrBegin))); } while (0);; |
1544 | return OFFLOAD_FAIL(~0); |
1545 | } |
1546 | } else { |
1547 | if (ArgTypes[I] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) |
1548 | HstPtrBase = *reinterpret_cast<void **>(HstPtrBase); |
1549 | TPR = Device.getTgtPtrBegin(HstPtrBegin, ArgSizes[I], IsLast, |
1550 | /*UpdateRefCount=*/false, |
1551 | /*UseHoldRefCount=*/false, IsHostPtr); |
1552 | TgtPtrBegin = TPR.TargetPointer; |
1553 | TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; |
1554 | #ifdef OMPTARGET_DEBUG |
1555 | void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); |
1556 | DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",{} |
1557 | DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)){}; |
1558 | #endif |
1559 | } |
1560 | TgtArgsPositions[I] = TgtArgs.size(); |
1561 | TgtArgs.push_back(TgtPtrBegin); |
1562 | TgtOffsets.push_back(TgtBaseOffset); |
1563 | } |
1564 | |
1565 | assert(TgtArgs.size() == TgtOffsets.size() &&(static_cast <bool> (TgtArgs.size() == TgtOffsets.size( ) && "Size mismatch in arguments and offsets") ? void (0) : __assert_fail ("TgtArgs.size() == TgtOffsets.size() && \"Size mismatch in arguments and offsets\"" , "openmp/libomptarget/src/omptarget.cpp", 1566, __extension__ __PRETTY_FUNCTION__)) |
1566 | "Size mismatch in arguments and offsets")(static_cast <bool> (TgtArgs.size() == TgtOffsets.size( ) && "Size mismatch in arguments and offsets") ? void (0) : __assert_fail ("TgtArgs.size() == TgtOffsets.size() && \"Size mismatch in arguments and offsets\"" , "openmp/libomptarget/src/omptarget.cpp", 1566, __extension__ __PRETTY_FUNCTION__)); |
1567 | |
1568 | // Pack and transfer first-private arguments |
1569 | Ret = PrivateArgumentManager.packAndTransfer(TgtArgs); |
1570 | if (Ret != OFFLOAD_SUCCESS(0)) { |
1571 | DP("Failed to pack and transfer first private arguments\n"){}; |
1572 | return OFFLOAD_FAIL(~0); |
1573 | } |
1574 | |
1575 | return OFFLOAD_SUCCESS(0); |
1576 | } |
1577 | |
1578 | /// Process data after launching the kernel, including transferring data back to |
1579 | /// host if needed and deallocating target memory of (first-)private variables. |
1580 | static int processDataAfter(ident_t *Loc, int64_t DeviceId, void *HostPtr, |
1581 | int32_t ArgNum, void **ArgBases, void **Args, |
1582 | int64_t *ArgSizes, int64_t *ArgTypes, |
1583 | map_var_info_t *ArgNames, void **ArgMappers, |
1584 | PrivateArgumentManagerTy &PrivateArgumentManager, |
1585 | AsyncInfoTy &AsyncInfo) { |
1586 | TIMESCOPE_WITH_NAME_AND_IDENT("mappingAfterTargetRegion", Loc)SourceInfo SI(Loc); llvm::TimeTraceScope TimeScope("mappingAfterTargetRegion" , SI.getProfileLocation()); |
1587 | DeviceTy &Device = *PM->Devices[DeviceId]; |
1588 | |
1589 | // Move data from device. |
1590 | int Ret = targetDataEnd(Loc, Device, ArgNum, ArgBases, Args, ArgSizes, |
1591 | ArgTypes, ArgNames, ArgMappers, AsyncInfo); |
1592 | if (Ret != OFFLOAD_SUCCESS(0)) { |
1593 | REPORT("Call to targetDataEnd failed, abort target.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Call to targetDataEnd failed, abort target.\n"); } while ( 0);; |
1594 | return OFFLOAD_FAIL(~0); |
1595 | } |
1596 | |
1597 | // Free target memory for private arguments after synchronization. |
1598 | // TODO: We might want to remove `mutable` in the future by not changing the |
1599 | // captured variables somehow. |
1600 | AsyncInfo.addPostProcessingFunction( |
1601 | [PrivateArgumentManager = |
1602 | std::move(PrivateArgumentManager)]() mutable -> int { |
1603 | int Ret = PrivateArgumentManager.free(); |
1604 | if (Ret != OFFLOAD_SUCCESS(0)) { |
1605 | REPORT("Failed to deallocate target memory for private args\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Failed to deallocate target memory for private args\n"); } while (0);; |
1606 | return OFFLOAD_FAIL(~0); |
1607 | } |
1608 | return Ret; |
1609 | }); |
1610 | |
1611 | return OFFLOAD_SUCCESS(0); |
1612 | } |
1613 | } // namespace |
1614 | |
1615 | /// performs the same actions as data_begin in case arg_num is |
1616 | /// non-zero and initiates run of the offloaded region on the target platform; |
1617 | /// if arg_num is non-zero after the region execution is done it also |
1618 | /// performs the same action as data_update and data_end above. This function |
1619 | /// returns 0 if it was able to transfer the execution to a target and an |
1620 | /// integer different from zero otherwise. |
1621 | int target(ident_t *Loc, DeviceTy &Device, void *HostPtr, |
1622 | KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo) { |
1623 | int32_t DeviceId = Device.DeviceID; |
1624 | TableMap *TM = getTableMap(HostPtr); |
1625 | // No map for this host pointer found! |
1626 | if (!TM) { |
1627 | REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Host ptr " "0x%0*" "l" "x" " does not have a matching target pointer.\n" , ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(HostPtr))); } while (0); |
1628 | DPxPTR(HostPtr))do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Host ptr " "0x%0*" "l" "x" " does not have a matching target pointer.\n" , ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(HostPtr))); } while (0);; |
1629 | return OFFLOAD_FAIL(~0); |
1630 | } |
1631 | |
1632 | // get target table. |
1633 | __tgt_target_table *TargetTable = nullptr; |
1634 | { |
1635 | std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx); |
1636 | assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&(static_cast <bool> (TM->Table->TargetsTable.size () > (size_t)DeviceId && "Not expecting a device ID outside the table's bounds!" ) ? void (0) : __assert_fail ("TM->Table->TargetsTable.size() > (size_t)DeviceId && \"Not expecting a device ID outside the table's bounds!\"" , "openmp/libomptarget/src/omptarget.cpp", 1637, __extension__ __PRETTY_FUNCTION__)) |
1637 | "Not expecting a device ID outside the table's bounds!")(static_cast <bool> (TM->Table->TargetsTable.size () > (size_t)DeviceId && "Not expecting a device ID outside the table's bounds!" ) ? void (0) : __assert_fail ("TM->Table->TargetsTable.size() > (size_t)DeviceId && \"Not expecting a device ID outside the table's bounds!\"" , "openmp/libomptarget/src/omptarget.cpp", 1637, __extension__ __PRETTY_FUNCTION__)); |
1638 | TargetTable = TM->Table->TargetsTable[DeviceId]; |
1639 | } |
1640 | assert(TargetTable && "Global data has not been mapped\n")(static_cast <bool> (TargetTable && "Global data has not been mapped\n" ) ? void (0) : __assert_fail ("TargetTable && \"Global data has not been mapped\\n\"" , "openmp/libomptarget/src/omptarget.cpp", 1640, __extension__ __PRETTY_FUNCTION__)); |
1641 | |
1642 | DP("loop trip count is %" PRIu64 ".\n", KernelArgs.Tripcount){}; |
1643 | |
1644 | // We need to keep bases and offsets separate. Sometimes (e.g. in OpenCL) we |
1645 | // need to manifest base pointers prior to launching a kernel. Even if we have |
1646 | // mapped an object only partially, e.g. A[N:M], although the kernel is |
1647 | // expected to access elements starting at address &A[N] and beyond, we still |
1648 | // need to manifest the base of the array &A[0]. In other cases, e.g. the COI |
1649 | // API, we need the begin address itself, i.e. &A[N], as the API operates on |
1650 | // begin addresses, not bases. That's why we pass args and offsets as two |
1651 | // separate entities so that each plugin can do what it needs. This behavior |
1652 | // was introdued via https://reviews.llvm.org/D33028 and commit 1546d319244c. |
1653 | SmallVector<void *> TgtArgs; |
1654 | SmallVector<ptrdiff_t> TgtOffsets; |
1655 | |
1656 | PrivateArgumentManagerTy PrivateArgumentManager(Device, AsyncInfo); |
1657 | |
1658 | int NumClangLaunchArgs = KernelArgs.NumArgs; |
1659 | int Ret = OFFLOAD_SUCCESS(0); |
1660 | if (NumClangLaunchArgs) { |
1661 | // Process data, such as data mapping, before launching the kernel |
1662 | Ret = processDataBefore(Loc, DeviceId, HostPtr, NumClangLaunchArgs, |
1663 | KernelArgs.ArgBasePtrs, KernelArgs.ArgPtrs, |
1664 | KernelArgs.ArgSizes, KernelArgs.ArgTypes, |
1665 | KernelArgs.ArgNames, KernelArgs.ArgMappers, TgtArgs, |
1666 | TgtOffsets, PrivateArgumentManager, AsyncInfo); |
1667 | if (Ret != OFFLOAD_SUCCESS(0)) { |
1668 | REPORT("Failed to process data before launching the kernel.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Failed to process data before launching the kernel.\n"); } while (0);; |
1669 | return OFFLOAD_FAIL(~0); |
1670 | } |
1671 | |
1672 | // Clang might pass more values via the ArgPtrs to the runtime that we pass |
1673 | // on to the kernel. |
1674 | // TOOD: Next time we adjust the KernelArgsTy we should introduce a new |
1675 | // NumKernelArgs field. |
1676 | KernelArgs.NumArgs = TgtArgs.size(); |
1677 | } |
1678 | |
1679 | // Launch device execution. |
1680 | void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr; |
1681 | DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",{} |
1682 | TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index){}; |
1683 | |
1684 | { |
1685 | assert(KernelArgs.NumArgs == TgtArgs.size() && "Argument count mismatch!")(static_cast <bool> (KernelArgs.NumArgs == TgtArgs.size () && "Argument count mismatch!") ? void (0) : __assert_fail ("KernelArgs.NumArgs == TgtArgs.size() && \"Argument count mismatch!\"" , "openmp/libomptarget/src/omptarget.cpp", 1685, __extension__ __PRETTY_FUNCTION__)); |
1686 | TIMESCOPE_WITH_NAME_AND_IDENT("Initiate Kernel Launch", Loc)SourceInfo SI(Loc); llvm::TimeTraceScope TimeScope("Initiate Kernel Launch" , SI.getProfileLocation()); |
1687 | Ret = Device.launchKernel(TgtEntryPtr, TgtArgs.data(), TgtOffsets.data(), |
1688 | KernelArgs, AsyncInfo); |
1689 | } |
1690 | |
1691 | if (Ret != OFFLOAD_SUCCESS(0)) { |
1692 | REPORT("Executing target region abort target.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Executing target region abort target.\n"); } while (0);; |
1693 | return OFFLOAD_FAIL(~0); |
1694 | } |
1695 | |
1696 | if (NumClangLaunchArgs) { |
1697 | // Transfer data back and deallocate target memory for (first-)private |
1698 | // variables |
1699 | Ret = processDataAfter(Loc, DeviceId, HostPtr, NumClangLaunchArgs, |
1700 | KernelArgs.ArgBasePtrs, KernelArgs.ArgPtrs, |
1701 | KernelArgs.ArgSizes, KernelArgs.ArgTypes, |
1702 | KernelArgs.ArgNames, KernelArgs.ArgMappers, |
1703 | PrivateArgumentManager, AsyncInfo); |
1704 | if (Ret != OFFLOAD_SUCCESS(0)) { |
1705 | REPORT("Failed to process data after launching the kernel.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Failed to process data after launching the kernel.\n"); } while (0);; |
1706 | return OFFLOAD_FAIL(~0); |
1707 | } |
1708 | } |
1709 | |
1710 | return OFFLOAD_SUCCESS(0); |
1711 | } |
1712 | |
1713 | /// Executes a kernel using pre-recorded information for loading to |
1714 | /// device memory to launch the target kernel with the pre-recorded |
1715 | /// configuration. |
1716 | int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr, |
1717 | void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs, |
1718 | ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams, |
1719 | int32_t ThreadLimit, uint64_t LoopTripCount, |
1720 | AsyncInfoTy &AsyncInfo) { |
1721 | int32_t DeviceId = Device.DeviceID; |
1722 | TableMap *TM = getTableMap(HostPtr); |
1723 | // Fail if the table map fails to find the target kernel pointer for the |
1724 | // provided host pointer. |
1725 | if (!TM) { |
1726 | REPORT("Host ptr " DPxMOD " does not have a matching target pointer.\n",do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Host ptr " "0x%0*" "l" "x" " does not have a matching target pointer.\n" , ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(HostPtr))); } while (0); |
1727 | DPxPTR(HostPtr))do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Host ptr " "0x%0*" "l" "x" " does not have a matching target pointer.\n" , ((int)(2 * sizeof(uintptr_t))), ((uintptr_t)(HostPtr))); } while (0);; |
1728 | return OFFLOAD_FAIL(~0); |
1729 | } |
1730 | |
1731 | // Retrieve the target table of offloading entries. |
1732 | __tgt_target_table *TargetTable = nullptr; |
1733 | { |
1734 | std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx); |
1735 | assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&(static_cast <bool> (TM->Table->TargetsTable.size () > (size_t)DeviceId && "Not expecting a device ID outside the table's bounds!" ) ? void (0) : __assert_fail ("TM->Table->TargetsTable.size() > (size_t)DeviceId && \"Not expecting a device ID outside the table's bounds!\"" , "openmp/libomptarget/src/omptarget.cpp", 1736, __extension__ __PRETTY_FUNCTION__)) |
1736 | "Not expecting a device ID outside the table's bounds!")(static_cast <bool> (TM->Table->TargetsTable.size () > (size_t)DeviceId && "Not expecting a device ID outside the table's bounds!" ) ? void (0) : __assert_fail ("TM->Table->TargetsTable.size() > (size_t)DeviceId && \"Not expecting a device ID outside the table's bounds!\"" , "openmp/libomptarget/src/omptarget.cpp", 1736, __extension__ __PRETTY_FUNCTION__)); |
1737 | TargetTable = TM->Table->TargetsTable[DeviceId]; |
1738 | } |
1739 | assert(TargetTable && "Global data has not been mapped\n")(static_cast <bool> (TargetTable && "Global data has not been mapped\n" ) ? void (0) : __assert_fail ("TargetTable && \"Global data has not been mapped\\n\"" , "openmp/libomptarget/src/omptarget.cpp", 1739, __extension__ __PRETTY_FUNCTION__)); |
1740 | |
1741 | // Retrieve the target kernel pointer, allocate and store the recorded device |
1742 | // memory data, and launch device execution. |
1743 | void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].addr; |
1744 | DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",{} |
1745 | TargetTable->EntriesBegin[TM->Index].name, DPxPTR(TgtEntryPtr), TM->Index){}; |
1746 | |
1747 | void *TgtPtr = Device.allocData(DeviceMemorySize, /* HstPtr */ nullptr, |
1748 | TARGET_ALLOC_DEFAULT); |
1749 | Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo); |
1750 | |
1751 | KernelArgsTy KernelArgs = {0}; |
1752 | KernelArgs.Version = 2; |
1753 | KernelArgs.NumArgs = NumArgs; |
1754 | KernelArgs.Tripcount = LoopTripCount; |
1755 | KernelArgs.NumTeams[0] = NumTeams; |
1756 | KernelArgs.ThreadLimit[0] = ThreadLimit; |
1757 | |
1758 | int Ret = Device.launchKernel(TgtEntryPtr, TgtArgs, TgtOffsets, KernelArgs, |
1759 | AsyncInfo); |
1760 | |
1761 | if (Ret != OFFLOAD_SUCCESS(0)) { |
1762 | REPORT("Executing target region abort target.\n")do { fprintf(stderr, "Libomptarget" " error: "); fprintf(stderr , "Executing target region abort target.\n"); } while (0);; |
1763 | return OFFLOAD_FAIL(~0); |
1764 | } |
1765 | |
1766 | return OFFLOAD_SUCCESS(0); |
1767 | } |