1//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is dual licensed under the MIT and the University of Illinois Open
6// Source Licenses. See LICENSE.txt for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// Implementation of the interface to be used by Clang during the codegen of a
11// target region.
12//
13//===----------------------------------------------------------------------===//
14
15#include <algorithm>
16#include <cassert>
17#include <climits>
18#include <cstdlib>
19#include <cstring>
20#include <dlfcn.h>
21#include <list>
22#include <map>
23#include <mutex>
24#include <string>
25#include <vector>
26
27// Header file global to this project
28#include "omptarget.h"
29
30#define DP(...) DEBUGP("Libomptarget", __VA_ARGS__)
31#define INF_REF_CNT (LONG_MAX>>1) // leave room for additions/subtractions
32#define CONSIDERED_INF(x) (x > (INF_REF_CNT>>1))
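// Illustrative note: entries created via omp_target_associate_ptr get
// RefCount = INF_REF_CNT so the regular reference-counting path never frees
// them. CONSIDERED_INF deliberately tests against INF_REF_CNT>>1 rather than
// for equality, so a count that has drifted by a few stray increments or
// decrements (e.g. INF_REF_CNT - 3) is still treated as infinite.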
33
34// List of all plugins that can support offloading.
35static const char *RTLNames[] = {
36 /* PowerPC target */ "libomptarget.rtl.ppc64.so",
37 /* x86_64 target */ "libomptarget.rtl.x86_64.so",
38 /* CUDA target */ "libomptarget.rtl.cuda.so"};
39
40// forward declarations
41struct RTLInfoTy;
42static int target(int32_t device_id, void *host_ptr, int32_t arg_num,
43 void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
44 int32_t team_num, int32_t thread_limit, int IsTeamConstruct);
45
46/// Map between host data and target data.
47struct HostDataToTargetTy {
48 uintptr_t HstPtrBase; // host info.
49 uintptr_t HstPtrBegin;
50 uintptr_t HstPtrEnd; // non-inclusive.
51
52 uintptr_t TgtPtrBegin; // target info.
53
54 long RefCount;
55
56 HostDataToTargetTy()
57 : HstPtrBase(0), HstPtrBegin(0), HstPtrEnd(0),
58 TgtPtrBegin(0), RefCount(0) {}
59 HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TB)
60 : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E),
61 TgtPtrBegin(TB), RefCount(1) {}
62};
63
64typedef std::list<HostDataToTargetTy> HostDataToTargetListTy;
65
66struct LookupResult {
67 struct {
68 unsigned IsContained : 1;
69 unsigned ExtendsBefore : 1;
70 unsigned ExtendsAfter : 1;
71 } Flags;
72
73 HostDataToTargetListTy::iterator Entry;
74
75 LookupResult() : Flags({0,0,0}), Entry() {}
76};
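// Illustrative example of the lookup flags, assuming an existing mapping that
// covers the host range [0x1000, 0x1100):
//   lookup(0x1040, 0x20) -> IsContained   (request lies fully inside)
//   lookup(0x0ff0, 0x20) -> ExtendsBefore (starts before, overlaps the begin)
//   lookup(0x10f0, 0x20) -> ExtendsAfter  (starts inside, runs past the end)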
77
78/// Map for shadow pointers
79struct ShadowPtrValTy {
80 void *HstPtrVal;
81 void *TgtPtrAddr;
82 void *TgtPtrVal;
83};
84typedef std::map<void *, ShadowPtrValTy> ShadowPtrListTy;
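// Note (assumption, based on how libomptarget handles PTR_AND_OBJ mappings):
// when a pointer and its pointee are mapped together, the device copy of the
// pointer is rewritten to hold the pointee's target address. Each ShadowPtrMap
// entry, keyed by the host address of the pointer, remembers the original host
// value (HstPtrVal) plus where (TgtPtrAddr) and what (TgtPtrVal) was written
// on the device so the association can be maintained or undone later.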
85
86///
87struct PendingCtorDtorListsTy {
88 std::list<void *> PendingCtors;
89 std::list<void *> PendingDtors;
90};
91typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy>
92 PendingCtorsDtorsPerLibrary;
93
94struct DeviceTy {
95 int32_t DeviceID;
96 RTLInfoTy *RTL;
97 int32_t RTLDeviceID;
98
99 bool IsInit;
100 std::once_flag InitFlag;
101 bool HasPendingGlobals;
102
103 HostDataToTargetListTy HostDataToTargetMap;
104 PendingCtorsDtorsPerLibrary PendingCtorsDtors;
105
106 ShadowPtrListTy ShadowPtrMap;
107
108 std::mutex DataMapMtx, PendingGlobalsMtx, ShadowMtx;
109
110 uint64_t loopTripCnt;
111
112 DeviceTy(RTLInfoTy *RTL)
113 : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(),
114 HasPendingGlobals(false), HostDataToTargetMap(),
115 PendingCtorsDtors(), ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(),
116 ShadowMtx(), loopTripCnt(0) {}
117
118 // The existence of mutexes makes DeviceTy non-copyable. We need to
119 // provide a copy constructor and an assignment operator explicitly.
120 DeviceTy(const DeviceTy &d)
121 : DeviceID(d.DeviceID), RTL(d.RTL), RTLDeviceID(d.RTLDeviceID),
122 IsInit(d.IsInit), InitFlag(), HasPendingGlobals(d.HasPendingGlobals),
123 HostDataToTargetMap(d.HostDataToTargetMap),
124 PendingCtorsDtors(d.PendingCtorsDtors), ShadowPtrMap(d.ShadowPtrMap),
125 DataMapMtx(), PendingGlobalsMtx(),
126 ShadowMtx(), loopTripCnt(d.loopTripCnt) {}
127
128 DeviceTy& operator=(const DeviceTy &d) {
129 DeviceID = d.DeviceID;
130 RTL = d.RTL;
131 RTLDeviceID = d.RTLDeviceID;
132 IsInit = d.IsInit;
133 HasPendingGlobals = d.HasPendingGlobals;
134 HostDataToTargetMap = d.HostDataToTargetMap;
135 PendingCtorsDtors = d.PendingCtorsDtors;
136 ShadowPtrMap = d.ShadowPtrMap;
137 loopTripCnt = d.loopTripCnt;
138
139 return *this;
140 }
141
142 long getMapEntryRefCnt(void *HstPtrBegin);
143 LookupResult lookupMapping(void *HstPtrBegin, int64_t Size);
144 void *getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size,
145 bool &IsNew, bool IsImplicit, bool UpdateRefCount = true);
146 void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size);
147 void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
148 bool UpdateRefCount);
149 int deallocTgtPtr(void *TgtPtrBegin, int64_t Size, bool ForceDelete);
150 int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
151 int disassociatePtr(void *HstPtrBegin);
152
153 // calls to RTL
154 int32_t initOnce();
155 __tgt_target_table *load_binary(void *Img);
156
157 int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size);
158 int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
159
160 int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr, int32_t TgtVarsSize);
161 int32_t run_team_region(void *TgtEntryPtr, void **TgtVarsPtr,
162 int32_t TgtVarsSize, int32_t NumTeams, int32_t ThreadLimit,
163 uint64_t LoopTripCount);
164
165private:
166 // Call to RTL
167 void init(); // To be called only via DeviceTy::initOnce()
168};
169
170/// Map between Device ID (i.e. openmp device id) and its DeviceTy.
171typedef std::vector<DeviceTy> DevicesTy;
172static DevicesTy Devices;
173
174struct RTLInfoTy {
175 typedef int32_t(is_valid_binary_ty)(void *);
176 typedef int32_t(number_of_devices_ty)();
177 typedef int32_t(init_device_ty)(int32_t);
178 typedef __tgt_target_table *(load_binary_ty)(int32_t, void *);
179 typedef void *(data_alloc_ty)(int32_t, int64_t);
180 typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t);
181 typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t);
182 typedef int32_t(data_delete_ty)(int32_t, void *);
183 typedef int32_t(run_region_ty)(int32_t, void *, void **, int32_t);
184 typedef int32_t(run_team_region_ty)(int32_t, void *, void **, int32_t,
185 int32_t, int32_t, uint64_t);
186
187 int32_t Idx; // RTL index, index is the number of devices
188 // of other RTLs that were registered before,
189 // i.e. the OpenMP index of the first device
190 // to be registered with this RTL.
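  // Illustrative example: if an x86_64 RTL registered 4 devices first, a CUDA
  // RTL registered next gets Idx = 4 and owns OpenMP device IDs
  // 4 .. 4 + NumberOfDevices - 1.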
191 int32_t NumberOfDevices; // Number of devices this RTL deals with.
192 std::vector<DeviceTy *> Devices; // one per device (NumberOfDevices in total).
193
194 void *LibraryHandler;
195
196#ifdef OMPTARGET_DEBUG
197 std::string RTLName;
198#endif
199
200 // Functions implemented in the RTL.
201 is_valid_binary_ty *is_valid_binary;
202 number_of_devices_ty *number_of_devices;
203 init_device_ty *init_device;
204 load_binary_ty *load_binary;
205 data_alloc_ty *data_alloc;
206 data_submit_ty *data_submit;
207 data_retrieve_ty *data_retrieve;
208 data_delete_ty *data_delete;
209 run_region_ty *run_region;
210 run_team_region_ty *run_team_region;
211
212 // Whether there are images associated with this RTL.
213 bool isUsed;
214
215 // Mutex for thread-safety when calling RTL interface functions.
216 // It is easier to enforce thread-safety at the libomptarget level,
217 // so that developers of new RTLs do not have to worry about it.
218 std::mutex Mtx;
219
220 // The existence of the mutex above makes RTLInfoTy non-copyable.
221 // We need to provide a copy constructor explicitly.
222 RTLInfoTy()
223 : Idx(-1), NumberOfDevices(-1), Devices(), LibraryHandler(0),
224#ifdef OMPTARGET_DEBUG
225 RTLName(),
226#endif
227 is_valid_binary(0), number_of_devices(0), init_device(0),
228 load_binary(0), data_alloc(0), data_submit(0), data_retrieve(0),
229 data_delete(0), run_region(0), run_team_region(0), isUsed(false),
230 Mtx() {}
231
232 RTLInfoTy(const RTLInfoTy &r) : Mtx() {
233 Idx = r.Idx;
234 NumberOfDevices = r.NumberOfDevices;
235 Devices = r.Devices;
236 LibraryHandler = r.LibraryHandler;
237#ifdef OMPTARGET_DEBUG
238 RTLName = r.RTLName;
239#endif
240 is_valid_binary = r.is_valid_binary;
241 number_of_devices = r.number_of_devices;
242 init_device = r.init_device;
243 load_binary = r.load_binary;
244 data_alloc = r.data_alloc;
245 data_submit = r.data_submit;
246 data_retrieve = r.data_retrieve;
247 data_delete = r.data_delete;
248 run_region = r.run_region;
249 run_team_region = r.run_team_region;
250 isUsed = r.isUsed;
251 }
252};
253
254/// RTLs identified in the system.
255class RTLsTy {
256private:
257 // Mutex-like object to guarantee thread-safety and unique initialization
258 // (i.e. the library attempts to load the RTLs (plugins) only once).
259 std::once_flag initFlag;
260 void LoadRTLs(); // not thread-safe
261
262public:
263 // List of the detected runtime libraries.
264 std::list<RTLInfoTy> AllRTLs;
265
266 // Array of pointers to the detected runtime libraries that have compatible
267 // binaries.
268 std::vector<RTLInfoTy *> UsedRTLs;
269
270 explicit RTLsTy() {}
271
272 // Load all the runtime libraries (plugins) if not done before.
273 void LoadRTLsOnce();
274};
275
276void RTLsTy::LoadRTLs() {
277 // Parse environment variable OMP_TARGET_OFFLOAD (if set)
278 char *envStr = getenv("OMP_TARGET_OFFLOAD");
279 if (envStr && !strcmp(envStr, "DISABLED")) {
280 DP("Target offloading disabled by environment\n");
281 return;
282 }
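  // Illustrative usage: running the application as
  //   OMP_TARGET_OFFLOAD=DISABLED ./app
  // prevents any plugin from being loaded, so no offload devices are ever
  // registered.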
283
284 DP("Loading RTLs...\n");
285
286 // Attempt to open all the plugins and, if they exist, check if the interface
287 // is correct and if they are supporting any devices.
288 for (auto *Name : RTLNames) {
289 DP("Loading library '%s'...\n", Name);
290 void *dynlib_handle = dlopen(Name, RTLD_NOW);
291
292 if (!dynlib_handle) {
293 // Library does not exist or cannot be found.
294 DP("Unable to load library '%s': %s!\n", Name, dlerror());
295 continue;
296 }
297
298 DP("Successfully loaded library '%s'!\n", Name);
299
300 // Retrieve the RTL information from the runtime library.
301 RTLInfoTy R;
302
303 R.LibraryHandler = dynlib_handle;
304 R.isUsed = false;
305
306#ifdef OMPTARGET_DEBUG
307 R.RTLName = Name;
308#endif
309
310 if (!(R.is_valid_binary = (RTLInfoTy::is_valid_binary_ty *)dlsym(
311 dynlib_handle, "__tgt_rtl_is_valid_binary")))
312 continue;
313 if (!(R.number_of_devices = (RTLInfoTy::number_of_devices_ty *)dlsym(
314 dynlib_handle, "__tgt_rtl_number_of_devices")))
315 continue;
316 if (!(R.init_device = (RTLInfoTy::init_device_ty *)dlsym(
317 dynlib_handle, "__tgt_rtl_init_device")))
318 continue;
319 if (!(R.load_binary = (RTLInfoTy::load_binary_ty *)dlsym(
320 dynlib_handle, "__tgt_rtl_load_binary")))
321 continue;
322 if (!(R.data_alloc = (RTLInfoTy::data_alloc_ty *)dlsym(
323 dynlib_handle, "__tgt_rtl_data_alloc")))
324 continue;
325 if (!(R.data_submit = (RTLInfoTy::data_submit_ty *)dlsym(
326 dynlib_handle, "__tgt_rtl_data_submit")))
327 continue;
328 if (!(R.data_retrieve = (RTLInfoTy::data_retrieve_ty *)dlsym(
329 dynlib_handle, "__tgt_rtl_data_retrieve")))
330 continue;
331 if (!(R.data_delete = (RTLInfoTy::data_delete_ty *)dlsym(
332 dynlib_handle, "__tgt_rtl_data_delete")))
333 continue;
334 if (!(R.run_region = (RTLInfoTy::run_region_ty *)dlsym(
335 dynlib_handle, "__tgt_rtl_run_target_region")))
336 continue;
337 if (!(R.run_team_region = (RTLInfoTy::run_team_region_ty *)dlsym(
338 dynlib_handle, "__tgt_rtl_run_target_team_region")))
339 continue;
340
341 // No devices are supported by this RTL?
342 if (!(R.NumberOfDevices = R.number_of_devices())) {
343 DP("No devices supported in this RTL\n");
344 continue;
345 }
346
347 DP("Registering RTL %s supporting %d devices!\n",
348 R.RTLName.c_str(), R.NumberOfDevices);
349
350 // The RTL is valid! Will save the information in the RTLs list.
351 AllRTLs.push_back(R);
352 }
353
354 DP("RTLs loaded!\n");
355
356 return;
357}
358
359void RTLsTy::LoadRTLsOnce() {
360 // RTL.LoadRTLs() is called only once in a thread-safe fashion.
361 std::call_once(initFlag, &RTLsTy::LoadRTLs, this);
362}
363
364static RTLsTy RTLs;
365static std::mutex RTLsMtx;
366
367/// Map between the host entry begin and the translation table. Each
368/// registered library gets one TranslationTable. Use the map from
369/// __tgt_offload_entry so that we may quickly determine whether we
370/// are trying to (re)register an existing lib or really have a new one.
371struct TranslationTable {
372 __tgt_target_table HostTable;
373
374 // Image assigned to a given device.
375 std::vector<__tgt_device_image *> TargetsImages; // One image per device ID.
376
377 // Table of entry points or NULL if it was not already computed.
378 std::vector<__tgt_target_table *> TargetsTable; // One table per device ID.
379};
380typedef std::map<__tgt_offload_entry *, TranslationTable>
381 HostEntriesBeginToTransTableTy;
382static HostEntriesBeginToTransTableTy HostEntriesBeginToTransTable;
383static std::mutex TrlTblMtx;
384
385/// Map between the host ptr and a table index
386struct TableMap {
387 TranslationTable *Table; // table associated with the host ptr.
388 uint32_t Index; // index in which the host ptr translated entry is found.
389 TableMap() : Table(0), Index(0) {}
390 TableMap(TranslationTable *table, uint32_t index)
391 : Table(table), Index(index) {}
392};
393typedef std::map<void *, TableMap> HostPtrToTableMapTy;
394static HostPtrToTableMapTy HostPtrToTableMap;
395static std::mutex TblMapMtx;
396
397/// Check whether a device has an associated RTL and initialize it if it's not
398/// already initialized.
399static bool device_is_ready(int device_num) {
400 DP("Checking whether device %d is ready.\n", device_num);
401 // Devices.size() can only change while registering a new
402 // library, so try to acquire the lock of RTLs' mutex.
403 RTLsMtx.lock();
404 size_t Devices_size = Devices.size();
405 RTLsMtx.unlock();
406 if (Devices_size <= (size_t)device_num) {
407 DP("Device ID %d does not have a matching RTL\n", device_num);
408 return false;
409 }
410
411 // Get device info
412 DeviceTy &Device = Devices[device_num];
413
414 DP("Is the device %d (local ID %d) initialized? %d\n", device_num,
415 Device.RTLDeviceID, Device.IsInit);
416
417 // Init the device if not done before
418 if (!Device.IsInit && Device.initOnce() != OFFLOAD_SUCCESS) {
419 DP("Failed to init device %d\n", device_num);
420 return false;
421 }
422
423 DP("Device %d is ready to use.\n", device_num);
424
425 return true;
426}
427
428////////////////////////////////////////////////////////////////////////////////
429// Target API functions
430//
431EXTERN int omp_get_num_devices(void) {
432 RTLsMtx.lock();
433 size_t Devices_size = Devices.size();
434 RTLsMtx.unlock();
435
436 DP("Call to omp_get_num_devices returning %zd\n", Devices_size);
437
438 return Devices_size;
439}
440
441EXTERN int omp_get_initial_device(void) {
442 DP("Call to omp_get_initial_device returning %d\n", HOST_DEVICE);
443 return HOST_DEVICE;
444}
445
446EXTERN void *omp_target_alloc(size_t size, int device_num) {
447 DP("Call to omp_target_alloc for device %d requesting %zu bytes\n",
448 device_num, size);
449
450 if (size <= 0) {
451 DP("Call to omp_target_alloc with non-positive length\n");
452 return NULL;
453 }
454
455 void *rc = NULL;
456
457 if (device_num == omp_get_initial_device()) {
458 rc = malloc(size);
459 DP("omp_target_alloc returns host ptr " DPxMOD "\n", DPxPTR(rc));
460 return rc;
461 }
462
463 if (!device_is_ready(device_num)) {
464 DP("omp_target_alloc returns NULL ptr\n");
465 return NULL;
466 }
467
468 DeviceTy &Device = Devices[device_num];
469 rc = Device.RTL->data_alloc(Device.RTLDeviceID, size);
470 DP("omp_target_alloc returns device ptr " DPxMOD "\n", DPxPTR(rc));
471 return rc;
472}
473
474EXTERN void omp_target_free(void *device_ptr, int device_num) {
475 DP("Call to omp_target_free for device %d and address " DPxMOD "\n",
476 device_num, DPxPTR(device_ptr));
477
478 if (!device_ptr) {
479 DP("Call to omp_target_free with NULL ptr\n");
480 return;
481 }
482
483 if (device_num == omp_get_initial_device()) {
484 free(device_ptr);
485 DP("omp_target_free deallocated host ptr\n");
486 return;
487 }
488
489 if (!device_is_ready(device_num)) {
490 DP("omp_target_free returns, nothing to do\n");
491 return;
492 }
493
494 DeviceTy &Device = Devices[device_num];
495 Device.RTL->data_delete(Device.RTLDeviceID, (void *)device_ptr);
496 DP("omp_target_free deallocated device ptr\n");
497}
498
499EXTERN int omp_target_is_present(void *ptr, int device_num) {
500 DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n",
501 device_num, DPxPTR(ptr));
502
503 if (!ptr) {
504 DP("Call to omp_target_is_present with NULL ptr, returning false\n");
505 return false;
506 }
507
508 if (device_num == omp_get_initial_device()) {
509 DP("Call to omp_target_is_present on host, returning true\n");
510 return true;
511 }
512
513 RTLsMtx.lock();
514 size_t Devices_size = Devices.size();
515 RTLsMtx.unlock();
516 if (Devices_size <= (size_t)device_num) {
517 DP("Call to omp_target_is_present with invalid device ID, returning "
518 "false\n");
519 return false;
520 }
521
522 DeviceTy& Device = Devices[device_num];
523 bool IsLast; // not used
524 int rc = (Device.getTgtPtrBegin(ptr, 0, IsLast, false) != NULL);
525 DP("Call to omp_target_is_present returns %d\n", rc);
526 return rc;
527}
528
529EXTERN int omp_target_memcpy(void *dst, void *src, size_t length,
530 size_t dst_offset, size_t src_offset, int dst_device, int src_device) {
531 DP("Call to omp_target_memcpy, dst device %d, src device %d, "
532 "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
533 "src offset %zu, length %zu\n", dst_device, src_device, DPxPTR(dst),
534 DPxPTR(src), dst_offset, src_offset, length);
535
536 if (!dst || !src || length <= 0) {
537 DP("Call to omp_target_memcpy with invalid arguments\n");
538 return OFFLOAD_FAIL;
539 }
540
541 if (src_device != omp_get_initial_device() && !device_is_ready(src_device)) {
542 DP("omp_target_memcpy returns OFFLOAD_FAIL\n");
543 return OFFLOAD_FAIL;
544 }
545
546 if (dst_device != omp_get_initial_device() && !device_is_ready(dst_device)) {
547 DP("omp_target_memcpy returns OFFLOAD_FAIL\n");
548 return OFFLOAD_FAIL;
549 }
550
551 int rc = OFFLOAD_SUCCESS;
552 void *srcAddr = (char *)src + src_offset;
553 void *dstAddr = (char *)dst + dst_offset;
554
555 if (src_device == omp_get_initial_device() &&
556 dst_device == omp_get_initial_device()) {
557 DP("copy from host to host\n");
558 const void *p = memcpy(dstAddr, srcAddr, length);
559 if (p == NULL)
560 rc = OFFLOAD_FAIL;
561 } else if (src_device == omp_get_initial_device()) {
562 DP("copy from host to device\n");
563 DeviceTy& DstDev = Devices[dst_device];
564 rc = DstDev.data_submit(dstAddr, srcAddr, length);
565 } else if (dst_device == omp_get_initial_device()) {
566 DP("copy from device to host\n");
567 DeviceTy& SrcDev = Devices[src_device];
568 rc = SrcDev.data_retrieve(dstAddr, srcAddr, length);
569 } else {
570 DP("copy from device to device\n");
571 void *buffer = malloc(length);
572 DeviceTy& SrcDev = Devices[src_device];
573 DeviceTy& DstDev = Devices[dst_device];
574 rc = SrcDev.data_retrieve(buffer, srcAddr, length);
575 if (rc == OFFLOAD_SUCCESS)
576 rc = DstDev.data_submit(dstAddr, buffer, length);
577 }
578
579 DP("omp_target_memcpy returns %d\n", rc);
580 return rc;
581}
582
583EXTERN int omp_target_memcpy_rect(void *dst, void *src, size_t element_size,
584 int num_dims, const size_t *volume, const size_t *dst_offsets,
585 const size_t *src_offsets, const size_t *dst_dimensions,
586 const size_t *src_dimensions, int dst_device, int src_device) {
587 DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, "
588 "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
589 "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
590 "volume " DPxMOD ", element size %zu, num_dims %d\n", dst_device,
591 src_device, DPxPTR(dst), DPxPTR(src), DPxPTR(dst_offsets),
592 DPxPTR(src_offsets), DPxPTR(dst_dimensions), DPxPTR(src_dimensions),
593 DPxPTR(volume), element_size, num_dims);
594
595 if (!(dst || src)) {
596 DP("Call to omp_target_memcpy_rect returns max supported dimensions %d\n",
597 INT_MAX);
598 return INT_MAX;
599 }
600
601 if (!dst || !src || element_size < 1 || num_dims < 1 || !volume ||
602 !dst_offsets || !src_offsets || !dst_dimensions || !src_dimensions) {
603 DP("Call to omp_target_memcpy_rect with invalid arguments\n");
604 return OFFLOAD_FAIL;
605 }
606
607 int rc;
608 if (num_dims == 1) {
609 rc = omp_target_memcpy(dst, src, element_size * volume[0],
610 element_size * dst_offsets[0], element_size * src_offsets[0],
611 dst_device, src_device);
612 } else {
613 size_t dst_slice_size = element_size;
614 size_t src_slice_size = element_size;
615 for (int i=1; i<num_dims; ++i) {
616 dst_slice_size *= dst_dimensions[i];
617 src_slice_size *= src_dimensions[i];
618 }
619
620 size_t dst_off = dst_offsets[0] * dst_slice_size;
621 size_t src_off = src_offsets[0] * src_slice_size;
622 for (size_t i=0; i<volume[0]; ++i) {
623 rc = omp_target_memcpy_rect((char *) dst + dst_off + dst_slice_size * i,
624 (char *) src + src_off + src_slice_size * i, element_size,
625 num_dims - 1, volume + 1, dst_offsets + 1, src_offsets + 1,
626 dst_dimensions + 1, src_dimensions + 1, dst_device, src_device);
627
628 if (rc) {
629 DP("Recursive call to omp_target_memcpy_rect returns unsuccessfully\n");
630 return rc;
631 }
632 }
633 }
634
635 DP("omp_target_memcpy_rect returns %d\n", rc);
636 return rc;
637}
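// Illustrative usage of omp_target_memcpy_rect (assumed sizes): copy a 2x3
// block of doubles from a 4x5 source array (starting at row 1, column 2) into
// a 3x4 destination array (starting at row 0, column 1):
//   size_t volume[]   = {2, 3};
//   size_t dst_off[]  = {0, 1},  src_off[]  = {1, 2};
//   size_t dst_dims[] = {3, 4},  src_dims[] = {4, 5};
//   omp_target_memcpy_rect(dst, src, sizeof(double), 2, volume, dst_off,
//       src_off, dst_dims, src_dims, dst_device, src_device);
// The top-level call strides over volume[0] = 2 rows and issues one 1-D
// omp_target_memcpy of volume[1] = 3 elements per row.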
638
639EXTERN int omp_target_associate_ptr(void *host_ptr, void *device_ptr,
640 size_t size, size_t device_offset, int device_num) {
641 DP("Call to omp_target_associate_ptr with host_ptr " DPxMOD ", "
642 "device_ptr " DPxMOD ", size %zu, device_offset %zu, device_num %d\n",
643 DPxPTR(host_ptr), DPxPTR(device_ptr), size, device_offset, device_num);
644
645 if (!host_ptr || !device_ptr || size <= 0) {
646 DP("Call to omp_target_associate_ptr with invalid arguments\n");
647 return OFFLOAD_FAIL;
648 }
649
650 if (device_num == omp_get_initial_device()) {
651 DP("omp_target_associate_ptr: no association possible on the host\n");
652 return OFFLOAD_FAIL;
653 }
654
655 if (!device_is_ready(device_num)) {
656 DP("omp_target_associate_ptr returns OFFLOAD_FAIL\n");
657 return OFFLOAD_FAIL;
658 }
659
660 DeviceTy& Device = Devices[device_num];
661 void *device_addr = (void *)((uint64_t)device_ptr + (uint64_t)device_offset);
662 int rc = Device.associatePtr(host_ptr, device_addr, size);
663 DP("omp_target_associate_ptr returns %d\n", rc);
664 return rc;
665}
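// Illustrative usage (hypothetical names): pair a device buffer obtained from
// omp_target_alloc with an existing host array so that subsequent map clauses
// reuse the association instead of allocating new device memory:
//   double a[1024];
//   void *d_a = omp_target_alloc(sizeof(a), dev);
//   omp_target_associate_ptr(a, d_a, sizeof(a), /*device_offset=*/0, dev);
//   ...                                  // target regions mapping 'a'
//   omp_target_disassociate_ptr(a, dev);
//   omp_target_free(d_a, dev);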
666
667EXTERN int omp_target_disassociate_ptr(void *host_ptr, int device_num) {
668 DP("Call to omp_target_disassociate_ptr with host_ptr " DPxMOD ", "
669 "device_num %d\n", DPxPTR(host_ptr), device_num);
670
671 if (!host_ptr) {
672 DP("Call to omp_target_associate_ptr with invalid host_ptr\n");
673 return OFFLOAD_FAIL;
674 }
675
676 if (device_num == omp_get_initial_device()) {
677 DP("omp_target_disassociate_ptr: no association possible on the host\n");
678 return OFFLOAD_FAIL;
679 }
680
681 if (!device_is_ready(device_num)) {
682 DP("omp_target_disassociate_ptr returns OFFLOAD_FAIL\n");
683 return OFFLOAD_FAIL;
684 }
685
686 DeviceTy& Device = Devices[device_num];
687 int rc = Device.disassociatePtr(host_ptr);
688 DP("omp_target_disassociate_ptr returns %d\n", rc);
689 return rc;
690}
691
692////////////////////////////////////////////////////////////////////////////////
693// functionality for device
694
695int DeviceTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size) {
696 DataMapMtx.lock();
697
698 // Check if entry exists
699 for (auto &HT : HostDataToTargetMap) {
700 if ((uintptr_t)HstPtrBegin == HT.HstPtrBegin) {
701 // Mapping already exists
702 bool isValid = HT.HstPtrBegin == (uintptr_t) HstPtrBegin &&
703 HT.HstPtrEnd == (uintptr_t) HstPtrBegin + Size &&
704 HT.TgtPtrBegin == (uintptr_t) TgtPtrBegin;
705 DataMapMtx.unlock();
706 if (isValid) {
707 DP("Attempt to re-associate the same device ptr+offset with the same "
708 "host ptr, nothing to do\n");
709 return OFFLOAD_SUCCESS;
710 } else {
711 DP("Not allowed to re-associate a different device ptr+offset with the "
712 "same host ptr\n");
713 return OFFLOAD_FAIL;
714 }
715 }
716 }
717
718 // Mapping does not exist, allocate it
719 HostDataToTargetTy newEntry;
720
721 // Set up missing fields
722 newEntry.HstPtrBase = (uintptr_t) HstPtrBegin;
723 newEntry.HstPtrBegin = (uintptr_t) HstPtrBegin;
724 newEntry.HstPtrEnd = (uintptr_t) HstPtrBegin + Size;
725 newEntry.TgtPtrBegin = (uintptr_t) TgtPtrBegin;
726 // refCount must be infinite
727 newEntry.RefCount = INF_REF_CNT;
728
729 DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", HstEnd="
730 DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(newEntry.HstPtrBase),
731 DPxPTR(newEntry.HstPtrBegin), DPxPTR(newEntry.HstPtrEnd),
732 DPxPTR(newEntry.TgtPtrBegin));
733 HostDataToTargetMap.push_front(newEntry);
734
735 DataMapMtx.unlock();
736
737 return OFFLOAD_SUCCESS;
738}
739
740int DeviceTy::disassociatePtr(void *HstPtrBegin) {
741 DataMapMtx.lock();
742
743 // Check if entry exists
744 for (HostDataToTargetListTy::iterator ii = HostDataToTargetMap.begin();
745 ii != HostDataToTargetMap.end(); ++ii) {
746 if ((uintptr_t)HstPtrBegin == ii->HstPtrBegin) {
747 // Mapping exists
748 if (CONSIDERED_INF(ii->RefCount)) {
749 DP("Association found, removing it\n");
750 HostDataToTargetMap.erase(ii);
751 DataMapMtx.unlock();
752 return OFFLOAD_SUCCESS;
753 } else {
754 DP("Trying to disassociate a pointer which was not mapped via "
755 "omp_target_associate_ptr\n");
756 break;
757 }
758 }
759 }
760
761 // Mapping not found
762 DataMapMtx.unlock();
763 DP("Association not found\n");
764 return OFFLOAD_FAIL;
765}
766
767// Get ref count of map entry containing HstPtrBegin
768long DeviceTy::getMapEntryRefCnt(void *HstPtrBegin) {
769 uintptr_t hp = (uintptr_t)HstPtrBegin;
770 long RefCnt = -1;
771
772 DataMapMtx.lock();
773 for (auto &HT : HostDataToTargetMap) {
774 if (hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd) {
775 DP("DeviceTy::getMapEntry: requested entry found\n");
776 RefCnt = HT.RefCount;
777 break;
778 }
779 }
780 DataMapMtx.unlock();
781
782 if (RefCnt < 0) {
783 DP("DeviceTy::getMapEntry: requested entry not found\n");
784 }
785
786 return RefCnt;
787}
788
789LookupResult DeviceTy::lookupMapping(void *HstPtrBegin, int64_t Size) {
790 uintptr_t hp = (uintptr_t)HstPtrBegin;
791 LookupResult lr;
792
793 DP("Looking up mapping(HstPtrBegin=" DPxMOD ", Size=%ld)...\n", DPxPTR(hp),
794 Size);
795 for (lr.Entry = HostDataToTargetMap.begin();
796 lr.Entry != HostDataToTargetMap.end(); ++lr.Entry) {
797 auto &HT = *lr.Entry;
798 // Is it contained?
799 lr.Flags.IsContained = hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd &&
800 (hp+Size) <= HT.HstPtrEnd;
801 // Does it extend into an already mapped region?
802 lr.Flags.ExtendsBefore = hp < HT.HstPtrBegin && (hp+Size) > HT.HstPtrBegin;
803 // Does it extend beyond the mapped region?
804 lr.Flags.ExtendsAfter = hp < HT.HstPtrEnd && (hp+Size) > HT.HstPtrEnd;
805
806 if (lr.Flags.IsContained || lr.Flags.ExtendsBefore ||
807 lr.Flags.ExtendsAfter) {
808 break;
809 }
810 }
811
812 if (lr.Flags.ExtendsBefore) {
813 DP("WARNING: Pointer is not mapped but section extends into already "
814 "mapped data\n");
815 }
816 if (lr.Flags.ExtendsAfter) {
817 DP("WARNING: Pointer is already mapped but section extends beyond mapped "
818 "region\n");
819 }
820
821 return lr;
822}
823
824// Used by target_data_begin
825// Return the target pointer begin (where the data will be moved).
826 // Allocate memory if this is the first occurrence of this mapping.
827// Increment the reference counter.
828// If NULL is returned, then either data allocation failed or the user tried
829// to do an illegal mapping.
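// Illustrative example: two nested constructs mapping the same array section
// on one device call getOrAllocTgtPtr twice with the same HstPtrBegin. The
// first call allocates device memory and returns IsNew == true with
// RefCount == 1; the second finds the existing entry, bumps RefCount to 2 and
// returns IsNew == false, so only the matching outermost data-end actually
// releases the device memory.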
830void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase,
831 int64_t Size, bool &IsNew, bool IsImplicit, bool UpdateRefCount) {
832 void *rc = NULL;
833 DataMapMtx.lock();
834 LookupResult lr = lookupMapping(HstPtrBegin, Size);
835
836 // Check if the pointer is contained.
837 if (lr.Flags.IsContained ||
838 ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && IsImplicit)) {
839 auto &HT = *lr.Entry;
840 IsNew = false;
841
842 if (UpdateRefCount)
843 ++HT.RefCount;
844
845 uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin);
846 DP("Mapping exists%s with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", "
847 "Size=%ld,%s RefCount=%s\n", (IsImplicit ? " (implicit)" : ""),
848 DPxPTR(HstPtrBegin), DPxPTR(tp), Size,
849 (UpdateRefCount ? " updated" : ""),
850 (CONSIDERED_INF(HT.RefCount)) ? "INF" :
851 std::to_string(HT.RefCount).c_str());
852 rc = (void *)tp;
853 } else if ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && !IsImplicit) {
854 // Explicit extension of mapped data - not allowed.
855 DP("Explicit extension of mapping is not allowed.\n");
856 } else if (Size) {
857 // If it is not contained and Size > 0 we should create a new entry for it.
858 IsNew = true;
859 uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size);
860 DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", "
861 "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase),
862 DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp));
863 HostDataToTargetMap.push_front(HostDataToTargetTy((uintptr_t)HstPtrBase,
864 (uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, tp));
865 rc = (void *)tp;
866 }
867
868 DataMapMtx.unlock();
869 return rc;
870}
871
872// Used by target_data_begin, target_data_end, target_data_update and target.
873// Return the target pointer begin (where the data will be moved).
874// Decrement the reference counter if called from target_data_end.
875void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
876 bool UpdateRefCount) {
877 void *rc = NULL;
878 DataMapMtx.lock();
879 LookupResult lr = lookupMapping(HstPtrBegin, Size);
880
881 if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) {
882 auto &HT = *lr.Entry;
883 IsLast = !(HT.RefCount > 1);
884
885 if (HT.RefCount > 1 && UpdateRefCount)
886 --HT.RefCount;
887
888 uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin);
889 DP("Mapping exists with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", "
890 "Size=%ld,%s RefCount=%s\n", DPxPTR(HstPtrBegin), DPxPTR(tp), Size,
891 (UpdateRefCount ? " updated" : ""),
892 (CONSIDERED_INF(HT.RefCount)) ? "INF" :
893 std::to_string(HT.RefCount).c_str());
894 rc = (void *)tp;
895 } else {
896 IsLast = false;
897 }
898
899 DataMapMtx.unlock();
900 return rc;
901}
902
903// Return the target pointer begin (where the data will be moved).
904// Lock-free version called from within assertions.
905void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size) {
906 uintptr_t hp = (uintptr_t)HstPtrBegin;
907 LookupResult lr = lookupMapping(HstPtrBegin, Size);
908 if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) {
909 auto &HT = *lr.Entry;
910 uintptr_t tp = HT.TgtPtrBegin + (hp - HT.HstPtrBegin);
911 return (void *)tp;
912 }
913
914 return NULL;
915}
916
917int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete) {
918 // Check if the pointer is contained in any sub-nodes.
919 int rc;
920 DataMapMtx.lock();
921 LookupResult lr = lookupMapping(HstPtrBegin, Size);
922 if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) {
923 auto &HT = *lr.Entry;
924 if (ForceDelete)
925 HT.RefCount = 1;
926 if (--HT.RefCount <= 0) {
927 assert(HT.RefCount == 0 && "did not expect a negative ref count");
928 DP("Deleting tgt data " DPxMOD " of size %ld\n",
929 DPxPTR(HT.TgtPtrBegin), Size);
930 RTL->data_delete(RTLDeviceID, (void *)HT.TgtPtrBegin);
931 DP("Removing%s mapping with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD
932 ", Size=%ld\n", (ForceDelete ? " (forced)" : ""),
933 DPxPTR(HT.HstPtrBegin), DPxPTR(HT.TgtPtrBegin), Size);
934 HostDataToTargetMap.erase(lr.Entry);
935 }
936 rc = OFFLOAD_SUCCESS;
937 } else {
938 DP("Section to delete (hst addr " DPxMOD ") does not exist in the allocated"
939 " memory\n", DPxPTR(HstPtrBegin));
940 rc = OFFLOAD_FAIL;
941 }
942
943 DataMapMtx.unlock();
944 return rc;
945}
946
947/// Init device, should not be called directly.
948void DeviceTy::init() {
949 int32_t rc = RTL->init_device(RTLDeviceID);
950 if (rc == OFFLOAD_SUCCESS) {
951 IsInit = true;
952 }
953}
954
955/// Thread-safe method to initialize the device only once.
956int32_t DeviceTy::initOnce() {
957 std::call_once(InitFlag, &DeviceTy::init, this);
958
959 // At this point, if IsInit is true, then either this thread or some other
960 // thread in the past successfully initialized the device, so we can return
961 // OFFLOAD_SUCCESS. If this thread executed init() via call_once() and it
962 // failed, return OFFLOAD_FAIL. If call_once did not invoke init(), it means
963 // that some other thread already attempted to execute init() and if IsInit
964 // is still false, return OFFLOAD_FAIL.
965 if (IsInit)
966 return OFFLOAD_SUCCESS;
967 else
968 return OFFLOAD_FAIL;
969}
970
971// Load binary to device.
972__tgt_target_table *DeviceTy::load_binary(void *Img) {
973 RTL->Mtx.lock();
974 __tgt_target_table *rc = RTL->load_binary(RTLDeviceID, Img);
975 RTL->Mtx.unlock();
976 return rc;
977}
978
979// Submit data to device.
980int32_t DeviceTy::data_submit(void *TgtPtrBegin, void *HstPtrBegin,
981 int64_t Size) {
982 return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size);
983}
984
985// Retrieve data from device.
986int32_t DeviceTy::data_retrieve(void *HstPtrBegin, void *TgtPtrBegin,
987 int64_t Size) {
988 return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size);
989}
990
991// Run region on device
992int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr,
993 int32_t TgtVarsSize) {
994 return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtVarsSize);
995}
996
997// Run team region on device.
998int32_t DeviceTy::run_team_region(void *TgtEntryPtr, void **TgtVarsPtr,
999 int32_t TgtVarsSize, int32_t NumTeams, int32_t ThreadLimit,
1000 uint64_t LoopTripCount) {
1001 return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtVarsSize,
1002 NumTeams, ThreadLimit, LoopTripCount);
1003}
1004
1005////////////////////////////////////////////////////////////////////////////////
1006// Functionality for registering libs
1007
1008static void RegisterImageIntoTranslationTable(TranslationTable &TT,
1009 RTLInfoTy &RTL, __tgt_device_image *image) {
1010
1011 // Both vectors always have the same size: whenever we grow one, we also grow the other.
1012 assert(TT.TargetsTable.size() == TT.TargetsImages.size() &&
1013 "We should have as many images as we have tables!");
1014
1015 // Resize the Targets Table and Images to accommodate the new targets if
1016 // required
1017 unsigned TargetsTableMinimumSize = RTL.Idx + RTL.NumberOfDevices;
1018
1019 if (TT.TargetsTable.size() < TargetsTableMinimumSize) {
1020 TT.TargetsImages.resize(TargetsTableMinimumSize, 0);
1021 TT.TargetsTable.resize(TargetsTableMinimumSize, 0);
1022 }
1023
1024 // Register the image in all devices for this target type.
1025 for (int32_t i = 0; i < RTL.NumberOfDevices; ++i) {
1026 // If we are changing the image we are also invalidating the target table.
1027 if (TT.TargetsImages[RTL.Idx + i] != image) {
1028 TT.TargetsImages[RTL.Idx + i] = image;
1029 TT.TargetsTable[RTL.Idx + i] = 0; // lazy initialization of target table.
1030 }
1031 }
1032}
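// Illustrative example: for an RTL with Idx = 2 and NumberOfDevices = 3, the
// function above grows TargetsImages/TargetsTable to at least 5 slots and
// stores the image in slots 2, 3 and 4, resetting the corresponding target
// tables so they are recomputed lazily on first use.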
1033
1034////////////////////////////////////////////////////////////////////////////////
1035// Functionality for registering Ctors/Dtors
1036
1037static void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc,
1038 __tgt_device_image *img, RTLInfoTy *RTL) {
1039
1040 for (int32_t i = 0; i < RTL->NumberOfDevices; ++i) {
1041 DeviceTy &Device = Devices[RTL->Idx + i];
1042 Device.PendingGlobalsMtx.lock();
1043 Device.HasPendingGlobals = true;
1044 for (__tgt_offload_entry *entry = img->EntriesBegin;
1045 entry != img->EntriesEnd; ++entry) {
1046 if (entry->flags & OMP_DECLARE_TARGET_CTOR) {
1047 DP("Adding ctor " DPxMOD " to the pending list.\n",
1048 DPxPTR(entry->addr));
1049 Device.PendingCtorsDtors[desc].PendingCtors.push_back(entry->addr);
1050 } else if (entry->flags & OMP_DECLARE_TARGET_DTOR) {
1051 // Dtors are pushed in reverse order so they are executed from end
1052 // to beginning when unregistering the library!
1053 DP("Adding dtor " DPxMOD " to the pending list.\n",
1054 DPxPTR(entry->addr));
1055 Device.PendingCtorsDtors[desc].PendingDtors.push_front(entry->addr);
1056 }
1057
1058 if (entry->flags & OMP_DECLARE_TARGET_LINK) {
1059 DP("The \"link\" attribute is not yet supported!\n");
1060 }
1061 }
1062 Device.PendingGlobalsMtx.unlock();
1063 }
1064}
1065
1066////////////////////////////////////////////////////////////////////////////////
1067/// adds a target shared library to the target execution image
1068EXTERN void __tgt_register_lib(__tgt_bin_desc *desc) {
1069
1070 // Attempt to load all plugins available in the system.
1071 RTLs.LoadRTLsOnce();
1072
1073 RTLsMtx.lock();
1074 // Register the images with the RTLs that understand them, if any.
1075 for (int32_t i = 0; i < desc->NumDeviceImages; ++i) {
1076 // Obtain the image.
1077 __tgt_device_image *img = &desc->DeviceImages[i];
1078
1079 RTLInfoTy *FoundRTL = NULL;
1080
1081 // Scan the RTLs that have associated images until we find one that supports
1082 // the current image.
1083 for (auto &R : RTLs.AllRTLs) {
1084 if (!R.is_valid_binary(img)) {
1085 DP("Image " DPxMOD " is NOT compatible with RTL %s!\n",
1086 DPxPTR(img->ImageStart), R.RTLName.c_str());
1087 continue;
1088 }
1089
1090 DP("Image " DPxMOD " is compatible with RTL %s!\n",
1091 DPxPTR(img->ImageStart), R.RTLName.c_str());
1092
1093 // If this RTL is not already in use, initialize it.
1094 if (!R.isUsed) {
1095 // Initialize the device information for the RTL we are about to use.
1096 DeviceTy device(&R);
1097
1098 size_t start = Devices.size();
1099 Devices.resize(start + R.NumberOfDevices, device);
1100 for (int32_t device_id = 0; device_id < R.NumberOfDevices;
1101 device_id++) {
1102 // global device ID
1103 Devices[start + device_id].DeviceID = start + device_id;
1104 // RTL local device ID
1105 Devices[start + device_id].RTLDeviceID = device_id;
1106
1107 // Save pointer to device in RTL in case we want to unregister the RTL
1108 R.Devices.push_back(&Devices[start + device_id]);
1109 }
1110
1111 // Initialize the index of this RTL and save it in the used RTLs.
1112 R.Idx = (RTLs.UsedRTLs.empty())
1113 ? 0
1114 : RTLs.UsedRTLs.back()->Idx +
1115 RTLs.UsedRTLs.back()->NumberOfDevices;
1116 assert((size_t) R.Idx == start &&
1117 "RTL index should equal the number of devices used so far.");
1118 R.isUsed = true;
1119 RTLs.UsedRTLs.push_back(&R);
1120
1121 DP("RTL " DPxMOD " has index %d!\n", DPxPTR(R.LibraryHandler), R.Idx);
1122 }
1123
1124 // Initialize (if necessary) translation table for this library.
1125 TrlTblMtx.lock();
1126 if(!HostEntriesBeginToTransTable.count(desc->HostEntriesBegin)){
1127 TranslationTable &tt =
1128 HostEntriesBeginToTransTable[desc->HostEntriesBegin];
1129 tt.HostTable.EntriesBegin = desc->HostEntriesBegin;
1130 tt.HostTable.EntriesEnd = desc->HostEntriesEnd;
1131 }
1132
1133 // Retrieve translation table for this library.
1134 TranslationTable &TransTable =
1135 HostEntriesBeginToTransTable[desc->HostEntriesBegin];
1136
1137 DP("Registering image " DPxMOD " with RTL %s!\n",
1138 DPxPTR(img->ImageStart), R.RTLName.c_str());
1139 RegisterImageIntoTranslationTable(TransTable, R, img);
1140 TrlTblMtx.unlock();
1141 FoundRTL = &R;
1142
1143 // Load ctors/dtors for static objects
1144 RegisterGlobalCtorsDtorsForImage(desc, img, FoundRTL);
1145
1146 // if an RTL was found we are done - proceed to register the next image
1147 break;
1148 }
1149
1150 if (!FoundRTL) {
1151 DP("No RTL found for image " DPxMOD "!\n", DPxPTR(img->ImageStart));
1152 }
1153 }
1154 RTLsMtx.unlock();
1155
1156
1157 DP("Done registering entries!\n");
1158}
1159
1160////////////////////////////////////////////////////////////////////////////////
1161/// unloads a target shared library
1162EXTERN void __tgt_unregister_lib(__tgt_bin_desc *desc) {
1163 DP("Unloading target library!\n");
1164
1165 RTLsMtx.lock();
1166 // Find which RTL understands each image, if any.
1167 for (int32_t i = 0; i < desc->NumDeviceImages; ++i) {
1168 // Obtain the image.
1169 __tgt_device_image *img = &desc->DeviceImages[i];
1170
1171 RTLInfoTy *FoundRTL = NULL;
1172
1173 // Scan the RTLs that have associated images until we find one that supports
1174 // the current image. We only need to scan RTLs that are already being used.
1175 for (auto *R : RTLs.UsedRTLs) {
1176
1177 assert(R->isUsed && "Expecting used RTLs.");
1178
1179 if (!R->is_valid_binary(img)) {
1180 DP("Image " DPxMOD " is NOT compatible with RTL " DPxMOD "!\n",
1181 DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler));
1182 continue;
1183 }
1184
1185 DP("Image " DPxMOD " is compatible with RTL " DPxMOD "!\n",
1186 DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler));
1187
1188 FoundRTL = R;
1189
1190 // Execute dtors for static objects if the device has been used, i.e.
1191 // if its PendingCtors list has been emptied.
1192 for (int32_t i = 0; i < FoundRTL->NumberOfDevices; ++i) {
1193 DeviceTy &Device = Devices[FoundRTL->Idx + i];
1194 Device.PendingGlobalsMtx.lock();
1195 if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) {
1196 for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) {
1197 int rc = target(Device.DeviceID, dtor, 0, NULL, NULL, NULL, NULL, 1,
1198 1, true /*team*/);
1199 if (rc != OFFLOAD_SUCCESS) {
1200 DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor));
1201 }
1202 }
1203 // Remove this library's entry from PendingCtorsDtors
1204 Device.PendingCtorsDtors.erase(desc);
1205 }
1206 Device.PendingGlobalsMtx.unlock();
1207 }
1208
1209 DP("Unregistered image " DPxMOD " from RTL " DPxMOD "!\n",
1210 DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler));
1211
1212 break;
1213 }
1214
1215 // if no RTL was found proceed to unregister the next image
1216 if (!FoundRTL){
1217 DP("No RTLs in use support the image " DPxMOD "!\n",
1218 DPxPTR(img->ImageStart));
1219 }
1220 }
1221 RTLsMtx.unlock();
1222 DP("Done unregistering images!\n");
1223
1224 // Remove entries from HostPtrToTableMap
1225 TblMapMtx.lock();
1226 for (__tgt_offload_entry *cur = desc->HostEntriesBegin;
1227 cur < desc->HostEntriesEnd; ++cur) {
1228 HostPtrToTableMap.erase(cur->addr);
1229 }
1230
1231 // Remove translation table for this descriptor.
1232 auto tt = HostEntriesBeginToTransTable.find(desc->HostEntriesBegin);
1233 if (tt != HostEntriesBeginToTransTable.end()) {
1234 DP("Removing translation table for descriptor " DPxMOD "\n",
1235 DPxPTR(desc->HostEntriesBegin));
1236 HostEntriesBeginToTransTable.erase(tt);
1237 } else {
1238 DP("Translation table for descriptor " DPxMOD " cannot be found, probably "
1239 "it has been already removed.\n", DPxPTR(desc->HostEntriesBegin));
1240 }
1241
1242 TblMapMtx.unlock();
1243
1244 // TODO: Remove RTL and the devices it manages if it's not used anymore?
1245 // TODO: Write some RTL->unload_image(...) function?
1246
1247 DP("Done unregistering library!\n");
1248}
1249
1250/// Map global data and execute pending ctors
1251static int InitLibrary(DeviceTy& Device) {
1252 /*
1253 * Map global data
1254 */
1255 int32_t device_id = Device.DeviceID;
1256 int rc = OFFLOAD_SUCCESS;
1257
1258 Device.PendingGlobalsMtx.lock();
1259 TrlTblMtx.lock();
1260 for (HostEntriesBeginToTransTableTy::iterator
1261 ii = HostEntriesBeginToTransTable.begin();
1262 ii != HostEntriesBeginToTransTable.end(); ++ii) {
1263 TranslationTable *TransTable = &ii->second;
1264 if (TransTable->TargetsTable[device_id] != 0) {
1265 // Library entries have already been processed
1266 continue;
1267 }
1268
1269 // 1) get image.
1270 assert(TransTable->TargetsImages.size() > (size_t)device_id &&
1271 "Not expecting a device ID outside the table's bounds!");
1272 __tgt_device_image *img = TransTable->TargetsImages[device_id];
1273 if (!img) {
1274 DP("No image loaded for device id %d.\n", device_id);
1275 rc = OFFLOAD_FAIL;
1276 break;
1277 }
1278 // 2) load image into the target table.
1279 __tgt_target_table *TargetTable =
1280 TransTable->TargetsTable[device_id] = Device.load_binary(img);
1281 // Unable to get table for this image: invalidate image and fail.
1282 if (!TargetTable) {
1283 DP("Unable to generate entries table for device id %d.\n", device_id);
1284 TransTable->TargetsImages[device_id] = 0;
1285 rc = OFFLOAD_FAIL;
1286 break;
1287 }
1288
1289 // Verify whether the two table sizes match.
1290 size_t hsize =
1291 TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
1292 size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;
1293
1294 // Invalid image for these host entries!
1295 if (hsize != tsize) {
1296 DP("Host and Target tables mismatch for device id %d [%zx != %zx].\n",
1297 device_id, hsize, tsize);
1298 TransTable->TargetsImages[device_id] = 0;
1299 TransTable->TargetsTable[device_id] = 0;
1300 rc = OFFLOAD_FAIL;
1301 break;
1302 }
1303
1304 // process global data that needs to be mapped.
1305 Device.DataMapMtx.lock();
1306 __tgt_target_table *HostTable = &TransTable->HostTable;
1307 for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
1308 *CurrHostEntry = HostTable->EntriesBegin,
1309 *EntryDeviceEnd = TargetTable->EntriesEnd;
1310 CurrDeviceEntry != EntryDeviceEnd;
1311 CurrDeviceEntry++, CurrHostEntry++) {
1312 if (CurrDeviceEntry->size != 0) {
1313 // has data.
1314 assert(CurrDeviceEntry->size == CurrHostEntry->size &&
1315 "data size mismatch");
1316 assert(Device.getTgtPtrBegin(CurrHostEntry->addr,
1317 CurrHostEntry->size) == NULL &&
1318 "data in declared target should not be already mapped");
1319 // add entry to map.
1320 DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu"
1321 "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
1322 CurrDeviceEntry->size);
1323 Device.HostDataToTargetMap.push_front(HostDataToTargetTy(
1324 (uintptr_t)CurrHostEntry->addr, (uintptr_t)CurrHostEntry->addr,
1325 (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size,
1326 (uintptr_t)CurrDeviceEntry->addr));
1327 }
1328 }
1329 Device.DataMapMtx.unlock();
1330 }
1331 TrlTblMtx.unlock();
1332
1333 if (rc != OFFLOAD_SUCCESS) {
1334 Device.PendingGlobalsMtx.unlock();
1335 return rc;
1336 }
1337
1338 /*
1339 * Run ctors for static objects
1340 */
1341 if (!Device.PendingCtorsDtors.empty()) {
1342 // Call all ctors for all libraries registered so far
1343 for (auto &lib : Device.PendingCtorsDtors) {
1344 if (!lib.second.PendingCtors.empty()) {
1345 DP("Has pending ctors... call now\n");
1346 for (auto &entry : lib.second.PendingCtors) {
1347 void *ctor = entry;
1348 int rc = target(device_id, ctor, 0, NULL, NULL, NULL,
1349 NULL, 1, 1, true /*team*/);
1350 if (rc != OFFLOAD_SUCCESS) {
1351 DP("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
1352 Device.PendingGlobalsMtx.unlock();
1353 return OFFLOAD_FAIL;
1354 }
1355 }
1356 // Clear the list to indicate that this device has been used
1357 lib.second.PendingCtors.clear();
1358 DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
1359 }
1360 }
1361 }
1362 Device.HasPendingGlobals = false;
1363 Device.PendingGlobalsMtx.unlock();
1364
1365 return OFFLOAD_SUCCESS;
1366}
1367
1368// Check whether a device has been initialized, global ctors have been
1369// executed and global data has been mapped; do so if not already done.
1370static int CheckDevice(int32_t device_id) {
1371 // Is device ready?
1372 if (!device_is_ready(device_id)) {
1373 DP("Device %d is not ready.\n", device_id);
1374 return OFFLOAD_FAIL;
1375 }
1376
1377 // Get device info.
1378 DeviceTy &Device = Devices[device_id];
1379
1380 // Check whether global data has been mapped for this device
1381 Device.PendingGlobalsMtx.lock();
1382 bool hasPendingGlobals = Device.HasPendingGlobals;
1383 Device.PendingGlobalsMtx.unlock();
1384 if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) {
1385 DP("Failed to init globals on device %d\n", device_id);
1386 return OFFLOAD_FAIL;
1387 }
1388
1389 return OFFLOAD_SUCCESS;
1390}
1391
1392// Following datatypes and functions (tgt_oldmap_type, combined_entry_t,
1393// translate_map, cleanup_map) will be removed once the compiler starts using
1394// the new map types.
1395
1396// Old map types
1397enum tgt_oldmap_type {
1398 OMP_TGT_OLDMAPTYPE_TO = 0x001, // copy data from host to device
1399 OMP_TGT_OLDMAPTYPE_FROM = 0x002, // copy data from device to host
1400 OMP_TGT_OLDMAPTYPE_ALWAYS = 0x004, // copy regardless of the ref. count
1401 OMP_TGT_OLDMAPTYPE_DELETE = 0x008, // force unmapping of data
1402 OMP_TGT_OLDMAPTYPE_MAP_PTR = 0x010, // map pointer as well as pointee
1403 OMP_TGT_OLDMAPTYPE_FIRST_MAP = 0x020, // first occurrence of mapped variable
1404 OMP_TGT_OLDMAPTYPE_RETURN_PTR = 0x040, // return TgtBase addr of mapped data
1405 OMP_TGT_OLDMAPTYPE_PRIVATE_PTR = 0x080, // private variable - not mapped
1406 OMP_TGT_OLDMAPTYPE_PRIVATE_VAL = 0x100 // copy by value - not mapped
1407};
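// Illustrative example: an old map type of 0x023 decodes as
// TO (0x001) | FROM (0x002) | FIRST_MAP (0x020), i.e. a "tofrom" mapping seen
// for the first time in this construct.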
1408
1409// Temporary functions for map translation and cleanup
1410struct combined_entry_t {
1411 int num_members; // number of members in combined entry
1412 void *base_addr; // base address of combined entry
1413 void *begin_addr; // begin address of combined entry
1414 void *end_addr; // size of combined entry
1415};
1416
1417static void translate_map(int32_t arg_num, void **args_base, void **args,
1418 int64_t *arg_sizes, int32_t *arg_types, int32_t &new_arg_num,
1419 void **&new_args_base, void **&new_args, int64_t *&new_arg_sizes,
1420 int64_t *&new_arg_types, bool is_target_construct) {
1421 if (arg_num <= 0) {
1422 DP("Nothing to translate\n");
1423 new_arg_num = 0;
1424 return;
1425 }
1426
1427 // array of combined entries
1428 combined_entry_t *cmb_entries =
1429 (combined_entry_t *) alloca(arg_num * sizeof(combined_entry_t));
1430 // number of combined entries
1431 long num_combined = 0;
1432 // old entry is MAP_PTR?
1433 bool *is_ptr_old = (bool *) alloca(arg_num * sizeof(bool));
1434 // old entry is member of member_of[old] cmb_entry
1435 int *member_of = (int *) alloca(arg_num * sizeof(int));
1436 // temporary storage for modifications of the original arg_types
1437 int32_t *mod_arg_types = (int32_t *) alloca(arg_num *sizeof(int32_t));
1438
1439 DP("Translating %d map entries\n", arg_num);
1440 for (int i = 0; i < arg_num; ++i) {
1441 member_of[i] = -1;
1442 is_ptr_old[i] = false;
1443 mod_arg_types[i] = arg_types[i];
1444 // Scan previous entries to see whether this entry shares the same base
1445 for (int j = 0; j < i; ++j) {
1446 void *new_begin_addr = NULL;
1447 void *new_end_addr = NULL;
1448
1449 if (mod_arg_types[i] & OMP_TGT_OLDMAPTYPE_MAP_PTR) {
1450 if (args_base[i] == args[j]) {
1451 if (!(mod_arg_types[j] & OMP_TGT_OLDMAPTYPE_MAP_PTR)) {
1452 DP("Entry %d has the same base as entry %d's begin address\n", i,
1453 j);
1454 new_begin_addr = args_base[i];
1455 new_end_addr = (char *)args_base[i] + sizeof(void *);
1456 assert(arg_sizes[j] == sizeof(void *));
1457 is_ptr_old[j] = true;
1458 } else {
1459 DP("Entry %d has the same base as entry %d's begin address, but "
1460 "%d's base was a MAP_PTR too\n", i, j, j);
1461 int32_t to_from_always_delete =
1462 OMP_TGT_OLDMAPTYPE_TO | OMP_TGT_OLDMAPTYPE_FROM |
1463 OMP_TGT_OLDMAPTYPE_ALWAYS | OMP_TGT_OLDMAPTYPE_DELETE;
1464 if (mod_arg_types[j] & to_from_always_delete) {
1465 DP("Resetting to/from/always/delete flags for entry %d because "
1466 "it is only a pointer to pointer\n", j);
1467 mod_arg_types[j] &= ~to_from_always_delete;
1468 }
1469 }
1470 }
1471 } else {
1472 if (!(mod_arg_types[i] & OMP_TGT_OLDMAPTYPE_FIRST_MAP) &&
1473 args_base[i] == args_base[j]) {
1474 DP("Entry %d has the same base address as entry %d\n", i, j);
1475 new_begin_addr = args[i];
1476 new_end_addr = (char *)args[i] + arg_sizes[i];
1477 }
1478 }
1479
1480 // If we have combined the entry with a previous one
1481 if (new_begin_addr) {
1482 int id;
1483 if(member_of[j] == -1) {
1484 // We have a new entry
1485 id = num_combined++;
1486 DP("Creating new combined entry %d for old entry %d\n", id, j);
1487 // Initialize new entry
1488 cmb_entries[id].num_members = 1;
1489 cmb_entries[id].base_addr = args_base[j];
1490 if (mod_arg_types[j] & OMP_TGT_OLDMAPTYPE_MAP_PTR) {
1491 cmb_entries[id].begin_addr = args_base[j];
1492 cmb_entries[id].end_addr = (char *)args_base[j] + arg_sizes[j];
1493 } else {
1494 cmb_entries[id].begin_addr = args[j];
1495 cmb_entries[id].end_addr = (char *)args[j] + arg_sizes[j];
1496 }
1497 member_of[j] = id;
1498 } else {
1499 // Reuse existing combined entry
1500 DP("Reusing existing combined entry %d\n", member_of[j]);
1501 id = member_of[j];
1502 }
1503
1504 // Update combined entry
1505 DP("Adding entry %d to combined entry %d\n", i, id);
1506 cmb_entries[id].num_members++;
1507 // base_addr stays the same
1508 cmb_entries[id].begin_addr =
1509 std::min(cmb_entries[id].begin_addr, new_begin_addr);
1510 cmb_entries[id].end_addr =
1511 std::max(cmb_entries[id].end_addr, new_end_addr);
1512 member_of[i] = id;
1513 break;
1514 }
1515 }
1516 }
1517
1518 DP("New entries: %ld combined + %d original\n", num_combined, arg_num);
1519 new_arg_num = arg_num + num_combined;
1520 new_args_base = (void **) malloc(new_arg_num * sizeof(void *));
1521 new_args = (void **) malloc(new_arg_num * sizeof(void *));
1522 new_arg_sizes = (int64_t *) malloc(new_arg_num * sizeof(int64_t));
1523 new_arg_types = (int64_t *) malloc(new_arg_num * sizeof(int64_t));
1524
1525 const int64_t alignment = 8;
1526
1527 int next_id = 0; // next ID
1528 int next_cid = 0; // next combined ID
1529 int *combined_to_new_id = (int *) alloca(num_combined * sizeof(int));
1530 for (int i = 0; i < arg_num; ++i) {
1531 // It is member_of
1532 if (member_of[i] == next_cid) {
1533 int cid = next_cid++; // ID of this combined entry
1534 int nid = next_id++; // ID of the new (global) entry
1535 combined_to_new_id[cid] = nid;
1536 DP("Combined entry %3d will become new entry %3d\n", cid, nid);
1537
1538 int64_t padding = (int64_t)cmb_entries[cid].begin_addr % alignment;
1539 if (padding) {
1540 DP("Using a padding of %" PRId64 " for begin address " DPxMOD "\n",
1541 padding, DPxPTR(cmb_entries[cid].begin_addr));
1542 cmb_entries[cid].begin_addr =
1543 (char *)cmb_entries[cid].begin_addr - padding;
1544 }
1545
1546 new_args_base[nid] = cmb_entries[cid].base_addr;
1547 new_args[nid] = cmb_entries[cid].begin_addr;
1548 new_arg_sizes[nid] = (int64_t) ((char *)cmb_entries[cid].end_addr -
1549 (char *)cmb_entries[cid].begin_addr);
1550 new_arg_types[nid] = OMP_TGT_MAPTYPE_TARGET_PARAM;
1551 DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", "
1552 "size %" PRId64 ", type 0x%" PRIx64 "\n", nid,
1553 DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid],
1554 new_arg_types[nid]);
1555 } else if (member_of[i] != -1) {
1556 DP("Combined entry %3d has been encountered before, do nothing\n",
1557 member_of[i]);
1558 }
1559
1560 // Now that the combined entry (the one the old entry was a member of) has
1561 // been inserted into the new arguments list, proceed with the old entry.
1562 int nid = next_id++;
1563 DP("Old entry %3d will become new entry %3d\n", i, nid);
1564
1565 new_args_base[nid] = args_base[i];
1566 new_args[nid] = args[i];
1567 new_arg_sizes[nid] = arg_sizes[i];
1568 int64_t old_type = mod_arg_types[i];
1569
1570 if (is_ptr_old[i]) {
1571 // Reset TO and FROM flags
1572 old_type &= ~(OMP_TGT_OLDMAPTYPE_TO | OMP_TGT_OLDMAPTYPE_FROM);
1573 }
1574
1575 if (member_of[i] == -1) {
1576 if (!is_target_construct)
1577 old_type &= ~OMP_TGT_MAPTYPE_TARGET_PARAM;
1578 new_arg_types[nid] = old_type;
1579 DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", size %" PRId64
1580 ", type 0x%" PRIx64 " (old entry %d not MEMBER_OF)\n", nid,
1581 DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid],
1582 new_arg_types[nid], i);
1583 } else {
1584 // Old entry is not FIRST_MAP
1585 old_type &= ~OMP_TGT_OLDMAPTYPE_FIRST_MAP;
1586 // Add MEMBER_OF
1587 int new_member_of = combined_to_new_id[member_of[i]];
1588 old_type |= ((int64_t)new_member_of + 1) << 48;
1589 new_arg_types[nid] = old_type;
1590 DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", size %" PRId64
1591 ", type 0x%" PRIx64 " (old entry %d MEMBER_OF %d)\n", nid,
1592 DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid],
1593 new_arg_types[nid], i, new_member_of);
1594 }
1595 }
1596}
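
// Illustrative sketch of translate_map's output (hypothetical indices): if old
// entries 4 and 5 map two fields of the same struct, the new argument list
// holds, in order,
//   [combined entry, new id N]  base = struct base, begin/size spanning both
//                               fields, type = OMP_TGT_MAPTYPE_TARGET_PARAM
//   [old entry 4]               pointers/size unchanged, type |= (N + 1) << 48
//   [old entry 5]               pointers/size unchanged, type |= (N + 1) << 48
// so that member_of() below recovers N for both members.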
1597
1598static void cleanup_map(int32_t new_arg_num, void **new_args_base,
1599 void **new_args, int64_t *new_arg_sizes, int64_t *new_arg_types,
1600 int32_t arg_num, void **args_base) {
1601 if (new_arg_num > 0) {
1602 int offset = new_arg_num - arg_num;
1603 for (int32_t i = 0; i < arg_num; ++i) {
1604 // Copy the (possibly updated) base address back to the caller's array
1605 args_base[i] = new_args_base[i+offset];
1606 }
1607 free(new_args_base);
1608 free(new_args);
1609 free(new_arg_sizes);
1610 free(new_arg_types);
1611 }
1612}
1613
1614static short member_of(int64_t type) {
1615 return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
1616}
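
// Example (illustrative values only): the MEMBER_OF field stores "parent's new
// index + 1" in bits 48..63 of the map type, so an entry attached to parent 3,
//   int64_t type = OMP_TGT_MAPTYPE_TO | ((int64_t)(3 + 1) << 48);
// satisfies member_of(type) == 3, while a type without the field yields -1.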
1617
1618/// Internal function to do the mapping and transfer the data to the device
1619static int target_data_begin(DeviceTy &Device, int32_t arg_num,
1620 void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
1621 // process each input.
1622 int rc = OFFLOAD_SUCCESS;
1623 for (int32_t i = 0; i < arg_num; ++i) {
1624 // Ignore private variables and arrays - there is no mapping for them.
1625 if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
1626 (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
1627 continue;
1628
1629 void *HstPtrBegin = args[i];
1630 void *HstPtrBase = args_base[i];
1631 // Address of pointer on the host and device, respectively.
1632 void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin;
1633 bool IsNew, Pointer_IsNew;
1634 bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
1635 bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF);
1636 if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
1637 DP("Has a pointer entry: \n");
1638 // base is address of pointer.
1639 Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase,
1640 sizeof(void *), Pointer_IsNew, IsImplicit, UpdateRef);
1641 if (!Pointer_TgtPtrBegin) {
1642 DP("Call to getOrAllocTgtPtr returned null pointer (device failure or "
1643 "illegal mapping).\n");
1644 }
1645 DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
1646 "\n", sizeof(void *), DPxPTR(Pointer_TgtPtrBegin),
1647 (Pointer_IsNew ? "" : " not"));
1648 Pointer_HstPtrBegin = HstPtrBase;
1649 // modify current entry.
1650 HstPtrBase = *(void **)HstPtrBase;
1651 UpdateRef = true; // subsequently update ref count of pointee
1652 }
1653
1654 void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase,
1655 arg_sizes[i], IsNew, IsImplicit, UpdateRef);
1656 if (!TgtPtrBegin && arg_sizes[i]) {
1657 // If arg_sizes[i]==0, then the argument is a pointer to NULL, so
1658 // getOrAlloc() returning NULL is not an error.
1659 DP("Call to getOrAllocTgtPtr returned null pointer (device failure or "
1660 "illegal mapping).\n");
1661 }
1662 DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
1663 " - is%s new\n", arg_sizes[i], DPxPTR(TgtPtrBegin),
1664 (IsNew ? "" : " not"));
1665
1666 if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
1667 void *ret_ptr;
1668 if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
1669 ret_ptr = Pointer_TgtPtrBegin;
1670 else {
1671 bool IsLast; // not used
1672 ret_ptr = Device.getTgtPtrBegin(HstPtrBegin, 0, IsLast, false);
1673 }
1674
1675 DP("Returning device pointer " DPxMOD "\n", DPxPTR(ret_ptr));
1676 args_base[i] = ret_ptr;
1677 }
1678
1679 if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
1680 bool copy = false;
1681 if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) {
1682 copy = true;
1683 } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) {
1684 // Copy data only if the "parent" struct has RefCount==1.
1685 short parent_idx = member_of(arg_types[i]);
1686 long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
1687 assert(parent_rc > 0 && "parent struct not found");
1688 if (parent_rc == 1) {
1689 copy = true;
1690 }
1691 }
1692
1693 if (copy) {
1694 DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
1695 arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
1696 int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]);
1697 if (rt != OFFLOAD_SUCCESS) {
1698 DP("Copying data to device failed.\n");
1699 rc = OFFLOAD_FAIL;
1700 }
1701 }
1702 }
1703
1704 if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
1705 DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
1706 DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin));
1707 uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
1708 void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
1709 int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase,
1710 sizeof(void *));
1711 if (rt != OFFLOAD_SUCCESS) {
1712 DP("Copying data to device failed.\n");
1713 rc = OFFLOAD_FAIL;
1714 }
1715 // create shadow pointers for this entry
1716 Device.ShadowMtx.lock();
1717 Device.ShadowPtrMap[Pointer_HstPtrBegin] = {HstPtrBase,
1718 Pointer_TgtPtrBegin, TgtPtrBase};
1719 Device.ShadowMtx.unlock();
1720 }
1721 }
1722
1723 return rc;
1724}
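
// Worked example for the PTR_AND_OBJ case above (illustrative): for a mapping
// of p[10:20] where p is a host pointer, HstPtrBase starts as &p and is then
// replaced by p itself, while HstPtrBegin is &p[10]. With the pointee placed
// at device address TgtPtrBegin, Delta is 10 * sizeof(*p) bytes and
// TgtPtrBase = TgtPtrBegin - Delta is the value written through
// Pointer_TgtPtrBegin, so that dereferencing the device copy of p and indexing
// element 10 lands exactly on the mapped device data.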
1725
1726EXTERN void __tgt_target_data_begin_nowait(int32_t device_id, int32_t arg_num,
1727 void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types,
1728 int32_t depNum, void *depList, int32_t noAliasDepNum,
1729 void *noAliasDepList) {
1730 if (depNum + noAliasDepNum > 0)
1731 __kmpc_omp_taskwait(NULL, 0);
1732
1733 __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes,
1734 arg_types);
1735}
1736
1737/// creates host-to-target data mapping, stores it in the
1738/// libomptarget.so internal structure (an entry in a stack of data maps)
1739/// and passes the data to the device.
1740EXTERN void __tgt_target_data_begin(int32_t device_id, int32_t arg_num,
1741 void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) {
1742 DP("Entering data begin region for device %d with %d mappings\n", device_id,
1743 arg_num);
1744
1745 // Resolve the default device if the caller did not specify one.
1746 if (device_id == OFFLOAD_DEVICE_DEFAULT) {
1747 device_id = omp_get_default_device();
1748 DP("Use default device id %d\n", device_id);
1749 }
1750
1751 if (CheckDevice(device_id) != OFFLOAD_SUCCESS) {
1752 DP("Failed to get device %d ready\n", device_id);
1753 return;
1754 }
1755
1756 DeviceTy& Device = Devices[device_id];
1757
1758 // Translate maps
1759 int32_t new_arg_num;
1760 void **new_args_base;
1761 void **new_args;
1762 int64_t *new_arg_sizes;
1763 int64_t *new_arg_types;
1764 translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num,
1765 new_args_base, new_args, new_arg_sizes, new_arg_types, false);
1766
1767 //target_data_begin(Device, arg_num, args_base, args, arg_sizes, arg_types);
1768 target_data_begin(Device, new_arg_num, new_args_base, new_args, new_arg_sizes,
1769 new_arg_types);
1770
1771 // Cleanup translation memory
1772 cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes,
1773 new_arg_types, arg_num, args_base);
1774}
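
// Usage sketch (hypothetical helper and names, compiled out): a directive such
// as
//   #pragma omp target data map(tofrom: a[0:N])
// is expected to reach this entry point as a begin/end pair over one-element
// argument arrays, along these lines:
#if 0
static void example_data_region(int32_t device_id, double *a, int64_t N) {
  void *base[] = {a};                     // base address of the mapped object
  void *begin[] = {a};                    // first byte actually transferred
  int64_t size[] = {N * (int64_t)sizeof(double)};
  int32_t type[] = {OMP_TGT_MAPTYPE_TO | OMP_TGT_MAPTYPE_FROM};
  __tgt_target_data_begin(device_id, 1, base, begin, size, type);
  // ... host code of the data region; the device copy of a[] stays mapped ...
  __tgt_target_data_end(device_id, 1, base, begin, size, type);
}
#endif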
1775
1776/// Internal function to undo the mapping and retrieve the data from the device.
1777static int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base,
1778 void **args, int64_t *arg_sizes, int64_t *arg_types) {
1779 int rc = OFFLOAD_SUCCESS;
1780 // process each input.
1781 for (int32_t i = arg_num - 1; i >= 0; --i) {
1782 // Ignore private variables and arrays - there is no mapping for them.
1783 // Also, ignore the use_device_ptr directive; it has no effect here.
1784 if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
1785 (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
1786 continue;
1787
1788 void *HstPtrBegin = args[i];
1789 bool IsLast;
1790 bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
1791 (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ);
1792 bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE;
1793
1794 // If PTR_AND_OBJ, HstPtrBegin is address of pointee
1795 void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast,
1796 UpdateRef);
1797 DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
1798 " - is%s last\n", arg_sizes[i], DPxPTR(TgtPtrBegin),
1799 (IsLast ? "" : " not"));
1800
1801 bool DelEntry = IsLast || ForceDelete;
1802
1803 if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
1804 !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
1805 DelEntry = false; // protect parent struct from being deallocated
1806 }
1807
1808 if ((arg_types[i] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
1809 // Move data back to the host
1810 if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
1811 bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
1812 bool CopyMember = false;
1813 if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
1814 !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
1815 // Copy data only if the "parent" struct has RefCount==1.
1816 short parent_idx = member_of(arg_types[i]);
1817 long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
1818 assert(parent_rc > 0 && "parent struct not found");
1819 if (parent_rc == 1) {
1820 CopyMember = true;
1821 }
1822 }
1823
1824 if (DelEntry || Always || CopyMember) {
1825 DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
1826 arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
1827 int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, arg_sizes[i]);
1828 if (rt != OFFLOAD_SUCCESS) {
1829 DP("Copying data from device failed.\n");
1830 rc = OFFLOAD_FAIL;
1831 }
1832 }
1833 }
1834
1835 // If we copied back to the host a struct/array containing pointers, we
1836 // need to restore the original host pointer values from their shadow
1837 // copies. If the struct is going to be deallocated, remove any remaining
1838 // shadow pointer entries for this struct.
1839 uintptr_t lb = (uintptr_t) HstPtrBegin;
1840 uintptr_t ub = (uintptr_t) HstPtrBegin + arg_sizes[i];
1841 Device.ShadowMtx.lock();
1842 for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
1843 it != Device.ShadowPtrMap.end(); ++it) {
1844 void **ShadowHstPtrAddr = (void**) it->first;
1845
1846 // An STL map is sorted on its keys; use this property
1847 // to quickly determine when to break out of the loop.
1848 if ((uintptr_t) ShadowHstPtrAddr < lb)
1849 continue;
1850 if ((uintptr_t) ShadowHstPtrAddr >= ub)
1851 break;
1852
1853 // If we copied the struct to the host, we need to restore the pointer.
1854 if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
1855 DP("Restoring original host pointer value " DPxMOD " for host "
1856 "pointer " DPxMOD "\n", DPxPTR(it->second.HstPtrVal),
1857 DPxPTR(ShadowHstPtrAddr));
1858 *ShadowHstPtrAddr = it->second.HstPtrVal;
1859 }
1860 // If the struct is to be deallocated, remove the shadow entry.
1861 if (DelEntry) {
1862 DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr));
1863 Device.ShadowPtrMap.erase(it);
1864 }
1865 }
1866 Device.ShadowMtx.unlock();
1867
1868 // Deallocate map
1869 if (DelEntry) {
1870 int rt = Device.deallocTgtPtr(HstPtrBegin, arg_sizes[i], ForceDelete);
1871 if (rt != OFFLOAD_SUCCESS) {
1872 DP("Deallocating data from device failed.\n");
1873 rc = OFFLOAD_FAIL;
1874 }
1875 }
1876 }
1877 }
1878
1879 return rc;
1880}
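
// Illustrative case for the shadow-pointer pass above: for a struct S { T *p; }
// mapped together with p's pointee, copying S back from the device would leave
// s.p holding the device address attached in target_data_begin; the saved
// HstPtrVal is therefore written back over it, and the shadow entry is dropped
// once the struct's own mapping is deleted.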
1881
1882/// passes data from the target, releases target memory and destroys
1883/// the host-target mapping (top entry from the stack of data maps)
1884/// created by the last __tgt_target_data_begin.
1885EXTERN void __tgt_target_data_end(int32_t device_id, int32_t arg_num,
1886 void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) {
1887 DP("Entering data end region with %d mappings\n", arg_num);
1888
1889 // Resolve the default device if the caller did not specify one.
1890 if (device_id == OFFLOAD_DEVICE_DEFAULT) {
1891 device_id = omp_get_default_device();
1892 }
1893
1894 RTLsMtx.lock();
1895 size_t Devices_size = Devices.size();
1896 RTLsMtx.unlock();
1897 if (Devices_size <= (size_t)device_id) {
1898 DP("Device ID %d does not have a matching RTL.\n", device_id);
1899 return;
1900 }
1901
1902 DeviceTy &Device = Devices[device_id];
1903 if (!Device.IsInit) {
1904 DP("Uninit device: ignore\n");
1905 return;
1906 }
1907
1908 // Translate maps
1909 int32_t new_arg_num;
1910 void **new_args_base;
1911 void **new_args;
1912 int64_t *new_arg_sizes;
1913 int64_t *new_arg_types;
1914 translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num,
1915 new_args_base, new_args, new_arg_sizes, new_arg_types, false);
1916
1917 //target_data_end(Device, arg_num, args_base, args, arg_sizes, arg_types);
1918 target_data_end(Device, new_arg_num, new_args_base, new_args, new_arg_sizes,
1919 new_arg_types);
1920
1921 // Cleanup translation memory
1922 cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes,
1923 new_arg_types, arg_num, args_base);
1924}
1925
1926EXTERN void __tgt_target_data_end_nowait(int32_t device_id, int32_t arg_num,
1927 void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types,
1928 int32_t depNum, void *depList, int32_t noAliasDepNum,
1929 void *noAliasDepList) {
1930 if (depNum + noAliasDepNum > 0)
1931 __kmpc_omp_taskwait(NULL, 0);
1932
1933 __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes,
1934 arg_types);
1935}
1936
1937/// passes data to/from the target.
1938EXTERN void __tgt_target_data_update(int32_t device_id, int32_t arg_num,
1939 void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) {
1940 DP("Entering data update with %d mappings\n", arg_num);
1941
1942 // Resolve the default device if the caller did not specify one.
1943 if (device_id == OFFLOAD_DEVICE_DEFAULT) {
1944 device_id = omp_get_default_device();
1945 }
1946
1947 if (CheckDevice(device_id) != OFFLOAD_SUCCESS) {
1948 DP("Failed to get device %d ready\n", device_id);
1949 return;
1950 }
1951
1952 DeviceTy& Device = Devices[device_id];
1953
1954 // process each input.
1955 for (int32_t i = 0; i < arg_num; ++i) {
1956 if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
1957 (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
1958 continue;
1959
1960 void *HstPtrBegin = args[i];
1961 int64_t MapSize = arg_sizes[i];
1962 bool IsLast;
1963 void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast,
1964 false);
1965
1966 if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
1967 DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
1968 arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
1969 Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize);
1970
1971 uintptr_t lb = (uintptr_t) HstPtrBegin;
1972 uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize;
1973 Device.ShadowMtx.lock();
1974 for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
1975 it != Device.ShadowPtrMap.end(); ++it) {
1976 void **ShadowHstPtrAddr = (void**) it->first;
1977 if ((uintptr_t) ShadowHstPtrAddr < lb)
1978 continue;
1979 if ((uintptr_t) ShadowHstPtrAddr >= ub)
1980 break;
1981 DP("Restoring original host pointer value " DPxMOD " for host pointer "
1982 DPxMOD "\n", DPxPTR(it->second.HstPtrVal),
1983 DPxPTR(ShadowHstPtrAddr));
1984 *ShadowHstPtrAddr = it->second.HstPtrVal;
1985 }
1986 Device.ShadowMtx.unlock();
1987 }
1988
1989 if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
1990 DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
1991 arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
1992 Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize);
1993
1994 uintptr_t lb = (uintptr_t) HstPtrBegin;
1995 uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize;
1996 Device.ShadowMtx.lock();
1997 for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
1998 it != Device.ShadowPtrMap.end(); ++it) {
1999 void **ShadowHstPtrAddr = (void**) it->first;
2000 if ((uintptr_t) ShadowHstPtrAddr < lb)
2001 continue;
2002 if ((uintptr_t) ShadowHstPtrAddr >= ub)
2003 break;
2004 DP("Restoring original target pointer value " DPxMOD " for target "
2005 "pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal),
2006 DPxPTR(it->second.TgtPtrAddr));
2007 Device.data_submit(it->second.TgtPtrAddr,
2008 &it->second.TgtPtrVal, sizeof(void *));
2009 }
2010 Device.ShadowMtx.unlock();
2011 }
2012 }
2013}
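
// Usage sketch (hypothetical names): "#pragma omp target update from(a[0:N])"
// is expected to reach this entry point as a single call of the form
//   __tgt_target_data_update(device_id, 1, base, begin, size, type);
// with type[0] == OMP_TGT_MAPTYPE_FROM (an update "to(...)" clause would use
// OMP_TGT_MAPTYPE_TO instead).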
2014
2015EXTERN void __tgt_target_data_update_nowait(
2016 int32_t device_id, int32_t arg_num, void **args_base, void **args,
2017 int64_t *arg_sizes, int32_t *arg_types, int32_t depNum, void *depList,
2018 int32_t noAliasDepNum, void *noAliasDepList) {
2019 if (depNum + noAliasDepNum > 0)
2020 __kmpc_omp_taskwait(NULL, 0);
2021
2022 __tgt_target_data_update(device_id, arg_num, args_base, args, arg_sizes,
2023 arg_types);
2024}
2025
2026 /// Performs the same actions as data_begin when arg_num is non-zero, then
2027 /// initiates a run of the offloaded region on the target platform; when
2028 /// arg_num is non-zero it also performs the same actions as data_update and
2029 /// data_end above once the region has finished executing. This function
2030 /// returns 0 if it was able to transfer the execution to a target and a
2031 /// non-zero integer otherwise.
2032static int target(int32_t device_id, void *host_ptr, int32_t arg_num,
2033 void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
2034 int32_t team_num, int32_t thread_limit, int IsTeamConstruct) {
2035 DeviceTy &Device = Devices[device_id];
2036
2037 // Find the table information in the map or look it up in the translation
2038 // tables.
2039 TableMap *TM = 0;
2040 TblMapMtx.lock();
2041 HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap.find(host_ptr);
2042 if (TableMapIt == HostPtrToTableMap.end()) {
2043 // We don't have a map. So search all the registered libraries.
2044 TrlTblMtx.lock();
2045 for (HostEntriesBeginToTransTableTy::iterator
2046 ii = HostEntriesBeginToTransTable.begin(),
2047 ie = HostEntriesBeginToTransTable.end();
2048 !TM && ii != ie; ++ii) {
2049 // get the translation table (which contains all the good info).
2050 TranslationTable *TransTable = &ii->second;
2051 // iterate over all the host table entries to see if we can locate the
2052 // host_ptr.
2053 __tgt_offload_entry *begin = TransTable->HostTable.EntriesBegin;
2054 __tgt_offload_entry *end = TransTable->HostTable.EntriesEnd;
2055 __tgt_offload_entry *cur = begin;
2056 for (uint32_t i = 0; cur < end; ++cur, ++i) {
2057 if (cur->addr != host_ptr)
2058 continue;
2059 // we got a match, now fill the HostPtrToTableMap so that we
2060 // may avoid this search next time.
2061 TM = &HostPtrToTableMap[host_ptr];
2062 TM->Table = TransTable;
2063 TM->Index = i;
2064 break;
2065 }
2066 }
2067 TrlTblMtx.unlock();
2068 } else {
2069 TM = &TableMapIt->second;
2070 }
2071 TblMapMtx.unlock();
2072
2073 // No map for this host pointer found!
2074 if (!TM) {
2075 DP("Host ptr " DPxMOD " does not have a matching target pointer.\n",
2076 DPxPTR(host_ptr));
2077 return OFFLOAD_FAIL;
2078 }
2079
2080 // get target table.
2081 TrlTblMtx.lock();
2082 assert(TM->Table->TargetsTable.size() > (size_t)device_id &&
2083 "Not expecting a device ID outside the table's bounds!");
2084 __tgt_target_table *TargetTable = TM->Table->TargetsTable[device_id];
2085 TrlTblMtx.unlock();
2086 assert(TargetTable && "Global data has not been mapped\n");
2087
2088 // Move data to device.
2089 int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes,
2090 arg_types);
2091
2092 if (rc != OFFLOAD_SUCCESS) {
2093 DP("Call to target_data_begin failed, skipping target execution.\n");
2094 // Call target_data_end to dealloc whatever target_data_begin allocated
2095 // and return OFFLOAD_FAIL.
2096 target_data_end(Device, arg_num, args_base, args, arg_sizes, arg_types);
2097 return OFFLOAD_FAIL;
2098 }
2099
2100 std::vector<void *> tgt_args;
2101
2102 // List of (first-)private arrays allocated for this target region
2103 std::vector<void *> fpArrays;
2104
2105 for (int32_t i = 0; i < arg_num; ++i) {
2106 if (!(arg_types[i] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
2107 // This is not a target parameter, do not push it into tgt_args.
2108 continue;
2109 }
2110 void *HstPtrBegin = args[i];
2111 void *HstPtrBase = args_base[i];
2112 void *TgtPtrBase;
2113 bool IsLast; // unused.
2114 if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) {
2115 DP("Forwarding first-private value " DPxMOD " to the target construct\n",
2116 DPxPTR(HstPtrBase));
2117 TgtPtrBase = HstPtrBase;
2118 } else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) {
2119 // Allocate memory for (first-)private array
2120 void *TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID,
2121 arg_sizes[i]);
2122 if (!TgtPtrBegin) {
2123 DP ("Data allocation for %sprivate array " DPxMOD " failed\n",
2124 (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""),
2125 DPxPTR(HstPtrBegin));
2126 rc = OFFLOAD_FAIL;
2127 break;
2128 } else {
2129 fpArrays.push_back(TgtPtrBegin);
2130 uint64_t PtrDelta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
2131 TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - PtrDelta);
2132 DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for "
2133 "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n",
2134 arg_sizes[i], DPxPTR(TgtPtrBegin),
2135 (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""),
2136 DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase));
2137 // If first-private, copy data from host
2138 if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
2139 int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]);
2140 if (rt != OFFLOAD_SUCCESS) {
2141 DP ("Copying data to device failed.\n");
2142 rc = OFFLOAD_FAIL;
2143 break;
2144 }
2145 }
2146 }
2147 } else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
2148 void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *),
2149 IsLast, false);
2150 TgtPtrBase = TgtPtrBegin; // no offset for ptrs.
2151 DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to "
2152 "object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase),
2153 DPxPTR(HstPtrBase));
2154 } else {
2155 void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i],
2156 IsLast, false);
2157 uint64_t PtrDelta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
2158 TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - PtrDelta);
2159 DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
2160 DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
2161 }
2162 tgt_args.push_back(TgtPtrBase);
2163 }
2164 // Push omp handle.
2165 tgt_args.push_back((void *)0);
2166
2167 // Pop loop trip count
2168 uint64_t ltc = Device.loopTripCnt;
2169 Device.loopTripCnt = 0;
2170
2171 // Launch device execution.
2172 if (rc == OFFLOAD_SUCCESS) {
2173 DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
2174 TargetTable->EntriesBegin[TM->Index].name,
2175 DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index);
2176 if (IsTeamConstruct) {
2177 rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr,
2178 &tgt_args[0], tgt_args.size(), team_num, thread_limit, ltc);
2179 } else {
2180 rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr,
2181 &tgt_args[0], tgt_args.size());
2182 }
2183 } else {
2184 DP("Errors occurred while obtaining target arguments, skipping kernel "
2185 "execution\n");
2186 }
2187
2188 // Deallocate (first-)private arrays
2189 for (auto it : fpArrays) {
2190 int rt = Device.RTL->data_delete(Device.RTLDeviceID, it);
2191 if (rt != OFFLOAD_SUCCESS) {
2192 DP("Deallocation of (first-)private arrays failed.\n");
2193 rc = OFFLOAD_FAIL;
2194 }
2195 }
2196
2197 // Move data from device.
2198 int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes,
2199 arg_types);
2200
2201 if (rt != OFFLOAD_SUCCESS) {
2202 DP("Call to target_data_end failed.\n");
2203 rc = OFFLOAD_FAIL;
2204 }
2205
2206 return rc;
2207}
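
// Illustrative summary (hypothetical region): for a kernel with one mapped
// array (flagged TARGET_PARAM) and one firstprivate scalar (LITERAL plus
// TARGET_PARAM), tgt_args above ends up holding two pointer-sized values, the
// array's translated device base address and the scalar's value forwarded
// verbatim, followed by the trailing null "omp handle".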
2208
2209EXTERN int __tgt_target(int32_t device_id, void *host_ptr, int32_t arg_num,
2210 void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) {
2211 if (device_id == OFFLOAD_DEVICE_CONSTRUCTOR ||
2212 device_id == OFFLOAD_DEVICE_DESTRUCTOR) {
2213 // Return immediately for the time being; target calls with device_id
2214 // -2 or -3 will be removed from the compiler in the future.
2215 return OFFLOAD_SUCCESS;
2216 }
2217
2218 DP("Entering target region with entry point " DPxMOD " and device Id %d\n",
2219 DPxPTR(host_ptr), device_id);
2220
2221 if (device_id == OFFLOAD_DEVICE_DEFAULT) {
2222 device_id = omp_get_default_device();
2223 }
2224
2225 if (CheckDevice(device_id) != OFFLOAD_SUCCESS) {
2226 DP("Failed to get device %d ready\n", device_id);
2227 return OFFLOAD_FAIL;
2228 }
2229
2230 // Translate maps
2231 int32_t new_arg_num;
2232 void **new_args_base;
2233 void **new_args;
2234 int64_t *new_arg_sizes;
2235 int64_t *new_arg_types;
2236 translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num,
2237 new_args_base, new_args, new_arg_sizes, new_arg_types, true);
2238
2239 //return target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
2240 // arg_types, 0, 0, false /*team*/, false /*recursive*/);
2241 int rc = target(device_id, host_ptr, new_arg_num, new_args_base, new_args,
2242 new_arg_sizes, new_arg_types, 0, 0, false /*team*/);
2243
2244 // Cleanup translation memory
2245 cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes,
2246 new_arg_types, arg_num, args_base);
2247
2248 return rc;
2249}
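
// Usage sketch (hypothetical lowering, outlined_fn is an assumed name): the
// compiler passes the host address of the outlined region as host_ptr and
// branches on the result, e.g.
//   if (__tgt_target(dev_id, (void *)&outlined_fn, n, base, begin, size, type)
//       != OFFLOAD_SUCCESS)
//     outlined_fn(...);  // fall back to host execution of the region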
2250
2251EXTERN int __tgt_target_nowait(int32_t device_id, void *host_ptr,
2252 int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes,
2253 int32_t *arg_types, int32_t depNum, void *depList, int32_t noAliasDepNum,
2254 void *noAliasDepList) {
2255 if (depNum + noAliasDepNum > 0)
2256 __kmpc_omp_taskwait(NULL, 0);
2257
2258 return __tgt_target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
2259 arg_types);
2260}
2261
2262EXTERN int __tgt_target_teams(int32_t device_id, void *host_ptr,
2263 int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes,
2264 int32_t *arg_types, int32_t team_num, int32_t thread_limit) {
2265 if (device_id == OFFLOAD_DEVICE_CONSTRUCTOR ||
2266 device_id == OFFLOAD_DEVICE_DESTRUCTOR) {
2267 // Return immediately for the time being; target calls with device_id
2268 // -2 or -3 will be removed from the compiler in the future.
2269 return OFFLOAD_SUCCESS;
2270 }
2271
2272 DP("Entering target region with entry point " DPxMOD " and device Id %d\n",
2273 DPxPTR(host_ptr), device_id);
2274
2275 if (device_id == OFFLOAD_DEVICE_DEFAULT) {
2276 device_id = omp_get_default_device();
2277 }
2278
2279 if (CheckDevice(device_id) != OFFLOAD_SUCCESS) {
2280 DP("Failed to get device %d ready\n", device_id);
2281 return OFFLOAD_FAIL;
2282 }
2283
2284 // Translate maps
2285 int32_t new_arg_num;
2286 void **new_args_base;
2287 void **new_args;
2288 int64_t *new_arg_sizes;
2289 int64_t *new_arg_types;
2290 translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num,
2291 new_args_base, new_args, new_arg_sizes, new_arg_types, true);
2292
2293 //return target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
2294 // arg_types, team_num, thread_limit, true /*team*/,
2295 // false /*recursive*/);
2296 int rc = target(device_id, host_ptr, new_arg_num, new_args_base, new_args,
2297 new_arg_sizes, new_arg_types, team_num, thread_limit, true /*team*/);
2298
2299 // Cleanup translation memory
2300 cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes,
2301 new_arg_types, arg_num, args_base);
2302
2303 return rc;
2304}
2305
2306EXTERN int __tgt_target_teams_nowait(int32_t device_id, void *host_ptr,
2307 int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes,
2308 int32_t *arg_types, int32_t team_num, int32_t thread_limit, int32_t depNum,
2309 void *depList, int32_t noAliasDepNum, void *noAliasDepList) {
2310 if (depNum + noAliasDepNum > 0)
2311 __kmpc_omp_taskwait(NULL, 0);
2312
2313 return __tgt_target_teams(device_id, host_ptr, arg_num, args_base, args,
2314 arg_sizes, arg_types, team_num, thread_limit);
2315}
2316
2317
2318// The trip count mechanism will be revised - this scheme is not thread-safe.
2319EXTERN void __kmpc_push_target_tripcount(int32_t device_id,
2320 uint64_t loop_tripcount) {
2321 if (device_id == OFFLOAD_DEVICE_DEFAULT) {
2322 device_id = omp_get_default_device();
2323 }
2324
2325 if (CheckDevice(device_id) != OFFLOAD_SUCCESS) {
2326 DP("Failed to get device %d ready\n", device_id);
2327 return;
2328 }
2329
2330 DP("__kmpc_push_target_tripcount(%d, %" PRIu64 ")\n", device_id,
2331 loop_tripcount);
2332 Devices[device_id].loopTripCnt = loop_tripcount;
2333}
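
// Usage sketch (hypothetical): for "#pragma omp target teams distribute
// parallel for" over N iterations the compiler is expected to emit
//   __kmpc_push_target_tripcount(device_id, N);
// immediately before the matching __tgt_target_teams call; target() above then
// pops the stored value and forwards it to run_team_region so the plugin can
// size its launch accordingly.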
2334