1//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is dual licensed under the MIT and the University of Illinois Open
6// Source Licenses. See LICENSE.txt for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// Implementation of the interface to be used by Clang during the codegen of a
11// target region.
12//
13//===----------------------------------------------------------------------===//
14
15#include <algorithm>
16#include <cassert>
17#include <climits>
18#include <cstdlib>
19#include <cstring>
20#include <dlfcn.h>
21#include <list>
22#include <map>
23#include <mutex>
24#include <string>
25#include <vector>
26
27// Header file global to this project
28#include "omptarget.h"
29
30#define DP(...) DEBUGP("Libomptarget", __VA_ARGS__)
31#define INF_REF_CNT (LONG_MAX>>1) // leave room for additions/subtractions
32#define CONSIDERED_INF(x) (x > (INF_REF_CNT>>1))
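// Worked example (illustrative, not from the original source): an entry pinned
// by omp_target_associate_ptr starts with RefCount == INF_REF_CNT. Even after a
// few stray decrements it stays far above INF_REF_CNT >> 1, so CONSIDERED_INF()
// keeps treating it as infinite and the entry is never deallocated behind the
// user's back:
//
//   long rc = INF_REF_CNT;       // as set by DeviceTy::associatePtr()
//   --rc; --rc;                  // unrelated reference-count decrements
//   assert(CONSIDERED_INF(rc));  // still treated as "infinite"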
33
34// List of all plugins that can support offloading.
35static const char *RTLNames[] = {
36 /* PowerPC target */ "libomptarget.rtl.ppc64.so",
37 /* x86_64 target */ "libomptarget.rtl.x86_64.so",
38 /* CUDA target */ "libomptarget.rtl.cuda.so",
39 /* AArch64 target */ "libomptarget.rtl.aarch64.so"};
40
41// forward declarations
42struct RTLInfoTy;
43static int target(int32_t device_id, void *host_ptr, int32_t arg_num,
44 void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
45 int32_t team_num, int32_t thread_limit, int IsTeamConstruct);
46
47/// Map between host data and target data.
48struct HostDataToTargetTy {
49 uintptr_t HstPtrBase; // host info.
50 uintptr_t HstPtrBegin;
51 uintptr_t HstPtrEnd; // non-inclusive.
52
53 uintptr_t TgtPtrBegin; // target info.
54
55 long RefCount;
56
57 HostDataToTargetTy()
58 : HstPtrBase(0), HstPtrBegin(0), HstPtrEnd(0),
59 TgtPtrBegin(0), RefCount(0) {}
60 HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TB)
61 : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E),
62 TgtPtrBegin(TB), RefCount(1) {}
63};
64
65typedef std::list<HostDataToTargetTy> HostDataToTargetListTy;
66
67struct LookupResult {
68 struct {
69 unsigned IsContained : 1;
70 unsigned ExtendsBefore : 1;
71 unsigned ExtendsAfter : 1;
72 } Flags;
73
74 HostDataToTargetListTy::iterator Entry;
75
76 LookupResult() : Flags({0,0,0}), Entry() {}
77};
78
79/// Map for shadow pointers
80struct ShadowPtrValTy {
81 void *HstPtrVal;
82 void *TgtPtrAddr;
83 void *TgtPtrVal;
84};
85typedef std::map<void *, ShadowPtrValTy> ShadowPtrListTy;
86
87/// Pending constructors/destructors of a registered library that still have to run on a device.
88struct PendingCtorDtorListsTy {
89 std::list<void *> PendingCtors;
90 std::list<void *> PendingDtors;
91};
92typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy>
93 PendingCtorsDtorsPerLibrary;
94
95struct DeviceTy {
96 int32_t DeviceID;
97 RTLInfoTy *RTL;
98 int32_t RTLDeviceID;
99
100 bool IsInit;
101 std::once_flag InitFlag;
102 bool HasPendingGlobals;
103
104 HostDataToTargetListTy HostDataToTargetMap;
105 PendingCtorsDtorsPerLibrary PendingCtorsDtors;
106
107 ShadowPtrListTy ShadowPtrMap;
108
109 std::mutex DataMapMtx, PendingGlobalsMtx, ShadowMtx;
110
111 uint64_t loopTripCnt;
112
113 DeviceTy(RTLInfoTy *RTL)
114 : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(),
115 HasPendingGlobals(false), HostDataToTargetMap(),
116 PendingCtorsDtors(), ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(),
117 ShadowMtx(), loopTripCnt(0) {}
118
119 // The existence of mutexes makes DeviceTy non-copyable. We need to
120 // provide a copy constructor and an assignment operator explicitly.
121 DeviceTy(const DeviceTy &d)
122 : DeviceID(d.DeviceID), RTL(d.RTL), RTLDeviceID(d.RTLDeviceID),
123 IsInit(d.IsInit), InitFlag(), HasPendingGlobals(d.HasPendingGlobals),
124 HostDataToTargetMap(d.HostDataToTargetMap),
125 PendingCtorsDtors(d.PendingCtorsDtors), ShadowPtrMap(d.ShadowPtrMap),
126 DataMapMtx(), PendingGlobalsMtx(),
127 ShadowMtx(), loopTripCnt(d.loopTripCnt) {}
128
129 DeviceTy& operator=(const DeviceTy &d) {
130 DeviceID = d.DeviceID;
131 RTL = d.RTL;
132 RTLDeviceID = d.RTLDeviceID;
133 IsInit = d.IsInit;
134 HasPendingGlobals = d.HasPendingGlobals;
135 HostDataToTargetMap = d.HostDataToTargetMap;
136 PendingCtorsDtors = d.PendingCtorsDtors;
137 ShadowPtrMap = d.ShadowPtrMap;
138 loopTripCnt = d.loopTripCnt;
139
140 return *this;
141 }
142
143 long getMapEntryRefCnt(void *HstPtrBegin);
144 LookupResult lookupMapping(void *HstPtrBegin, int64_t Size);
145 void *getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size,
146 bool &IsNew, bool IsImplicit, bool UpdateRefCount = true);
147 void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size);
148 void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
149 bool UpdateRefCount);
150 int deallocTgtPtr(void *TgtPtrBegin, int64_t Size, bool ForceDelete);
151 int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
152 int disassociatePtr(void *HstPtrBegin);
153
154 // calls to RTL
155 int32_t initOnce();
156 __tgt_target_table *load_binary(void *Img);
157
158 int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size);
159 int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
160
161 int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr, int32_t TgtVarsSize);
162 int32_t run_team_region(void *TgtEntryPtr, void **TgtVarsPtr,
163 int32_t TgtVarsSize, int32_t NumTeams, int32_t ThreadLimit,
164 uint64_t LoopTripCount);
165
166private:
167 // Call to RTL
168 void init(); // To be called only via DeviceTy::initOnce()
169};
170
171/// Map between Device ID (i.e. openmp device id) and its DeviceTy.
172typedef std::vector<DeviceTy> DevicesTy;
173static DevicesTy Devices;
174
175struct RTLInfoTy {
176 typedef int32_t(is_valid_binary_ty)(void *);
177 typedef int32_t(number_of_devices_ty)();
178 typedef int32_t(init_device_ty)(int32_t);
179 typedef __tgt_target_table *(load_binary_ty)(int32_t, void *);
180 typedef void *(data_alloc_ty)(int32_t, int64_t);
181 typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t);
182 typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t);
183 typedef int32_t(data_delete_ty)(int32_t, void *);
184 typedef int32_t(run_region_ty)(int32_t, void *, void **, int32_t);
185 typedef int32_t(run_team_region_ty)(int32_t, void *, void **, int32_t,
186 int32_t, int32_t, uint64_t);
187
188 int32_t Idx; // RTL index; equals the number of devices
189 // registered by RTLs loaded before this
190 // one, i.e. the OpenMP device ID of the
191 // first device to be registered with this RTL.
192 int32_t NumberOfDevices; // Number of devices this RTL deals with.
193 std::vector<DeviceTy *> Devices; // one per device (NumberOfDevices in total).
194
195 void *LibraryHandler;
196
197#ifdef OMPTARGET_DEBUG
198 std::string RTLName;
199#endif
200
201 // Functions implemented in the RTL.
202 is_valid_binary_ty *is_valid_binary;
203 number_of_devices_ty *number_of_devices;
204 init_device_ty *init_device;
205 load_binary_ty *load_binary;
206 data_alloc_ty *data_alloc;
207 data_submit_ty *data_submit;
208 data_retrieve_ty *data_retrieve;
209 data_delete_ty *data_delete;
210 run_region_ty *run_region;
211 run_team_region_ty *run_team_region;
212
213 // Whether there are images associated with this RTL.
214 bool isUsed;
215
216 // Mutex for thread-safety when calling RTL interface functions.
217 // It is easier to enforce thread-safety at the libomptarget level,
218 // so that developers of new RTLs do not have to worry about it.
219 std::mutex Mtx;
220
221 // The existence of the mutex above makes RTLInfoTy non-copyable.
222 // We need to provide a copy constructor explicitly.
223 RTLInfoTy()
224 : Idx(-1), NumberOfDevices(-1), Devices(), LibraryHandler(0),
225#ifdef OMPTARGET_DEBUG
226 RTLName(),
227#endif
228 is_valid_binary(0), number_of_devices(0), init_device(0),
229 load_binary(0), data_alloc(0), data_submit(0), data_retrieve(0),
230 data_delete(0), run_region(0), run_team_region(0), isUsed(false),
231 Mtx() {}
232
233 RTLInfoTy(const RTLInfoTy &r) : Mtx() {
234 Idx = r.Idx;
235 NumberOfDevices = r.NumberOfDevices;
236 Devices = r.Devices;
237 LibraryHandler = r.LibraryHandler;
238#ifdef OMPTARGET_DEBUG
239 RTLName = r.RTLName;
240#endif
241 is_valid_binary = r.is_valid_binary;
242 number_of_devices = r.number_of_devices;
243 init_device = r.init_device;
244 load_binary = r.load_binary;
245 data_alloc = r.data_alloc;
246 data_submit = r.data_submit;
247 data_retrieve = r.data_retrieve;
248 data_delete = r.data_delete;
249 run_region = r.run_region;
250 run_team_region = r.run_team_region;
251 isUsed = r.isUsed;
252 }
253};
254
255/// RTLs identified in the system.
256class RTLsTy {
257private:
258 // Mutex-like object to guarantee thread-safety and unique initialization
259 // (i.e. the library attempts to load the RTLs (plugins) only once).
260 std::once_flag initFlag;
261 void LoadRTLs(); // not thread-safe
262
263public:
264 // List of the detected runtime libraries.
265 std::list<RTLInfoTy> AllRTLs;
266
267 // Array of pointers to the detected runtime libraries that have compatible
268 // binaries.
269 std::vector<RTLInfoTy *> UsedRTLs;
270
271 explicit RTLsTy() {}
272
273 // Load all the runtime libraries (plugins) if not done before.
274 void LoadRTLsOnce();
275};
276
277void RTLsTy::LoadRTLs() {
278 // Parse environment variable OMP_TARGET_OFFLOAD (if set)
279 char *envStr = getenv("OMP_TARGET_OFFLOAD");
280 if (envStr && !strcmp(envStr, "DISABLED")) {
281 DP("Target offloading disabled by environment\n");
282 return;
283 }
284
285 DP("Loading RTLs...\n");
286
287 // Attempt to open all the plugins and, for those that exist, check that the
288 // interface is complete and that they support at least one device.
289 for (auto *Name : RTLNames) {
290 DP("Loading library '%s'...\n", Name);
291 void *dynlib_handle = dlopen(Name, RTLD_NOW);
292
293 if (!dynlib_handle) {
294 // Library does not exist or cannot be found.
295 DP("Unable to load library '%s': %s!\n", Name, dlerror());
296 continue;
297 }
298
299 DP("Successfully loaded library '%s'!\n", Name);
300
301 // Retrieve the RTL information from the runtime library.
302 RTLInfoTy R;
303
304 R.LibraryHandler = dynlib_handle;
305 R.isUsed = false;
306
307#ifdef OMPTARGET_DEBUG
308 R.RTLName = Name;
309#endif
310
311 if (!(R.is_valid_binary = (RTLInfoTy::is_valid_binary_ty *)dlsym(
312 dynlib_handle, "__tgt_rtl_is_valid_binary")))
313 continue;
314 if (!(R.number_of_devices = (RTLInfoTy::number_of_devices_ty *)dlsym(
315 dynlib_handle, "__tgt_rtl_number_of_devices")))
316 continue;
317 if (!(R.init_device = (RTLInfoTy::init_device_ty *)dlsym(
318 dynlib_handle, "__tgt_rtl_init_device")))
319 continue;
320 if (!(R.load_binary = (RTLInfoTy::load_binary_ty *)dlsym(
321 dynlib_handle, "__tgt_rtl_load_binary")))
322 continue;
323 if (!(R.data_alloc = (RTLInfoTy::data_alloc_ty *)dlsym(
324 dynlib_handle, "__tgt_rtl_data_alloc")))
325 continue;
326 if (!(R.data_submit = (RTLInfoTy::data_submit_ty *)dlsym(
327 dynlib_handle, "__tgt_rtl_data_submit")))
328 continue;
329 if (!(R.data_retrieve = (RTLInfoTy::data_retrieve_ty *)dlsym(
330 dynlib_handle, "__tgt_rtl_data_retrieve")))
331 continue;
332 if (!(R.data_delete = (RTLInfoTy::data_delete_ty *)dlsym(
333 dynlib_handle, "__tgt_rtl_data_delete")))
334 continue;
335 if (!(R.run_region = (RTLInfoTy::run_region_ty *)dlsym(
336 dynlib_handle, "__tgt_rtl_run_target_region")))
337 continue;
338 if (!(R.run_team_region = (RTLInfoTy::run_team_region_ty *)dlsym(
339 dynlib_handle, "__tgt_rtl_run_target_team_region")))
340 continue;
341
342 // Skip this RTL if it does not support any devices.
343 if (!(R.NumberOfDevices = R.number_of_devices())) {
344 DP("No devices supported in this RTL\n");
345 continue;
346 }
347
348 DP("Registering RTL %s supporting %d devices!\n",
349 R.RTLName.c_str(), R.NumberOfDevices);
350
351 // The RTL is valid! Save its information in the RTLs list.
352 AllRTLs.push_back(R);
353 }
354
355 DP("RTLs loaded!\n");
356
357 return;
358}
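// Illustrative sketch (not part of this file): the minimal set of entry points
// an offload plugin such as libomptarget.rtl.<arch>.so has to export so that
// every dlsym() lookup above succeeds. The signatures mirror the typedefs in
// RTLInfoTy; the parameter names are made up for readability.
//
//   extern "C" {
//   int32_t __tgt_rtl_is_valid_binary(void *Image);
//   int32_t __tgt_rtl_number_of_devices();
//   int32_t __tgt_rtl_init_device(int32_t DeviceId);
//   __tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId, void *Image);
//   void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size);
//   int32_t __tgt_rtl_data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
//                                 int64_t Size);
//   int32_t __tgt_rtl_data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
//                                   int64_t Size);
//   int32_t __tgt_rtl_data_delete(int32_t DeviceId, void *TgtPtr);
//   int32_t __tgt_rtl_run_target_region(int32_t DeviceId, void *Entry,
//                                       void **Args, int32_t NumArgs);
//   int32_t __tgt_rtl_run_target_team_region(int32_t DeviceId, void *Entry,
//                                            void **Args, int32_t NumArgs,
//                                            int32_t NumTeams, int32_t ThreadLimit,
//                                            uint64_t LoopTripCount);
//   }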
359
360void RTLsTy::LoadRTLsOnce() {
361 // RTL.LoadRTLs() is called only once in a thread-safe fashion.
362 std::call_once(initFlag, &RTLsTy::LoadRTLs, this);
363}
364
365static RTLsTy RTLs;
366static std::mutex RTLsMtx;
367
368/// Map between the host entry begin and the translation table. Each
369/// registered library gets one TranslationTable. Use the map from
370/// __tgt_offload_entry so that we may quickly determine whether we
371/// are trying to (re)register an existing lib or really have a new one.
372struct TranslationTable {
373 __tgt_target_table HostTable;
374
375 // Image assigned to a given device.
376 std::vector<__tgt_device_image *> TargetsImages; // One image per device ID.
377
378 // Table of entry points or NULL if it was not already computed.
379 std::vector<__tgt_target_table *> TargetsTable; // One table per device ID.
380};
381typedef std::map<__tgt_offload_entry *, TranslationTable>
382 HostEntriesBeginToTransTableTy;
383static HostEntriesBeginToTransTableTy HostEntriesBeginToTransTable;
384static std::mutex TrlTblMtx;
385
386/// Map between the host ptr and a table index
387struct TableMap {
388 TranslationTable *Table; // table associated with the host ptr.
389 uint32_t Index; // index in which the host ptr translated entry is found.
390 TableMap() : Table(0), Index(0) {}
391 TableMap(TranslationTable *table, uint32_t index)
392 : Table(table), Index(index) {}
393};
394typedef std::map<void *, TableMap> HostPtrToTableMapTy;
395static HostPtrToTableMapTy HostPtrToTableMap;
396static std::mutex TblMapMtx;
397
398/// Check whether a device has an associated RTL and initialize it if it's not
399/// already initialized.
400static bool device_is_ready(int device_num) {
401 DP("Checking whether device %d is ready.\n", device_num);
402 // Devices.size() can only change while registering a new
403 // library, so acquire the RTLs' mutex before reading it.
404 RTLsMtx.lock();
405 size_t Devices_size = Devices.size();
406 RTLsMtx.unlock();
407 if (Devices_size <= (size_t)device_num) {
408 DP("Device ID %d does not have a matching RTL\n", device_num);
409 return false;
410 }
411
412 // Get device info
413 DeviceTy &Device = Devices[device_num];
414
415 DP("Is the device %d (local ID %d) initialized? %d\n", device_num,
416 Device.RTLDeviceID, Device.IsInit);
417
418 // Init the device if not done before
419 if (!Device.IsInit && Device.initOnce() != OFFLOAD_SUCCESS) {
420 DP("Failed to init device %d\n", device_num);
421 return false;
422 }
423
424 DP("Device %d is ready to use.\n", device_num);
425
426 return true;
427}
428
429////////////////////////////////////////////////////////////////////////////////
430// Target API functions
431//
432EXTERN int omp_get_num_devices(void) {
433 RTLsMtx.lock();
434 size_t Devices_size = Devices.size();
435 RTLsMtx.unlock();
436
437 DP("Call to omp_get_num_devices returning %zd\n", Devices_size);
438
439 return Devices_size;
440}
441
442EXTERN int omp_get_initial_device(void) {
443 DP("Call to omp_get_initial_device returning %d\n", HOST_DEVICE);
444 return HOST_DEVICE;
445}
446
447EXTERN void *omp_target_alloc(size_t size, int device_num) {
448 DP("Call to omp_target_alloc for device %d requesting %zu bytes\n",
449 device_num, size);
450
451 if (size <= 0) {
452 DP("Call to omp_target_alloc with non-positive length\n");
453 return NULL;
454 }
455
456 void *rc = NULL;
457
458 if (device_num == omp_get_initial_device()) {
459 rc = malloc(size);
460 DP("omp_target_alloc returns host ptr " DPxMOD "\n", DPxPTR(rc));
461 return rc;
462 }
463
464 if (!device_is_ready(device_num)) {
465 DP("omp_target_alloc returns NULL ptr\n");
466 return NULL;
467 }
468
469 DeviceTy &Device = Devices[device_num];
470 rc = Device.RTL->data_alloc(Device.RTLDeviceID, size);
471 DP("omp_target_alloc returns device ptr " DPxMOD "\n", DPxPTR(rc));
472 return rc;
473}
474
475EXTERN void omp_target_free(void *device_ptr, int device_num) {
476 DP("Call to omp_target_free for device %d and address " DPxMOD "\n",
477 device_num, DPxPTR(device_ptr));
478
479 if (!device_ptr) {
480 DP("Call to omp_target_free with NULL ptr\n");
481 return;
482 }
483
484 if (device_num == omp_get_initial_device()) {
485 free(device_ptr);
486 DP("omp_target_free deallocated host ptr\n");
487 return;
488 }
489
490 if (!device_is_ready(device_num)) {
491 DP("omp_target_free returns, nothing to do\n");
492 return;
493 }
494
495 DeviceTy &Device = Devices[device_num];
496 Device.RTL->data_delete(Device.RTLDeviceID, (void *)device_ptr);
497 DP("omp_target_free deallocated device ptr\n");
498}
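// Usage sketch (illustrative, not part of libomptarget): allocating and freeing
// device memory through the two API entry points above. Assumes an OpenMP
// offload-capable toolchain; error handling is reduced to a null check.
//
//   void example_alloc_free(int dev) {
//     void *p = omp_target_alloc(1024, dev); // device allocation (host malloc if
//                                            // dev == omp_get_initial_device())
//     if (p)
//       omp_target_free(p, dev);             // must be freed on the same device
//   }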
499
500EXTERN int omp_target_is_present(void *ptr, int device_num) {
501 DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n",
502 device_num, DPxPTR(ptr));
503
504 if (!ptr) {
505 DP("Call to omp_target_is_present with NULL ptr, returning false\n");
506 return false;
507 }
508
509 if (device_num == omp_get_initial_device()) {
510 DP("Call to omp_target_is_present on host, returning true\n");
511 return true;
512 }
513
514 RTLsMtx.lock();
515 size_t Devices_size = Devices.size();
516 RTLsMtx.unlock();
517 if (Devices_size <= (size_t)device_num) {
518 DP("Call to omp_target_is_present with invalid device ID, returning "
519 "false\n");
520 return false;
521 }
522
523 DeviceTy& Device = Devices[device_num];
524 bool IsLast; // not used
525 int rc = (Device.getTgtPtrBegin(ptr, 0, IsLast, false) != NULL);
526 DP("Call to omp_target_is_present returns %d\n", rc);
527 return rc;
528}
529
530EXTERN int omp_target_memcpy(void *dst, void *src, size_t length,
531 size_t dst_offset, size_t src_offset, int dst_device, int src_device) {
532 DP("Call to omp_target_memcpy, dst device %d, src device %d, "
533 "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
534 "src offset %zu, length %zu\n", dst_device, src_device, DPxPTR(dst),
535 DPxPTR(src), dst_offset, src_offset, length);
536
537 if (!dst || !src || length <= 0) {
538 DP("Call to omp_target_memcpy with invalid arguments\n");
539 return OFFLOAD_FAIL;
540 }
541
542 if (src_device != omp_get_initial_device() && !device_is_ready(src_device)) {
543 DP("omp_target_memcpy returns OFFLOAD_FAIL\n");
544 return OFFLOAD_FAIL;
545 }
546
547 if (dst_device != omp_get_initial_device() && !device_is_ready(dst_device)) {
548 DP("omp_target_memcpy returns OFFLOAD_FAIL\n");
549 return OFFLOAD_FAIL;
550 }
551
552 int rc = OFFLOAD_SUCCESS;
553 void *srcAddr = (char *)src + src_offset;
554 void *dstAddr = (char *)dst + dst_offset;
555
556 if (src_device == omp_get_initial_device() &&
557 dst_device == omp_get_initial_device()) {
558 DP("copy from host to host\n");
559 const void *p = memcpy(dstAddr, srcAddr, length);
560 if (p == NULL)
561 rc = OFFLOAD_FAIL;
562 } else if (src_device == omp_get_initial_device()) {
563 DP("copy from host to device\n");
564 DeviceTy& DstDev = Devices[dst_device];
565 rc = DstDev.data_submit(dstAddr, srcAddr, length);
566 } else if (dst_device == omp_get_initial_device()) {
567 DP("copy from device to host\n");
568 DeviceTy& SrcDev = Devices[src_device];
569 rc = SrcDev.data_retrieve(dstAddr, srcAddr, length);
570 } else {
571 DP("copy from device to device\n");
572 void *buffer = malloc(length);
573 DeviceTy& SrcDev = Devices[src_device];
574 DeviceTy& DstDev = Devices[dst_device];
575 rc = SrcDev.data_retrieve(buffer, srcAddr, length);
576 if (rc == OFFLOAD_SUCCESS)
577 rc = DstDev.data_submit(dstAddr, buffer, length);
578 }
579
580 DP("omp_target_memcpy returns %d\n", rc);
581 return rc;
582}
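// Usage sketch (illustrative): copying a host buffer into device memory with
// omp_target_memcpy. The host side is identified by omp_get_initial_device(),
// matching the special-casing in the implementation above.
//
//   int example_copy_to_device(int dev) {
//     double host_buf[256] = {0};
//     void *dev_buf = omp_target_alloc(sizeof(host_buf), dev);
//     if (!dev_buf)
//       return OFFLOAD_FAIL;
//     int rc = omp_target_memcpy(dev_buf, host_buf, sizeof(host_buf),
//                                /*dst_offset=*/0, /*src_offset=*/0,
//                                /*dst_device=*/dev,
//                                /*src_device=*/omp_get_initial_device());
//     omp_target_free(dev_buf, dev);
//     return rc;
//   }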
583
584EXTERN int omp_target_memcpy_rect(void *dst, void *src, size_t element_size,
585 int num_dims, const size_t *volume, const size_t *dst_offsets,
586 const size_t *src_offsets, const size_t *dst_dimensions,
587 const size_t *src_dimensions, int dst_device, int src_device) {
588 DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, "
589 "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
590 "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
591 "volume " DPxMOD ", element size %zu, num_dims %d\n", dst_device,
592 src_device, DPxPTR(dst), DPxPTR(src), DPxPTR(dst_offsets),
593 DPxPTR(src_offsets), DPxPTR(dst_dimensions), DPxPTR(src_dimensions),
594 DPxPTR(volume), element_size, num_dims);
595
596 if (!(dst || src)) {
597 DP("Call to omp_target_memcpy_rect returns max supported dimensions %d\n",
598 INT_MAX);
599 return INT_MAX;
600 }
601
602 if (!dst || !src || element_size < 1 || num_dims < 1 || !volume ||
603 !dst_offsets || !src_offsets || !dst_dimensions || !src_dimensions) {
604 DP("Call to omp_target_memcpy_rect with invalid arguments\n");
605 return OFFLOAD_FAIL;
606 }
607
608 int rc;
609 if (num_dims == 1) {
610 rc = omp_target_memcpy(dst, src, element_size * volume[0],
611 element_size * dst_offsets[0], element_size * src_offsets[0],
612 dst_device, src_device);
613 } else {
614 size_t dst_slice_size = element_size;
615 size_t src_slice_size = element_size;
616 for (int i=1; i<num_dims; ++i) {
617 dst_slice_size *= dst_dimensions[i];
618 src_slice_size *= src_dimensions[i];
619 }
620
621 size_t dst_off = dst_offsets[0] * dst_slice_size;
622 size_t src_off = src_offsets[0] * src_slice_size;
623 for (size_t i=0; i<volume[0]; ++i) {
624 rc = omp_target_memcpy_rect((char *) dst + dst_off + dst_slice_size * i,
625 (char *) src + src_off + src_slice_size * i, element_size,
626 num_dims - 1, volume + 1, dst_offsets + 1, src_offsets + 1,
627 dst_dimensions + 1, src_dimensions + 1, dst_device, src_device);
628
629 if (rc) {
630 DP("Recursive call to omp_target_memcpy_rect returns unsuccessfully\n");
631 return rc;
632 }
633 }
634 }
635
636 DP("omp_target_memcpy_rect returns %d\n", rc);
637 return rc;
638}
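// Worked example (illustrative; dst, src, dst_dev and src_dev are assumed to be
// set up as in the previous sketch): copying a 2 x 3 sub-block out of a host
// 4 x 8 matrix of doubles into the top-left corner of a 2 x 3 destination.
// With num_dims == 2 the code above computes the per-row "slice" sizes from the
// inner dimensions and issues one 1-D omp_target_memcpy per row of the volume.
//
//   size_t volume[2]      = {2, 3}; // rows, columns to copy
//   size_t dst_offsets[2] = {0, 0};
//   size_t src_offsets[2] = {1, 2}; // start at row 1, column 2 of the source
//   size_t dst_dims[2]    = {2, 3};
//   size_t src_dims[2]    = {4, 8};
//   omp_target_memcpy_rect(dst, src, sizeof(double), 2, volume, dst_offsets,
//                          src_offsets, dst_dims, src_dims, dst_dev, src_dev);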
639
640EXTERN int omp_target_associate_ptr(void *host_ptr, void *device_ptr,
641 size_t size, size_t device_offset, int device_num) {
642 DP("Call to omp_target_associate_ptr with host_ptr " DPxMOD ", "
643 "device_ptr " DPxMOD ", size %zu, device_offset %zu, device_num %d\n",
644 DPxPTR(host_ptr), DPxPTR(device_ptr), size, device_offset, device_num);
645
646 if (!host_ptr || !device_ptr || size <= 0) {
647 DP("Call to omp_target_associate_ptr with invalid arguments\n");
648 return OFFLOAD_FAIL;
649 }
650
651 if (device_num == omp_get_initial_device()) {
652 DP("omp_target_associate_ptr: no association possible on the host\n");
653 return OFFLOAD_FAIL;
654 }
655
656 if (!device_is_ready(device_num)) {
657 DP("omp_target_associate_ptr returns OFFLOAD_FAIL\n");
658 return OFFLOAD_FAIL;
659 }
660
661 DeviceTy& Device = Devices[device_num];
662 void *device_addr = (void *)((uint64_t)device_ptr + (uint64_t)device_offset);
663 int rc = Device.associatePtr(host_ptr, device_addr, size);
664 DP("omp_target_associate_ptr returns %d\n", rc);
665 return rc;
666}
667
668EXTERN int omp_target_disassociate_ptr(void *host_ptr, int device_num) {
669 DP("Call to omp_target_disassociate_ptr with host_ptr " DPxMOD ", "
670 "device_num %d\n", DPxPTR(host_ptr), device_num);
671
672 if (!host_ptr) {
673 DP("Call to omp_target_associate_ptr with invalid host_ptr\n");
674 return OFFLOAD_FAIL;
675 }
676
677 if (device_num == omp_get_initial_device()) {
678 DP("omp_target_disassociate_ptr: no association possible on the host\n");
679 return OFFLOAD_FAIL;
680 }
681
682 if (!device_is_ready(device_num)) {
683 DP("omp_target_disassociate_ptr returns OFFLOAD_FAIL\n");
684 return OFFLOAD_FAIL;
685 }
686
687 DeviceTy& Device = Devices[device_num];
688 int rc = Device.disassociatePtr(host_ptr);
689 DP("omp_target_disassociate_ptr returns %d\n", rc);
690 return rc;
691}
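// Usage sketch (illustrative): pre-associating a device allocation with a host
// pointer so that later map clauses reuse it instead of allocating, then
// undoing the association before freeing both sides.
//
//   void example_associate(int dev, size_t bytes) {
//     void *hst = malloc(bytes);
//     void *tgt = omp_target_alloc(bytes, dev);
//     if (hst && tgt &&
//         omp_target_associate_ptr(hst, tgt, bytes, /*device_offset=*/0, dev) ==
//             OFFLOAD_SUCCESS) {
//       // ... use "hst" in map clauses; no extra device allocation happens ...
//       omp_target_disassociate_ptr(hst, dev);
//     }
//     omp_target_free(tgt, dev);
//     free(hst);
//   }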
692
693////////////////////////////////////////////////////////////////////////////////
694// functionality for device
695
696int DeviceTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size) {
697 DataMapMtx.lock();
698
699 // Check if entry exists
700 for (auto &HT : HostDataToTargetMap) {
701 if ((uintptr_t)HstPtrBegin == HT.HstPtrBegin) {
702 // Mapping already exists
703 bool isValid = HT.HstPtrBegin == (uintptr_t) HstPtrBegin &&
704 HT.HstPtrEnd == (uintptr_t) HstPtrBegin + Size &&
705 HT.TgtPtrBegin == (uintptr_t) TgtPtrBegin;
706 DataMapMtx.unlock();
707 if (isValid) {
708 DP("Attempt to re-associate the same device ptr+offset with the same "
709 "host ptr, nothing to do\n");
710 return OFFLOAD_SUCCESS;
711 } else {
712 DP("Not allowed to re-associate a different device ptr+offset with the "
713 "same host ptr\n");
714 return OFFLOAD_FAIL;
715 }
716 }
717 }
718
719 // Mapping does not exist, allocate it
720 HostDataToTargetTy newEntry;
721
722 // Set up missing fields
723 newEntry.HstPtrBase = (uintptr_t) HstPtrBegin;
724 newEntry.HstPtrBegin = (uintptr_t) HstPtrBegin;
725 newEntry.HstPtrEnd = (uintptr_t) HstPtrBegin + Size;
726 newEntry.TgtPtrBegin = (uintptr_t) TgtPtrBegin;
727 // refCount must be infinite
728 newEntry.RefCount = INF_REF_CNT;
729
730 DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", HstEnd="
731 DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(newEntry.HstPtrBase),
732 DPxPTR(newEntry.HstPtrBegin), DPxPTR(newEntry.HstPtrEnd),
733 DPxPTR(newEntry.TgtPtrBegin));
734 HostDataToTargetMap.push_front(newEntry);
735
736 DataMapMtx.unlock();
737
738 return OFFLOAD_SUCCESS;
739}
740
741int DeviceTy::disassociatePtr(void *HstPtrBegin) {
742 DataMapMtx.lock();
743
744 // Check if entry exists
745 for (HostDataToTargetListTy::iterator ii = HostDataToTargetMap.begin();
746 ii != HostDataToTargetMap.end(); ++ii) {
747 if ((uintptr_t)HstPtrBegin == ii->HstPtrBegin) {
748 // Mapping exists
749 if (CONSIDERED_INF(ii->RefCount)) {
750 DP("Association found, removing it\n");
751 HostDataToTargetMap.erase(ii);
752 DataMapMtx.unlock();
753 return OFFLOAD_SUCCESS;
754 } else {
755 DP("Trying to disassociate a pointer which was not mapped via "
756 "omp_target_associate_ptr\n");
757 break;
758 }
759 }
760 }
761
762 // Mapping not found
763 DataMapMtx.unlock();
764 DP("Association not found\n");
765 return OFFLOAD_FAIL;
766}
767
768// Get ref count of map entry containing HstPtrBegin
769long DeviceTy::getMapEntryRefCnt(void *HstPtrBegin) {
770 uintptr_t hp = (uintptr_t)HstPtrBegin;
771 long RefCnt = -1;
772
773 DataMapMtx.lock();
774 for (auto &HT : HostDataToTargetMap) {
775 if (hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd) {
776 DP("DeviceTy::getMapEntry: requested entry found\n");
777 RefCnt = HT.RefCount;
778 break;
779 }
780 }
781 DataMapMtx.unlock();
782
783 if (RefCnt < 0) {
784 DP("DeviceTy::getMapEntry: requested entry not found\n");
785 }
786
787 return RefCnt;
788}
789
790LookupResult DeviceTy::lookupMapping(void *HstPtrBegin, int64_t Size) {
791 uintptr_t hp = (uintptr_t)HstPtrBegin;
792 LookupResult lr;
793
794 DP("Looking up mapping(HstPtrBegin=" DPxMOD ", Size=%ld)...\n", DPxPTR(hp),
795 Size);
796 for (lr.Entry = HostDataToTargetMap.begin();
797 lr.Entry != HostDataToTargetMap.end(); ++lr.Entry) {
798 auto &HT = *lr.Entry;
799 // Is it contained?
800 lr.Flags.IsContained = hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd &&
801 (hp+Size) <= HT.HstPtrEnd;
802 // Does it extend into an already mapped region?
803 lr.Flags.ExtendsBefore = hp < HT.HstPtrBegin && (hp+Size) > HT.HstPtrBegin;
804 // Does it extend beyond the mapped region?
805 lr.Flags.ExtendsAfter = hp < HT.HstPtrEnd && (hp+Size) > HT.HstPtrEnd;
806
807 if (lr.Flags.IsContained || lr.Flags.ExtendsBefore ||
808 lr.Flags.ExtendsAfter) {
809 break;
810 }
811 }
812
813 if (lr.Flags.ExtendsBefore) {
814 DP("WARNING: Pointer is not mapped but section extends into already "
815 "mapped data\n");
816 }
817 if (lr.Flags.ExtendsAfter) {
818 DP("WARNING: Pointer is already mapped but section extends beyond mapped "
819 "region\n");
820 }
821
822 return lr;
823}
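// Worked example (illustrative): with one entry already mapping host addresses
// [0x1000, 0x1100), lookupMapping classifies a queried [begin, begin + Size)
// interval as follows:
//
//   lookupMapping((void *)0x1040, 0x20); // Flags.IsContained   == 1
//   lookupMapping((void *)0x0ff0, 0x20); // Flags.ExtendsBefore == 1
//   lookupMapping((void *)0x10f0, 0x20); // Flags.ExtendsAfter  == 1
//   lookupMapping((void *)0x2000, 0x20); // no flags set, Entry == end()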
824
825// Used by target_data_begin
826// Return the target pointer begin (where the data will be moved).
827 // Allocate memory if this is the first occurrence of this mapping.
828// Increment the reference counter.
829// If NULL is returned, then either data allocation failed or the user tried
830// to do an illegal mapping.
831void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase,
832 int64_t Size, bool &IsNew, bool IsImplicit, bool UpdateRefCount) {
833 void *rc = NULL;
834 DataMapMtx.lock();
835 LookupResult lr = lookupMapping(HstPtrBegin, Size);
836
837 // Check if the pointer is contained.
838 if (lr.Flags.IsContained ||
839 ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && IsImplicit)) {
840 auto &HT = *lr.Entry;
841 IsNew = false;
842
843 if (UpdateRefCount)
844 ++HT.RefCount;
845
846 uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin);
847 DP("Mapping exists%s with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", "
848 "Size=%ld,%s RefCount=%s\n", (IsImplicit ? " (implicit)" : ""),
849 DPxPTR(HstPtrBegin), DPxPTR(tp), Size,
850 (UpdateRefCount ? " updated" : ""),
851 (CONSIDERED_INF(HT.RefCount)) ? "INF" :
852 std::to_string(HT.RefCount).c_str());
853 rc = (void *)tp;
854 } else if ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && !IsImplicit) {
855 // Explicit extension of mapped data - not allowed.
856 DP("Explicit extension of mapping is not allowed.\n");
857 } else if (Size) {
858 // If it is not contained and Size > 0 we should create a new entry for it.
859 IsNew = true;
860 uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size);
861 DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", "
862 "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase),
863 DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp));
864 HostDataToTargetMap.push_front(HostDataToTargetTy((uintptr_t)HstPtrBase,
865 (uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, tp));
866 rc = (void *)tp;
867 }
868
869 DataMapMtx.unlock();
870 return rc;
871}
872
873// Used by target_data_begin, target_data_end, target_data_update and target.
874// Return the target pointer begin (where the data will be moved).
875// Decrement the reference counter if called from target_data_end.
876void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
877 bool UpdateRefCount) {
878 void *rc = NULL;
879 DataMapMtx.lock();
880 LookupResult lr = lookupMapping(HstPtrBegin, Size);
881
882 if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) {
883 auto &HT = *lr.Entry;
884 IsLast = !(HT.RefCount > 1);
885
886 if (HT.RefCount > 1 && UpdateRefCount)
887 --HT.RefCount;
888
889 uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin);
890 DP("Mapping exists with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", "
891 "Size=%ld,%s RefCount=%s\n", DPxPTR(HstPtrBegin), DPxPTR(tp), Size,
892 (UpdateRefCount ? " updated" : ""),
893 (CONSIDERED_INF(HT.RefCount)) ? "INF" :
894 std::to_string(HT.RefCount).c_str());
895 rc = (void *)tp;
896 } else {
897 IsLast = false;
898 }
899
900 DataMapMtx.unlock();
901 return rc;
902}
903
904// Return the target pointer begin (where the data will be moved).
905// Lock-free version called from within assertions.
906void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size) {
907 uintptr_t hp = (uintptr_t)HstPtrBegin;
908 LookupResult lr = lookupMapping(HstPtrBegin, Size);
909 if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) {
910 auto &HT = *lr.Entry;
911 uintptr_t tp = HT.TgtPtrBegin + (hp - HT.HstPtrBegin);
912 return (void *)tp;
913 }
914
915 return NULL;
916}
917
918int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete) {
919 // Check whether the pointer is contained in an existing mapping.
920 int rc;
921 DataMapMtx.lock();
922 LookupResult lr = lookupMapping(HstPtrBegin, Size);
923 if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) {
924 auto &HT = *lr.Entry;
925 if (ForceDelete)
926 HT.RefCount = 1;
927 if (--HT.RefCount <= 0) {
928 assert(HT.RefCount == 0 && "did not expect a negative ref count");
929 DP("Deleting tgt data " DPxMOD " of size %ld\n",
930 DPxPTR(HT.TgtPtrBegin), Size);
931 RTL->data_delete(RTLDeviceID, (void *)HT.TgtPtrBegin);
932 DP("Removing%s mapping with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD
933 ", Size=%ld\n", (ForceDelete ? " (forced)" : ""),
934 DPxPTR(HT.HstPtrBegin), DPxPTR(HT.TgtPtrBegin), Size);
935 HostDataToTargetMap.erase(lr.Entry);
936 }
937 rc = OFFLOAD_SUCCESS;
938 } else {
939 DP("Section to delete (hst addr " DPxMOD ") does not exist in the allocated"
940 " memory\n", DPxPTR(HstPtrBegin));
941 rc = OFFLOAD_FAIL;
942 }
943
944 DataMapMtx.unlock();
945 return rc;
946}
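// Mapping-lifetime sketch (illustrative): nested "target data" regions over the
// same host array share one device allocation. getOrAllocTgtPtr() bumps the
// reference count on entry, deallocTgtPtr() drops it on exit, and the device
// buffer is released only when the outermost region ends.
//
//   double a[1024];
//   #pragma omp target data map(tofrom: a)  // RefCount 0 -> 1: allocate + copy in
//   {
//     #pragma omp target data map(to: a)    // RefCount 1 -> 2: entry reused
//     {
//       // kernels using "a"
//     }                                     // RefCount 2 -> 1: no copy back yet
//   }                                       // RefCount 1 -> 0: copy back + free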
947
948/// Init device, should not be called directly.
949void DeviceTy::init() {
950 int32_t rc = RTL->init_device(RTLDeviceID);
951 if (rc == OFFLOAD_SUCCESS) {
952 IsInit = true;
953 }
954}
955
956/// Thread-safe method to initialize the device only once.
957int32_t DeviceTy::initOnce() {
958 std::call_once(InitFlag, &DeviceTy::init, this);
959
960 // At this point, if IsInit is true, then either this thread or some other
961 // thread in the past successfully initialized the device, so we can return
962 // OFFLOAD_SUCCESS. If this thread executed init() via call_once() and it
963 // failed, return OFFLOAD_FAIL. If call_once did not invoke init(), it means
964 // that some other thread already attempted to execute init() and if IsInit
965 // is still false, return OFFLOAD_FAIL.
966 if (IsInit)
967 return OFFLOAD_SUCCESS;
968 else
969 return OFFLOAD_FAIL;
970}
971
972// Load binary to device.
973__tgt_target_table *DeviceTy::load_binary(void *Img) {
974 RTL->Mtx.lock();
975 __tgt_target_table *rc = RTL->load_binary(RTLDeviceID, Img);
976 RTL->Mtx.unlock();
977 return rc;
978}
979
980// Submit data to device.
981int32_t DeviceTy::data_submit(void *TgtPtrBegin, void *HstPtrBegin,
982 int64_t Size) {
983 return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size);
984}
985
986// Retrieve data from device.
987int32_t DeviceTy::data_retrieve(void *HstPtrBegin, void *TgtPtrBegin,
988 int64_t Size) {
989 return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size);
990}
991
992// Run region on device
993int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr,
994 int32_t TgtVarsSize) {
995 return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtVarsSize);
996}
997
998// Run team region on device.
999int32_t DeviceTy::run_team_region(void *TgtEntryPtr, void **TgtVarsPtr,
1000 int32_t TgtVarsSize, int32_t NumTeams, int32_t ThreadLimit,
1001 uint64_t LoopTripCount) {
1002 return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtVarsSize,
1003 NumTeams, ThreadLimit, LoopTripCount);
1004}
1005
1006////////////////////////////////////////////////////////////////////////////////
1007// Functionality for registering libs
1008
1009static void RegisterImageIntoTranslationTable(TranslationTable &TT,
1010 RTLInfoTy &RTL, __tgt_device_image *image) {
1011
1012 // Both vectors must have the same size: whenever one grows, so does the other.
1013 assert(TT.TargetsTable.size() == TT.TargetsImages.size() &&
1014 "We should have as many images as we have tables!");
1015
1016 // Resize the Targets Table and Images to accommodate the new targets if
1017 // required
1018 unsigned TargetsTableMinimumSize = RTL.Idx + RTL.NumberOfDevices;
1019
1020 if (TT.TargetsTable.size() < TargetsTableMinimumSize) {
1021 TT.TargetsImages.resize(TargetsTableMinimumSize, 0);
1022 TT.TargetsTable.resize(TargetsTableMinimumSize, 0);
1023 }
1024
1025 // Register the image in all devices for this target type.
1026 for (int32_t i = 0; i < RTL.NumberOfDevices; ++i) {
1027 // If we are changing the image we are also invalidating the target table.
1028 if (TT.TargetsImages[RTL.Idx + i] != image) {
1029 TT.TargetsImages[RTL.Idx + i] = image;
1030 TT.TargetsTable[RTL.Idx + i] = 0; // lazy initialization of target table.
1031 }
1032 }
1033}
1034
1035////////////////////////////////////////////////////////////////////////////////
1036// Functionality for registering Ctors/Dtors
1037
1038static void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc,
1039 __tgt_device_image *img, RTLInfoTy *RTL) {
1040
1041 for (int32_t i = 0; i < RTL->NumberOfDevices; ++i) {
1042 DeviceTy &Device = Devices[RTL->Idx + i];
1043 Device.PendingGlobalsMtx.lock();
1044 Device.HasPendingGlobals = true;
1045 for (__tgt_offload_entry *entry = img->EntriesBegin;
1046 entry != img->EntriesEnd; ++entry) {
1047 if (entry->flags & OMP_DECLARE_TARGET_CTOR) {
1048 DP("Adding ctor " DPxMOD " to the pending list.\n",
1049 DPxPTR(entry->addr));
1050 Device.PendingCtorsDtors[desc].PendingCtors.push_back(entry->addr);
1051 } else if (entry->flags & OMP_DECLARE_TARGET_DTOR) {
1052 // Dtors are pushed in reverse order so they are executed from end
1053 // to beginning when unregistering the library!
1054 DP("Adding dtor " DPxMOD " to the pending list.\n",
1055 DPxPTR(entry->addr));
1056 Device.PendingCtorsDtors[desc].PendingDtors.push_front(entry->addr);
1057 }
1058
1059 if (entry->flags & OMP_DECLARE_TARGET_LINK) {
1060 DP("The \"link\" attribute is not yet supported!\n");
1061 }
1062 }
1063 Device.PendingGlobalsMtx.unlock();
1064 }
1065}
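// Illustrative origin of a pending ctor (an assumption about typical compiler
// output, not taken from this file): a global with a non-trivial constructor
// inside a declare-target region produces an offload entry flagged
// OMP_DECLARE_TARGET_CTOR, which the loop above queues in PendingCtors.
//
//   #pragma omp declare target
//   SomeClass global_obj; // its device-side constructor becomes a pending ctor
//   #pragma omp end declare target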
1066
1067////////////////////////////////////////////////////////////////////////////////
1068/// adds a target shared library to the target execution image
1069EXTERN void __tgt_register_lib(__tgt_bin_desc *desc) {
1070
1071 // Attempt to load all plugins available in the system.
1072 RTLs.LoadRTLsOnce();
1073
1074 RTLsMtx.lock();
1075 // Register the images with the RTLs that understand them, if any.
1076 for (int32_t i = 0; i < desc->NumDeviceImages; ++i) {
1077 // Obtain the image.
1078 __tgt_device_image *img = &desc->DeviceImages[i];
1079
1080 RTLInfoTy *FoundRTL = NULL;
1081
1082 // Scan the available RTLs until we find one that supports
1083 // the current image.
1084 for (auto &R : RTLs.AllRTLs) {
1085 if (!R.is_valid_binary(img)) {
1086 DP("Image " DPxMOD " is NOT compatible with RTL %s!\n",
1087 DPxPTR(img->ImageStart), R.RTLName.c_str());
1088 continue;
1089 }
1090
1091 DP("Image " DPxMOD " is compatible with RTL %s!\n",
1092 DPxPTR(img->ImageStart), R.RTLName.c_str());
1093
1094 // If this RTL is not already in use, initialize it.
1095 if (!R.isUsed) {
1096 // Initialize the device information for the RTL we are about to use.
1097 DeviceTy device(&R);
1098
1099 size_t start = Devices.size();
1100 Devices.resize(start + R.NumberOfDevices, device);
1101 for (int32_t device_id = 0; device_id < R.NumberOfDevices;
1102 device_id++) {
1103 // global device ID
1104 Devices[start + device_id].DeviceID = start + device_id;
1105 // RTL local device ID
1106 Devices[start + device_id].RTLDeviceID = device_id;
1107
1108 // Save pointer to device in RTL in case we want to unregister the RTL
1109 R.Devices.push_back(&Devices[start + device_id]);
1110 }
1111
1112 // Initialize the index of this RTL and save it in the used RTLs.
1113 R.Idx = (RTLs.UsedRTLs.empty())
1114 ? 0
1115 : RTLs.UsedRTLs.back()->Idx +
1116 RTLs.UsedRTLs.back()->NumberOfDevices;
1117 assert((size_t) R.Idx == start &&
1118 "RTL index should equal the number of devices used so far.");
1119 R.isUsed = true;
1120 RTLs.UsedRTLs.push_back(&R);
1121
1122 DP("RTL " DPxMOD " has index %d!\n", DPxPTR(R.LibraryHandler), R.Idx);
1123 }
1124
1125 // Initialize (if necessary) translation table for this library.
1126 TrlTblMtx.lock();
1127 if(!HostEntriesBeginToTransTable.count(desc->HostEntriesBegin)){
1128 TranslationTable &tt =
1129 HostEntriesBeginToTransTable[desc->HostEntriesBegin];
1130 tt.HostTable.EntriesBegin = desc->HostEntriesBegin;
1131 tt.HostTable.EntriesEnd = desc->HostEntriesEnd;
1132 }
1133
1134 // Retrieve translation table for this library.
1135 TranslationTable &TransTable =
1136 HostEntriesBeginToTransTable[desc->HostEntriesBegin];
1137
1138 DP("Registering image " DPxMOD " with RTL %s!\n",
1139 DPxPTR(img->ImageStart), R.RTLName.c_str());
1140 RegisterImageIntoTranslationTable(TransTable, R, img);
1141 TrlTblMtx.unlock();
1142 FoundRTL = &R;
1143
1144 // Load ctors/dtors for static objects
1145 RegisterGlobalCtorsDtorsForImage(desc, img, FoundRTL);
1146
1147 // if an RTL was found we are done - proceed to register the next image
1148 break;
1149 }
1150
1151 if (!FoundRTL) {
1152 DP("No RTL found for image " DPxMOD "!\n", DPxPTR(img->ImageStart));
1153 }
1154 }
1155 RTLsMtx.unlock();
1156
1157
1158 DP("Done registering entries!\n");
1159}
1160
1161////////////////////////////////////////////////////////////////////////////////
1162/// unloads a target shared library
1163EXTERN void __tgt_unregister_lib(__tgt_bin_desc *desc) {
1164 DP("Unloading target library!\n");
1165
1166 RTLsMtx.lock();
1167 // Find which RTL understands each image, if any.
1168 for (int32_t i = 0; i < desc->NumDeviceImages; ++i) {
1169 // Obtain the image.
1170 __tgt_device_image *img = &desc->DeviceImages[i];
1171
1172 RTLInfoTy *FoundRTL = NULL;
1173
1174 // Scan the RTLs that have associated images until we find one that supports
1175 // the current image. We only need to scan RTLs that are already being used.
1176 for (auto *R : RTLs.UsedRTLs) {
1177
1178 assert(R->isUsed && "Expecting used RTLs.");
1179
1180 if (!R->is_valid_binary(img)) {
1181 DP("Image " DPxMOD " is NOT compatible with RTL " DPxMOD "!\n",
1182 DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler));
1183 continue;
1184 }
1185
1186 DP("Image " DPxMOD " is compatible with RTL " DPxMOD "!\n",
1187 DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler));
1188
1189 FoundRTL = R;
1190
1191 // Execute dtors for static objects if the device has been used, i.e.
1192 // if its PendingCtors list has been emptied.
1193 for (int32_t i = 0; i < FoundRTL->NumberOfDevices; ++i) {
1194 DeviceTy &Device = Devices[FoundRTL->Idx + i];
1195 Device.PendingGlobalsMtx.lock();
1196 if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) {
1197 for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) {
1198 int rc = target(Device.DeviceID, dtor, 0, NULL, NULL, NULL, NULL, 1,
1199 1, true /*team*/);
1200 if (rc != OFFLOAD_SUCCESS) {
1201 DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor));
1202 }
1203 }
1204 // Remove this library's entry from PendingCtorsDtors
1205 Device.PendingCtorsDtors.erase(desc);
1206 }
1207 Device.PendingGlobalsMtx.unlock();
1208 }
1209
1210 DP("Unregistered image " DPxMOD " from RTL " DPxMOD "!\n",
1211 DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler));
1212
1213 break;
1214 }
1215
1216 // if no RTL was found proceed to unregister the next image
1217 if (!FoundRTL){
1218 DP("No RTLs in use support the image " DPxMOD "!\n",
1219 DPxPTR(img->ImageStart));
1220 }
1221 }
1222 RTLsMtx.unlock();
1223 DP("Done unregistering images!\n");
1224
1225 // Remove entries from HostPtrToTableMap
1226 TblMapMtx.lock();
1227 for (__tgt_offload_entry *cur = desc->HostEntriesBegin;
1228 cur < desc->HostEntriesEnd; ++cur) {
1229 HostPtrToTableMap.erase(cur->addr);
1230 }
1231
1232 // Remove translation table for this descriptor.
1233 auto tt = HostEntriesBeginToTransTable.find(desc->HostEntriesBegin);
1234 if (tt != HostEntriesBeginToTransTable.end()) {
1235 DP("Removing translation table for descriptor " DPxMOD "\n",
1236 DPxPTR(desc->HostEntriesBegin));
1237 HostEntriesBeginToTransTable.erase(tt);
1238 } else {
1239 DP("Translation table for descriptor " DPxMOD " cannot be found, probably "
1240 "it has been already removed.\n", DPxPTR(desc->HostEntriesBegin));
1241 }
1242
1243 TblMapMtx.unlock();
1244
1245 // TODO: Remove RTL and the devices it manages if it's not used anymore?
1246 // TODO: Write some RTL->unload_image(...) function?
1247
1248 DP("Done unregistering library!\n");
1249}
1250
1251/// Map global data and execute pending ctors
1252static int InitLibrary(DeviceTy& Device) {
1253 /*
1254 * Map global data
1255 */
1256 int32_t device_id = Device.DeviceID;
1257 int rc = OFFLOAD_SUCCESS;
1258
1259 Device.PendingGlobalsMtx.lock();
1260 TrlTblMtx.lock();
1261 for (HostEntriesBeginToTransTableTy::iterator
1262 ii = HostEntriesBeginToTransTable.begin();
1263 ii != HostEntriesBeginToTransTable.end(); ++ii) {
1264 TranslationTable *TransTable = &ii->second;
1265 if (TransTable->TargetsTable[device_id] != 0) {
1266 // Library entries have already been processed
1267 continue;
1268 }
1269
1270 // 1) get image.
1271 assert(TransTable->TargetsImages.size() > (size_t)device_id &&
1272 "Not expecting a device ID outside the table's bounds!");
1273 __tgt_device_image *img = TransTable->TargetsImages[device_id];
1274 if (!img) {
1275 DP("No image loaded for device id %d.\n", device_id);
1276 rc = OFFLOAD_FAIL;
1277 break;
1278 }
1279 // 2) load image into the target table.
1280 __tgt_target_table *TargetTable =
1281 TransTable->TargetsTable[device_id] = Device.load_binary(img);
1282 // Unable to get table for this image: invalidate image and fail.
1283 if (!TargetTable) {
1284 DP("Unable to generate entries table for device id %d.\n", device_id);
1285 TransTable->TargetsImages[device_id] = 0;
1286 rc = OFFLOAD_FAIL;
1287 break;
1288 }
1289
1290 // Verify whether the two table sizes match.
1291 size_t hsize =
1292 TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
1293 size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;
1294
1295 // Invalid image for these host entries!
1296 if (hsize != tsize) {
1297 DP("Host and Target tables mismatch for device id %d [%zx != %zx].\n",
1298 device_id, hsize, tsize);
1299 TransTable->TargetsImages[device_id] = 0;
1300 TransTable->TargetsTable[device_id] = 0;
1301 rc = OFFLOAD_FAIL;
1302 break;
1303 }
1304
1305 // process global data that needs to be mapped.
1306 __tgt_target_table *HostTable = &TransTable->HostTable;
1307 for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
1308 *CurrHostEntry = HostTable->EntriesBegin,
1309 *EntryDeviceEnd = TargetTable->EntriesEnd;
1310 CurrDeviceEntry != EntryDeviceEnd;
1311 CurrDeviceEntry++, CurrHostEntry++) {
1312 if (CurrDeviceEntry->size != 0) {
1313 // has data.
1314 assert(CurrDeviceEntry->size == CurrHostEntry->size &&
1315 "data size mismatch");
1316
1317 // Fortran may use multiple weak declarations for the same symbol,
1318 // therefore we must allow for multiple weak symbols to be loaded from
1319 // the fat binary. Treat these mappings as any other "regular" mapping.
1320 // Add entry to map.
1321 DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu"
1322 "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
1323 CurrDeviceEntry->size);
1324 bool IsNew; //unused
1325 Device.getOrAllocTgtPtr(CurrHostEntry->addr /*HstPtrBegin*/,
1326 CurrHostEntry->addr /*HstPtrBase*/, CurrHostEntry->size /*Size*/,
1327 IsNew, false /*IsImplicit*/, true /*UpdateRefCount*/);
1328 }
1329 }
1330 }
1331 TrlTblMtx.unlock();
1332
1333 if (rc != OFFLOAD_SUCCESS) {
1334 Device.PendingGlobalsMtx.unlock();
1335 return rc;
1336 }
1337
1338 /*
1339 * Run ctors for static objects
1340 */
1341 if (!Device.PendingCtorsDtors.empty()) {
1342 // Call all ctors for all libraries registered so far
1343 for (auto &lib : Device.PendingCtorsDtors) {
1344 if (!lib.second.PendingCtors.empty()) {
1345 DP("Has pending ctors... call now\n");
1346 for (auto &entry : lib.second.PendingCtors) {
1347 void *ctor = entry;
1348 int rc = target(device_id, ctor, 0, NULL, NULL, NULL,
1349 NULL, 1, 1, true /*team*/);
1350 if (rc != OFFLOAD_SUCCESS) {
1351 DP("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
1352 Device.PendingGlobalsMtx.unlock();
1353 return OFFLOAD_FAIL;
1354 }
1355 }
1356 // Clear the list to indicate that this device has been used
1357 lib.second.PendingCtors.clear();
1358 DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
1359 }
1360 }
1361 }
1362 Device.HasPendingGlobals = false;
1363 Device.PendingGlobalsMtx.unlock();
1364
1365 return OFFLOAD_SUCCESS;
1366}
1367
1368// Check whether a device has been initialized, global ctors have been
1369// executed and global data has been mapped; do so if not already done.
1370static int CheckDevice(int32_t device_id) {
1371 // Is device ready?
1372 if (!device_is_ready(device_id)) {
1373 DP("Device %d is not ready.\n", device_id);
1374 return OFFLOAD_FAIL;
1375 }
1376
1377 // Get device info.
1378 DeviceTy &Device = Devices[device_id];
1379
1380 // Check whether global data has been mapped for this device
1381 Device.PendingGlobalsMtx.lock();
1382 bool hasPendingGlobals = Device.HasPendingGlobals;
1383 Device.PendingGlobalsMtx.unlock();
1384 if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) {
1385 DP("Failed to init globals on device %d\n", device_id);
1386 return OFFLOAD_FAIL;
1387 }
1388
1389 return OFFLOAD_SUCCESS;
1390}
1391
1392// Following datatypes and functions (tgt_oldmap_type, combined_entry_t,
1393// translate_map, cleanup_map) will be removed once the compiler starts using
1394// the new map types.
1395
1396// Old map types
1397enum tgt_oldmap_type {
1398 OMP_TGT_OLDMAPTYPE_TO = 0x001, // copy data from host to device
1399 OMP_TGT_OLDMAPTYPE_FROM = 0x002, // copy data from device to host
1400 OMP_TGT_OLDMAPTYPE_ALWAYS = 0x004, // copy regardless of the ref. count
1401 OMP_TGT_OLDMAPTYPE_DELETE = 0x008, // force unmapping of data
1402 OMP_TGT_OLDMAPTYPE_MAP_PTR = 0x010, // map pointer as well as pointee
1403 OMP_TGT_OLDMAPTYPE_FIRST_MAP = 0x020, // first occurrence of mapped variable
1404 OMP_TGT_OLDMAPTYPE_RETURN_PTR = 0x040, // return TgtBase addr of mapped data
1405 OMP_TGT_OLDMAPTYPE_PRIVATE_PTR = 0x080, // private variable - not mapped
1406 OMP_TGT_OLDMAPTYPE_PRIVATE_VAL = 0x100 // copy by value - not mapped
1407};
1408
1409// Temporary functions for map translation and cleanup
1410struct combined_entry_t {
1411 int num_members; // number of members in combined entry
1412 void *base_addr; // base address of combined entry
1413 void *begin_addr; // begin address of combined entry
1414 void *end_addr; // end address of combined entry
1415};
1416
1417static void translate_map(int32_t arg_num, void **args_base, void **args,
1418 int64_t *arg_sizes, int32_t *arg_types, int32_t &new_arg_num,
1419 void **&new_args_base, void **&new_args, int64_t *&new_arg_sizes,
1420 int64_t *&new_arg_types, bool is_target_construct) {
1421 if (arg_num <= 0) {
1422 DP("Nothing to translate\n");
1423 new_arg_num = 0;
1424 return;
1425 }
1426
1427 // array of combined entries
1428 combined_entry_t *cmb_entries =
1429 (combined_entry_t *) alloca(arg_num * sizeof(combined_entry_t));
1430 // number of combined entries
1431 long num_combined = 0;
1432 // old entry is MAP_PTR?
1433 bool *is_ptr_old = (bool *) alloca(arg_num * sizeof(bool));
1434 // old entry is member of member_of[old] cmb_entry
1435 int *member_of = (int *) alloca(arg_num * sizeof(int));
1436 // temporary storage for modifications of the original arg_types
1437 int32_t *mod_arg_types = (int32_t *) alloca(arg_num *sizeof(int32_t));
1438
1439 DP("Translating %d map entries\n", arg_num);
1440 for (int i = 0; i < arg_num; ++i) {
1441 member_of[i] = -1;
1442 is_ptr_old[i] = false;
1443 mod_arg_types[i] = arg_types[i];
1444 // Scan previous entries to see whether this entry shares the same base
1445 for (int j = 0; j < i; ++j) {
1446 void *new_begin_addr = NULL;
1447 void *new_end_addr = NULL;
1448
1449 if (mod_arg_types[i] & OMP_TGT_OLDMAPTYPE_MAP_PTR) {
George Rokos2467df62017-01-25 21:27:24 +00001450 if (args_base[i] == args[j]) {
1451 if (!(mod_arg_types[j] & OMP_TGT_OLDMAPTYPE_MAP_PTR)) {
1452 DP("Entry %d has the same base as entry %d's begin address\n", i,
1453 j);
1454 new_begin_addr = args_base[i];
1455 new_end_addr = (char *)args_base[i] + sizeof(void *);
1456 assert(arg_sizes[j] == sizeof(void *));
1457 is_ptr_old[j] = true;
1458 } else {
1459 DP("Entry %d has the same base as entry %d's begin address, but "
1460 "%d's base was a MAP_PTR too\n", i, j, j);
1461 int32_t to_from_always_delete =
1462 OMP_TGT_OLDMAPTYPE_TO | OMP_TGT_OLDMAPTYPE_FROM |
1463 OMP_TGT_OLDMAPTYPE_ALWAYS | OMP_TGT_OLDMAPTYPE_DELETE;
1464 if (mod_arg_types[j] & to_from_always_delete) {
1465 DP("Resetting to/from/always/delete flags for entry %d because "
1466 "it is only a pointer to pointer\n", j);
1467 mod_arg_types[j] &= ~to_from_always_delete;
1468 }
1469 }
1470 }
1471 } else {
1472 if (!(mod_arg_types[i] & OMP_TGT_OLDMAPTYPE_FIRST_MAP) &&
1473 args_base[i] == args_base[j]) {
1474 DP("Entry %d has the same base address as entry %d\n", i, j);
1475 new_begin_addr = args[i];
1476 new_end_addr = (char *)args[i] + arg_sizes[i];
1477 }
1478 }
1479
1480 // If we have combined the entry with a previous one
1481 if (new_begin_addr) {
1482 int id;
1483 if(member_of[j] == -1) {
1484 // We have a new entry
1485 id = num_combined++;
1486 DP("Creating new combined entry %d for old entry %d\n", id, j);
1487 // Initialize new entry
1488 cmb_entries[id].num_members = 1;
1489 cmb_entries[id].base_addr = args_base[j];
1490 if (mod_arg_types[j] & OMP_TGT_OLDMAPTYPE_MAP_PTR) {
1491 cmb_entries[id].begin_addr = args_base[j];
1492 cmb_entries[id].end_addr = (char *)args_base[j] + arg_sizes[j];
1493 } else {
1494 cmb_entries[id].begin_addr = args[j];
1495 cmb_entries[id].end_addr = (char *)args[j] + arg_sizes[j];
1496 }
1497 member_of[j] = id;
1498 } else {
1499 // Reuse existing combined entry
1500 DP("Reusing existing combined entry %d\n", member_of[j]);
1501 id = member_of[j];
1502 }
1503
1504 // Update combined entry
1505 DP("Adding entry %d to combined entry %d\n", i, id);
1506 cmb_entries[id].num_members++;
1507 // base_addr stays the same
1508 cmb_entries[id].begin_addr =
1509 std::min(cmb_entries[id].begin_addr, new_begin_addr);
1510 cmb_entries[id].end_addr =
1511 std::max(cmb_entries[id].end_addr, new_end_addr);
1512 member_of[i] = id;
1513 break;
1514 }
1515 }
1516 }
1517
1518 DP("New entries: %ld combined + %d original\n", num_combined, arg_num);
1519 new_arg_num = arg_num + num_combined;
1520 new_args_base = (void **) malloc(new_arg_num * sizeof(void *));
1521 new_args = (void **) malloc(new_arg_num * sizeof(void *));
1522 new_arg_sizes = (int64_t *) malloc(new_arg_num * sizeof(int64_t));
1523 new_arg_types = (int64_t *) malloc(new_arg_num * sizeof(int64_t));
1524
1525 const int64_t alignment = 8;
1526
1527 int next_id = 0; // next ID
1528 int next_cid = 0; // next combined ID
1529 int *combined_to_new_id = (int *) alloca(num_combined * sizeof(int));
1530 for (int i = 0; i < arg_num; ++i) {
1531 // It is member_of
1532 if (member_of[i] == next_cid) {
1533 int cid = next_cid++; // ID of this combined entry
1534 int nid = next_id++; // ID of the new (global) entry
1535 combined_to_new_id[cid] = nid;
1536 DP("Combined entry %3d will become new entry %3d\n", cid, nid);
1537
1538 int64_t padding = (int64_t)cmb_entries[cid].begin_addr % alignment;
1539 if (padding) {
1540 DP("Using a padding of %" PRId64 " for begin address " DPxMOD "\n",
1541 padding, DPxPTR(cmb_entries[cid].begin_addr));
1542 cmb_entries[cid].begin_addr =
1543 (char *)cmb_entries[cid].begin_addr - padding;
1544 }
1545
1546 new_args_base[nid] = cmb_entries[cid].base_addr;
1547 new_args[nid] = cmb_entries[cid].begin_addr;
1548 new_arg_sizes[nid] = (int64_t) ((char *)cmb_entries[cid].end_addr -
1549 (char *)cmb_entries[cid].begin_addr);
1550 new_arg_types[nid] = OMP_TGT_MAPTYPE_TARGET_PARAM;
1551 DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", "
1552 "size %" PRId64 ", type 0x%" PRIx64 "\n", nid,
1553 DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid],
1554 new_arg_types[nid]);
1555 } else if (member_of[i] != -1) {
1556 DP("Combined entry %3d has been encountered before, do nothing\n",
1557 member_of[i]);
1558 }
1559
1560 // Now that the combined entry (the one the old entry was a member of) has
1561 // been inserted into the new arguments list, proceed with the old entry.
1562 int nid = next_id++;
1563 DP("Old entry %3d will become new entry %3d\n", i, nid);
1564
1565 new_args_base[nid] = args_base[i];
1566 new_args[nid] = args[i];
1567 new_arg_sizes[nid] = arg_sizes[i];
George Rokos15a6e7d2017-02-15 20:45:37 +00001568 int64_t old_type = mod_arg_types[i];
George Rokos2467df62017-01-25 21:27:24 +00001569
1570 if (is_ptr_old[i]) {
1571 // Reset TO and FROM flags
1572 old_type &= ~(OMP_TGT_OLDMAPTYPE_TO | OMP_TGT_OLDMAPTYPE_FROM);
1573 }
1574
1575 if (member_of[i] == -1) {
1576 if (!is_target_construct)
1577 old_type &= ~OMP_TGT_MAPTYPE_TARGET_PARAM;
1578 new_arg_types[nid] = old_type;
1579 DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", size %" PRId64
1580 ", type 0x%" PRIx64 " (old entry %d not MEMBER_OF)\n", nid,
1581 DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid],
1582 new_arg_types[nid], i);
1583 } else {
1584 // Old entry is not FIRST_MAP
1585 old_type &= ~OMP_TGT_OLDMAPTYPE_FIRST_MAP;
1586 // Add MEMBER_OF
1587 int new_member_of = combined_to_new_id[member_of[i]];
1588 old_type |= ((int64_t)new_member_of + 1) << 48;
1589 new_arg_types[nid] = old_type;
1590 DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", size %" PRId64
1591 ", type 0x%" PRIx64 " (old entry %d MEMBER_OF %d)\n", nid,
1592 DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid],
1593 new_arg_types[nid], i, new_member_of);
1594 }
1595 }
1596}
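
// Illustrative sketch of the transformation above (hypothetical values, not
// taken from the original sources): for
//
//   struct S { double x; double y[64]; } s;
//   #pragma omp target map(tofrom: s.x, s.y)
//
// the compiler emits one map entry per member, all sharing &s as their base:
//
//   args_base = { &s, &s }   args = { &s.x, &s.y }   arg_sizes = { 8, 512 }
//
// translate_map() then emits a combined TARGET_PARAM entry just before its
// first member, spanning from the lowest begin address to the highest end
// address, and re-types each member as MEMBER_OF that combined entry, so
// target_data_begin() allocates the struct once and places the members inside
// that single device allocation.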
1597
1598static void cleanup_map(int32_t new_arg_num, void **new_args_base,
1599 void **new_args, int64_t *new_arg_sizes, int64_t *new_arg_types,
1600 int32_t arg_num, void **args_base) {
1601 if (new_arg_num > 0) {
1602 int offset = new_arg_num - arg_num;
1603 for (int32_t i = 0; i < arg_num; ++i) {
1604 // Restore old base address
1605 args_base[i] = new_args_base[i+offset];
1606 }
1607 free(new_args_base);
1608 free(new_args);
1609 free(new_arg_sizes);
1610 free(new_arg_types);
1611 }
1612}
1613
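/// Decode the MEMBER_OF field of a map type. translate_map() stores the
/// 1-based index of the parent (combined) entry shifted left by 48 bits, with
/// 0 meaning "not a member", so this returns the 0-based index of the parent
/// entry, or -1 when the entry is not a member of a combined entry.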
1614static short member_of(int64_t type) {
1615 return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
1616}
1617
1618/// Internal function to do the mapping and transfer the data to the device
1619static int target_data_begin(DeviceTy &Device, int32_t arg_num,
1620 void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
1621 // process each input.
1622 int rc = OFFLOAD_SUCCESS;
1623 for (int32_t i = 0; i < arg_num; ++i) {
1624 // Ignore private variables and arrays - there is no mapping for them.
1625 if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
1626 (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
1627 continue;
1628
1629 void *HstPtrBegin = args[i];
1630 void *HstPtrBase = args_base[i];
1631 // Address of pointer on the host and device, respectively.
1632 void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin;
1633 bool IsNew, Pointer_IsNew;
1634 bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
1635 bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF);
1636 if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
      DP("Has a pointer entry\n");
1638 // base is address of pointer.
1639 Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase,
1640 sizeof(void *), Pointer_IsNew, IsImplicit, UpdateRef);
1641 if (!Pointer_TgtPtrBegin) {
1642 DP("Call to getOrAllocTgtPtr returned null pointer (device failure or "
1643 "illegal mapping).\n");
1644 }
1645 DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
1646 "\n", sizeof(void *), DPxPTR(Pointer_TgtPtrBegin),
1647 (Pointer_IsNew ? "" : " not"));
1648 Pointer_HstPtrBegin = HstPtrBase;
1649 // modify current entry.
1650 HstPtrBase = *(void **)HstPtrBase;
1651 UpdateRef = true; // subsequently update ref count of pointee
1652 }
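    // For PTR_AND_OBJ the mapping is two-level: the pointer itself (located
    // at args_base[i]) was mapped just above, and HstPtrBase now holds the
    // pointee's host base address, so the lookup below maps the pointee data
    // described by args[i] and arg_sizes[i].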
1653
1654 void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase,
1655 arg_sizes[i], IsNew, IsImplicit, UpdateRef);
1656 if (!TgtPtrBegin && arg_sizes[i]) {
1657 // If arg_sizes[i]==0, then the argument is a pointer to NULL, so
1658 // getOrAlloc() returning NULL is not an error.
1659 DP("Call to getOrAllocTgtPtr returned null pointer (device failure or "
1660 "illegal mapping).\n");
1661 }
1662 DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
1663 " - is%s new\n", arg_sizes[i], DPxPTR(TgtPtrBegin),
1664 (IsNew ? "" : " not"));
1665
1666 if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
1667 void *ret_ptr;
1668 if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
1669 ret_ptr = Pointer_TgtPtrBegin;
1670 else {
1671 bool IsLast; // not used
1672 ret_ptr = Device.getTgtPtrBegin(HstPtrBegin, 0, IsLast, false);
1673 }
1674
1675 DP("Returning device pointer " DPxMOD "\n", DPxPTR(ret_ptr));
1676 args_base[i] = ret_ptr;
1677 }
1678
1679 if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
1680 bool copy = false;
1681 if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) {
1682 copy = true;
1683 } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) {
1684 // Copy data only if the "parent" struct has RefCount==1.
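        // A reference count of exactly 1 means the combined entry for the
        // parent was created by this same construct, i.e. the struct was not
        // already present on the device, so the member's data still needs to
        // be transferred; a higher count means an enclosing region already
        // brought it over.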
1685 short parent_idx = member_of(arg_types[i]);
1686 long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
1687 assert(parent_rc > 0 && "parent struct not found");
1688 if (parent_rc == 1) {
1689 copy = true;
1690 }
1691 }
1692
1693 if (copy) {
1694 DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
1695 arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
1696 int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]);
1697 if (rt != OFFLOAD_SUCCESS) {
1698 DP("Copying data to device failed.\n");
1699 rc = OFFLOAD_FAIL;
1700 }
1701 }
1702 }
1703
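    // For PTR_AND_OBJ, point the device copy of the pointer at the pointee's
    // device base address, and remember the original host pointer value in
    // the shadow map so it can be restored when the data comes back to the
    // host.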
1704 if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
1705 DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
1706 DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin));
1707 uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
1708 void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
1709 int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase,
1710 sizeof(void *));
1711 if (rt != OFFLOAD_SUCCESS) {
1712 DP("Copying data to device failed.\n");
1713 rc = OFFLOAD_FAIL;
1714 }
1715 // create shadow pointers for this entry
1716 Device.ShadowMtx.lock();
1717 Device.ShadowPtrMap[Pointer_HstPtrBegin] = {HstPtrBase,
1718 Pointer_TgtPtrBegin, TgtPtrBase};
1719 Device.ShadowMtx.unlock();
1720 }
1721 }
1722
1723 return rc;
1724}
1725
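// The *_nowait entry points in this file do not implement truly asynchronous
// offloading yet: they wait for any outstanding dependences with
// __kmpc_omp_taskwait() and then simply call the synchronous entry point.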
1726EXTERN void __tgt_target_data_begin_nowait(int32_t device_id, int32_t arg_num,
1727 void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types,
1728 int32_t depNum, void *depList, int32_t noAliasDepNum,
1729 void *noAliasDepList) {
1730 if (depNum + noAliasDepNum > 0)
1731 __kmpc_omp_taskwait(NULL, 0);
1732
1733 __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes,
1734 arg_types);
1735}
1736
1737/// creates host-to-target data mapping, stores it in the
1738/// libomptarget.so internal structure (an entry in a stack of data maps)
1739/// and passes the data to the device.
1740EXTERN void __tgt_target_data_begin(int32_t device_id, int32_t arg_num,
1741 void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) {
1742 DP("Entering data begin region for device %d with %d mappings\n", device_id,
1743 arg_num);
1744
  // Use the default device if the caller did not specify one.
1746 if (device_id == OFFLOAD_DEVICE_DEFAULT) {
1747 device_id = omp_get_default_device();
1748 DP("Use default device id %d\n", device_id);
1749 }
1750
1751 if (CheckDevice(device_id) != OFFLOAD_SUCCESS) {
1752 DP("Failed to get device %d ready\n", device_id);
1753 return;
1754 }
1755
1756 DeviceTy& Device = Devices[device_id];
1757
1758 // Translate maps
1759 int32_t new_arg_num;
1760 void **new_args_base;
1761 void **new_args;
1762 int64_t *new_arg_sizes;
1763 int64_t *new_arg_types;
1764 translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num,
1765 new_args_base, new_args, new_arg_sizes, new_arg_types, false);
1766
1767 //target_data_begin(Device, arg_num, args_base, args, arg_sizes, arg_types);
1768 target_data_begin(Device, new_arg_num, new_args_base, new_args, new_arg_sizes,
1769 new_arg_types);
1770
1771 // Cleanup translation memory
1772 cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes,
1773 new_arg_types, arg_num, args_base);
1774}
1775
1776/// Internal function to undo the mapping and retrieve the data from the device.
1777static int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base,
1778 void **args, int64_t *arg_sizes, int64_t *arg_types) {
1779 int rc = OFFLOAD_SUCCESS;
1780 // process each input.
1781 for (int32_t i = arg_num - 1; i >= 0; --i) {
1782 // Ignore private variables and arrays - there is no mapping for them.
1783 // Also, ignore the use_device_ptr directive, it has no effect here.
1784 if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
1785 (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
1786 continue;
1787
1788 void *HstPtrBegin = args[i];
1789 bool IsLast;
1790 bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
1791 (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ);
1792 bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE;
1793
1794 // If PTR_AND_OBJ, HstPtrBegin is address of pointee
1795 void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast,
1796 UpdateRef);
1797 DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
1798 " - is%s last\n", arg_sizes[i], DPxPTR(TgtPtrBegin),
1799 (IsLast ? "" : " not"));
1800
George Rokos15a6e7d2017-02-15 20:45:37 +00001801 bool DelEntry = IsLast || ForceDelete;
1802
George Rokos2467df62017-01-25 21:27:24 +00001803 if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
1804 !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
George Rokos15a6e7d2017-02-15 20:45:37 +00001805 DelEntry = false; // protect parent struct from being deallocated
George Rokos2467df62017-01-25 21:27:24 +00001806 }
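    // The parent struct has its own entry in the argument list; that entry's
    // reference count, not the member's, decides when the device copy of the
    // whole struct is released.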
1807
George Rokos2467df62017-01-25 21:27:24 +00001808 if ((arg_types[i] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
1809 // Move data back to the host
1810 if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
1811 bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
1812 bool CopyMember = false;
1813 if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
1814 !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
1815 // Copy data only if the "parent" struct has RefCount==1.
1816 short parent_idx = member_of(arg_types[i]);
1817 long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
1818 assert(parent_rc > 0 && "parent struct not found");
1819 if (parent_rc == 1) {
1820 CopyMember = true;
1821 }
1822 }
1823
1824 if (DelEntry || Always || CopyMember) {
1825 DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
1826 arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
1827 int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, arg_sizes[i]);
1828 if (rt != OFFLOAD_SUCCESS) {
1829 DP("Copying data from device failed.\n");
1830 rc = OFFLOAD_FAIL;
1831 }
1832 }
1833 }
1834
1835 // If we copied back to the host a struct/array containing pointers, we
1836 // need to restore the original host pointer values from their shadow
1837 // copies. If the struct is going to be deallocated, remove any remaining
1838 // shadow pointer entries for this struct.
1839 uintptr_t lb = (uintptr_t) HstPtrBegin;
1840 uintptr_t ub = (uintptr_t) HstPtrBegin + arg_sizes[i];
1841 Device.ShadowMtx.lock();
      // Do not advance the iterator in the for-statement itself: when an
      // entry is erased, std::map::erase() returns the iterator to the next
      // element, whereas incrementing the erased iterator would be invalid.
      for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
          it != Device.ShadowPtrMap.end();) {
        void **ShadowHstPtrAddr = (void**) it->first;

        // An STL map is sorted on its keys; use this property
        // to quickly determine when to break out of the loop.
        if ((uintptr_t) ShadowHstPtrAddr < lb) {
          ++it;
          continue;
        }
        if ((uintptr_t) ShadowHstPtrAddr >= ub)
          break;

        // If we copied the struct to the host, we need to restore the pointer.
        if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
          DP("Restoring original host pointer value " DPxMOD " for host "
              "pointer " DPxMOD "\n", DPxPTR(it->second.HstPtrVal),
              DPxPTR(ShadowHstPtrAddr));
          *ShadowHstPtrAddr = it->second.HstPtrVal;
        }
        // If the struct is to be deallocated, remove the shadow entry.
        if (DelEntry) {
          DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr));
          it = Device.ShadowPtrMap.erase(it);
        } else {
          ++it;
        }
      }
1866 Device.ShadowMtx.unlock();
1867
1868 // Deallocate map
1869 if (DelEntry) {
1870 int rt = Device.deallocTgtPtr(HstPtrBegin, arg_sizes[i], ForceDelete);
1871 if (rt != OFFLOAD_SUCCESS) {
1872 DP("Deallocating data from device failed.\n");
1873 rc = OFFLOAD_FAIL;
1874 }
1875 }
1876 }
1877 }
1878
1879 return rc;
1880}
1881
1882/// passes data from the target, releases target memory and destroys
1883/// the host-target mapping (top entry from the stack of data maps)
1884/// created by the last __tgt_target_data_begin.
1885EXTERN void __tgt_target_data_end(int32_t device_id, int32_t arg_num,
1886 void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) {
1887 DP("Entering data end region with %d mappings\n", arg_num);
1888
  // Use the default device if the caller did not specify one.
1890 if (device_id == OFFLOAD_DEVICE_DEFAULT) {
1891 device_id = omp_get_default_device();
1892 }
1893
1894 RTLsMtx.lock();
1895 size_t Devices_size = Devices.size();
1896 RTLsMtx.unlock();
1897 if (Devices_size <= (size_t)device_id) {
1898 DP("Device ID %d does not have a matching RTL.\n", device_id);
1899 return;
1900 }
1901
1902 DeviceTy &Device = Devices[device_id];
1903 if (!Device.IsInit) {
    DP("Uninitialized device, ignoring\n");
1905 return;
1906 }
1907
1908 // Translate maps
1909 int32_t new_arg_num;
1910 void **new_args_base;
1911 void **new_args;
1912 int64_t *new_arg_sizes;
1913 int64_t *new_arg_types;
1914 translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num,
1915 new_args_base, new_args, new_arg_sizes, new_arg_types, false);
1916
1917 //target_data_end(Device, arg_num, args_base, args, arg_sizes, arg_types);
1918 target_data_end(Device, new_arg_num, new_args_base, new_args, new_arg_sizes,
1919 new_arg_types);
1920
1921 // Cleanup translation memory
1922 cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes,
1923 new_arg_types, arg_num, args_base);
1924}
1925
1926EXTERN void __tgt_target_data_end_nowait(int32_t device_id, int32_t arg_num,
1927 void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types,
1928 int32_t depNum, void *depList, int32_t noAliasDepNum,
1929 void *noAliasDepList) {
1930 if (depNum + noAliasDepNum > 0)
1931 __kmpc_omp_taskwait(NULL, 0);
1932
1933 __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes,
1934 arg_types);
1935}
1936
1937/// passes data to/from the target.
1938EXTERN void __tgt_target_data_update(int32_t device_id, int32_t arg_num,
1939 void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) {
1940 DP("Entering data update with %d mappings\n", arg_num);
1941
  // Use the default device if the caller did not specify one.
1943 if (device_id == OFFLOAD_DEVICE_DEFAULT) {
1944 device_id = omp_get_default_device();
1945 }
1946
1947 if (CheckDevice(device_id) != OFFLOAD_SUCCESS) {
1948 DP("Failed to get device %d ready\n", device_id);
1949 return;
1950 }
1951
1952 DeviceTy& Device = Devices[device_id];
1953
1954 // process each input.
1955 for (int32_t i = 0; i < arg_num; ++i) {
1956 if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
1957 (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
1958 continue;
1959
1960 void *HstPtrBegin = args[i];
1961 int64_t MapSize = arg_sizes[i];
1962 bool IsLast;
1963 void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast,
1964 false);
1965
1966 if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
1967 DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
1968 arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
1969 Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize);
1970
1971 uintptr_t lb = (uintptr_t) HstPtrBegin;
1972 uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize;
1973 Device.ShadowMtx.lock();
1974 for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
1975 it != Device.ShadowPtrMap.end(); ++it) {
1976 void **ShadowHstPtrAddr = (void**) it->first;
1977 if ((uintptr_t) ShadowHstPtrAddr < lb)
1978 continue;
1979 if ((uintptr_t) ShadowHstPtrAddr >= ub)
1980 break;
1981 DP("Restoring original host pointer value " DPxMOD " for host pointer "
1982 DPxMOD "\n", DPxPTR(it->second.HstPtrVal),
1983 DPxPTR(ShadowHstPtrAddr));
1984 *ShadowHstPtrAddr = it->second.HstPtrVal;
1985 }
1986 Device.ShadowMtx.unlock();
1987 }
1988
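    // After pushing host memory that contains pointers, the device copies of
    // those pointers would hold host addresses, so the stored device pointer
    // values are re-submitted from the shadow map below to keep device-side
    // pointers pointing at device memory.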
1989 if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
1990 DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
1991 arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
1992 Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize);
1993
1994 uintptr_t lb = (uintptr_t) HstPtrBegin;
1995 uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize;
1996 Device.ShadowMtx.lock();
1997 for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
1998 it != Device.ShadowPtrMap.end(); ++it) {
1999 void **ShadowHstPtrAddr = (void**) it->first;
2000 if ((uintptr_t) ShadowHstPtrAddr < lb)
2001 continue;
2002 if ((uintptr_t) ShadowHstPtrAddr >= ub)
2003 break;
2004 DP("Restoring original target pointer value " DPxMOD " for target "
2005 "pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal),
2006 DPxPTR(it->second.TgtPtrAddr));
2007 Device.data_submit(it->second.TgtPtrAddr,
2008 &it->second.TgtPtrVal, sizeof(void *));
2009 }
2010 Device.ShadowMtx.unlock();
2011 }
2012 }
2013}
2014
2015EXTERN void __tgt_target_data_update_nowait(
2016 int32_t device_id, int32_t arg_num, void **args_base, void **args,
2017 int64_t *arg_sizes, int32_t *arg_types, int32_t depNum, void *depList,
2018 int32_t noAliasDepNum, void *noAliasDepList) {
2019 if (depNum + noAliasDepNum > 0)
2020 __kmpc_omp_taskwait(NULL, 0);
2021
2022 __tgt_target_data_update(device_id, arg_num, args_base, args, arg_sizes,
2023 arg_types);
2024}
2025
/// Performs the same actions as data_begin (when arg_num is non-zero) and
/// initiates the run of the offloaded region on the target platform; once the
/// region has finished executing, it also performs the same actions as
/// data_update and data_end above (again, only when arg_num is non-zero).
/// Returns 0 if execution was successfully transferred to a target, and a
/// non-zero value otherwise.
2032static int target(int32_t device_id, void *host_ptr, int32_t arg_num,
2033 void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
2034 int32_t team_num, int32_t thread_limit, int IsTeamConstruct) {
2035 DeviceTy &Device = Devices[device_id];
2036
2037 // Find the table information in the map or look it up in the translation
2038 // tables.
2039 TableMap *TM = 0;
2040 TblMapMtx.lock();
2041 HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap.find(host_ptr);
2042 if (TableMapIt == HostPtrToTableMap.end()) {
2043 // We don't have a map. So search all the registered libraries.
2044 TrlTblMtx.lock();
2045 for (HostEntriesBeginToTransTableTy::iterator
2046 ii = HostEntriesBeginToTransTable.begin(),
2047 ie = HostEntriesBeginToTransTable.end();
2048 !TM && ii != ie; ++ii) {
2049 // get the translation table (which contains all the good info).
2050 TranslationTable *TransTable = &ii->second;
2051 // iterate over all the host table entries to see if we can locate the
2052 // host_ptr.
2053 __tgt_offload_entry *begin = TransTable->HostTable.EntriesBegin;
2054 __tgt_offload_entry *end = TransTable->HostTable.EntriesEnd;
2055 __tgt_offload_entry *cur = begin;
2056 for (uint32_t i = 0; cur < end; ++cur, ++i) {
2057 if (cur->addr != host_ptr)
2058 continue;
2059 // we got a match, now fill the HostPtrToTableMap so that we
2060 // may avoid this search next time.
2061 TM = &HostPtrToTableMap[host_ptr];
2062 TM->Table = TransTable;
2063 TM->Index = i;
2064 break;
2065 }
2066 }
2067 TrlTblMtx.unlock();
2068 } else {
2069 TM = &TableMapIt->second;
2070 }
2071 TblMapMtx.unlock();
2072
2073 // No map for this host pointer found!
2074 if (!TM) {
2075 DP("Host ptr " DPxMOD " does not have a matching target pointer.\n",
2076 DPxPTR(host_ptr));
2077 return OFFLOAD_FAIL;
2078 }
2079
2080 // get target table.
2081 TrlTblMtx.lock();
2082 assert(TM->Table->TargetsTable.size() > (size_t)device_id &&
2083 "Not expecting a device ID outside the table's bounds!");
2084 __tgt_target_table *TargetTable = TM->Table->TargetsTable[device_id];
2085 TrlTblMtx.unlock();
2086 assert(TargetTable && "Global data has not been mapped\n");
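  // The host and target entry tables are expected to be generated in the same
  // order, so the index of host_ptr in the host table also selects the
  // matching entry in TargetTable when the kernel is launched below.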
2087
2088 // Move data to device.
2089 int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes,
2090 arg_types);
2091
2092 if (rc != OFFLOAD_SUCCESS) {
2093 DP("Call to target_data_begin failed, skipping target execution.\n");
2094 // Call target_data_end to dealloc whatever target_data_begin allocated
2095 // and return OFFLOAD_FAIL.
2096 target_data_end(Device, arg_num, args_base, args, arg_sizes, arg_types);
2097 return OFFLOAD_FAIL;
2098 }
2099
2100 std::vector<void *> tgt_args;
2101
2102 // List of (first-)private arrays allocated for this target region
2103 std::vector<void *> fpArrays;
2104
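  // Build the kernel argument vector: LITERAL (firstprivate scalar) entries
  // are passed by value, PRIVATE entries get a fresh device allocation
  // (initialized from the host only when TO is also set), PTR_AND_OBJ entries
  // pass the device address where the mapped pointer is stored, and all other
  // mapped entries pass the device counterpart of the host base pointer
  // (device begin minus the host begin-to-base offset).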
2105 for (int32_t i = 0; i < arg_num; ++i) {
2106 if (!(arg_types[i] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
2107 // This is not a target parameter, do not push it into tgt_args.
2108 continue;
2109 }
2110 void *HstPtrBegin = args[i];
2111 void *HstPtrBase = args_base[i];
2112 void *TgtPtrBase;
2113 bool IsLast; // unused.
2114 if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) {
2115 DP("Forwarding first-private value " DPxMOD " to the target construct\n",
2116 DPxPTR(HstPtrBase));
2117 TgtPtrBase = HstPtrBase;
2118 } else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) {
2119 // Allocate memory for (first-)private array
2120 void *TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID,
2121 arg_sizes[i]);
2122 if (!TgtPtrBegin) {
        DP("Data allocation for %sprivate array " DPxMOD " failed\n",
2124 (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""),
2125 DPxPTR(HstPtrBegin));
2126 rc = OFFLOAD_FAIL;
2127 break;
2128 } else {
2129 fpArrays.push_back(TgtPtrBegin);
2130 uint64_t PtrDelta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
2131 TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - PtrDelta);
2132 DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for "
2133 "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n",
2134 arg_sizes[i], DPxPTR(TgtPtrBegin),
2135 (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""),
2136 DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase));
2137 // If first-private, copy data from host
2138 if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
2139 int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]);
2140 if (rt != OFFLOAD_SUCCESS) {
            DP("Copying data to device failed.\n");
2142 rc = OFFLOAD_FAIL;
2143 break;
2144 }
2145 }
2146 }
2147 } else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
2148 void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *),
2149 IsLast, false);
2150 TgtPtrBase = TgtPtrBegin; // no offset for ptrs.
2151 DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to "
2152 "object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase),
2153 DPxPTR(HstPtrBase));
2154 } else {
2155 void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i],
2156 IsLast, false);
2157 uint64_t PtrDelta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
2158 TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - PtrDelta);
2159 DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
2160 DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
2161 }
2162 tgt_args.push_back(TgtPtrBase);
2163 }
2164 // Push omp handle.
2165 tgt_args.push_back((void *)0);
2166
2167 // Pop loop trip count
2168 uint64_t ltc = Device.loopTripCnt;
2169 Device.loopTripCnt = 0;
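  // The trip count was stashed per device by __kmpc_push_target_tripcount()
  // just before this call; consume it here (resetting it to 0) and pass it to
  // the team launch so the plugin can use it, e.g. to pick the number of
  // teams.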
2170
2171 // Launch device execution.
2172 if (rc == OFFLOAD_SUCCESS) {
2173 DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
2174 TargetTable->EntriesBegin[TM->Index].name,
2175 DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index);
2176 if (IsTeamConstruct) {
2177 rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr,
2178 &tgt_args[0], tgt_args.size(), team_num, thread_limit, ltc);
2179 } else {
2180 rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr,
2181 &tgt_args[0], tgt_args.size());
2182 }
2183 } else {
2184 DP("Errors occurred while obtaining target arguments, skipping kernel "
2185 "execution\n");
2186 }
2187
2188 // Deallocate (first-)private arrays
2189 for (auto it : fpArrays) {
2190 int rt = Device.RTL->data_delete(Device.RTLDeviceID, it);
2191 if (rt != OFFLOAD_SUCCESS) {
2192 DP("Deallocation of (first-)private arrays failed.\n");
2193 rc = OFFLOAD_FAIL;
2194 }
2195 }
2196
2197 // Move data from device.
2198 int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes,
2199 arg_types);
2200
2201 if (rt != OFFLOAD_SUCCESS) {
2202 DP("Call to target_data_end failed.\n");
2203 rc = OFFLOAD_FAIL;
2204 }
2205
2206 return rc;
2207}
2208
2209EXTERN int __tgt_target(int32_t device_id, void *host_ptr, int32_t arg_num,
2210 void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) {
George Rokos2467df62017-01-25 21:27:24 +00002211 DP("Entering target region with entry point " DPxMOD " and device Id %d\n",
2212 DPxPTR(host_ptr), device_id);
2213
2214 if (device_id == OFFLOAD_DEVICE_DEFAULT) {
2215 device_id = omp_get_default_device();
2216 }
2217
2218 if (CheckDevice(device_id) != OFFLOAD_SUCCESS) {
2219 DP("Failed to get device %d ready\n", device_id);
2220 return OFFLOAD_FAIL;
2221 }
2222
2223 // Translate maps
2224 int32_t new_arg_num;
2225 void **new_args_base;
2226 void **new_args;
2227 int64_t *new_arg_sizes;
2228 int64_t *new_arg_types;
2229 translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num,
2230 new_args_base, new_args, new_arg_sizes, new_arg_types, true);
2231
2232 //return target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
2233 // arg_types, 0, 0, false /*team*/, false /*recursive*/);
2234 int rc = target(device_id, host_ptr, new_arg_num, new_args_base, new_args,
2235 new_arg_sizes, new_arg_types, 0, 0, false /*team*/);
2236
2237 // Cleanup translation memory
2238 cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes,
2239 new_arg_types, arg_num, args_base);
2240
2241 return rc;
2242}
2243
2244EXTERN int __tgt_target_nowait(int32_t device_id, void *host_ptr,
2245 int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes,
2246 int32_t *arg_types, int32_t depNum, void *depList, int32_t noAliasDepNum,
2247 void *noAliasDepList) {
2248 if (depNum + noAliasDepNum > 0)
2249 __kmpc_omp_taskwait(NULL, 0);
2250
2251 return __tgt_target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
2252 arg_types);
2253}
2254
2255EXTERN int __tgt_target_teams(int32_t device_id, void *host_ptr,
2256 int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes,
2257 int32_t *arg_types, int32_t team_num, int32_t thread_limit) {
George Rokos2467df62017-01-25 21:27:24 +00002258 DP("Entering target region with entry point " DPxMOD " and device Id %d\n",
2259 DPxPTR(host_ptr), device_id);
2260
2261 if (device_id == OFFLOAD_DEVICE_DEFAULT) {
2262 device_id = omp_get_default_device();
2263 }
2264
2265 if (CheckDevice(device_id) != OFFLOAD_SUCCESS) {
2266 DP("Failed to get device %d ready\n", device_id);
2267 return OFFLOAD_FAIL;
2268 }
2269
2270 // Translate maps
2271 int32_t new_arg_num;
2272 void **new_args_base;
2273 void **new_args;
2274 int64_t *new_arg_sizes;
2275 int64_t *new_arg_types;
2276 translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num,
2277 new_args_base, new_args, new_arg_sizes, new_arg_types, true);
2278
2279 //return target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
2280 // arg_types, team_num, thread_limit, true /*team*/,
2281 // false /*recursive*/);
2282 int rc = target(device_id, host_ptr, new_arg_num, new_args_base, new_args,
2283 new_arg_sizes, new_arg_types, team_num, thread_limit, true /*team*/);
2284
2285 // Cleanup translation memory
2286 cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes,
2287 new_arg_types, arg_num, args_base);
2288
2289 return rc;
2290}
2291
2292EXTERN int __tgt_target_teams_nowait(int32_t device_id, void *host_ptr,
2293 int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes,
2294 int32_t *arg_types, int32_t team_num, int32_t thread_limit, int32_t depNum,
2295 void *depList, int32_t noAliasDepNum, void *noAliasDepList) {
2296 if (depNum + noAliasDepNum > 0)
2297 __kmpc_omp_taskwait(NULL, 0);
2298
2299 return __tgt_target_teams(device_id, host_ptr, arg_num, args_base, args,
2300 arg_sizes, arg_types, team_num, thread_limit);
2301}
2302
2303
2304// The trip count mechanism will be revised - this scheme is not thread-safe.
2305EXTERN void __kmpc_push_target_tripcount(int32_t device_id,
2306 uint64_t loop_tripcount) {
2307 if (device_id == OFFLOAD_DEVICE_DEFAULT) {
2308 device_id = omp_get_default_device();
2309 }
2310
2311 if (CheckDevice(device_id) != OFFLOAD_SUCCESS) {
2312 DP("Failed to get device %d ready\n", device_id);
2313 return;
2314 }
2315
2316 DP("__kmpc_push_target_tripcount(%d, %" PRIu64 ")\n", device_id,
2317 loop_tripcount);
2318 Devices[device_id].loopTripCnt = loop_tripcount;
2319}
2320