blob: c182dfbcbb782a2368edb5f346938f7117b5bcb1 [file] [log] [blame]
George Rokos2467df62017-01-25 21:27:24 +00001//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is dual licensed under the MIT and the University of Illinois Open
6// Source Licenses. See LICENSE.txt for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// Implementation of the interface to be used by Clang during the codegen of a
11// target region.
12//
13//===----------------------------------------------------------------------===//
14
15#include <algorithm>
16#include <cassert>
17#include <climits>
18#include <cstdlib>
19#include <cstring>
20#include <dlfcn.h>
21#include <list>
22#include <map>
23#include <mutex>
24#include <string>
25#include <vector>
26
27// Header file global to this project
28#include "omptarget.h"
29
// Debug print helper: every message is prefixed with the library name.
#define DP(...) DEBUGP("Libomptarget", __VA_ARGS__)
// "Infinite" reference count, used for entries created via
// omp_target_associate_ptr so map/unmap constructs never free them.
#define INF_REF_CNT (LONG_MAX>>1) // leave room for additions/subtractions
// A count anywhere near INF_REF_CNT is still considered infinite, even after
// some decrements have been applied to it.
#define CONSIDERED_INF(x) (x > (INF_REF_CNT>>1))
33
// List of all plugins that can support offloading.
// Candidate plugin shared objects; LoadRTLs() tries to dlopen() each in order.
static const char *RTLNames[] = {
    /* PowerPC target */ "libomptarget.rtl.ppc64.so",
    /* x86_64 target */ "libomptarget.rtl.x86_64.so",
    /* CUDA target */ "libomptarget.rtl.cuda.so",
    /* AArch64 target */ "libomptarget.rtl.aarch64.so"};
George Rokos2467df62017-01-25 21:27:24 +000040
// forward declarations
struct RTLInfoTy;
// NOTE(review): presumably the shared driver behind the __tgt_target* entry
// points; its definition lies outside this chunk — confirm there.
static int target(int32_t device_id, void *host_ptr, int32_t arg_num,
    void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
    int32_t team_num, int32_t thread_limit, int IsTeamConstruct);
46
/// Map between host data and target data.
/// One entry describes a contiguous host range [HstPtrBegin, HstPtrEnd)
/// mirrored on the device starting at TgtPtrBegin.
struct HostDataToTargetTy {
  uintptr_t HstPtrBase;  // host info.
  uintptr_t HstPtrBegin; // first mapped host byte.
  uintptr_t HstPtrEnd;   // non-inclusive.

  uintptr_t TgtPtrBegin; // target info.

  long RefCount; // number of live mappings of this entry.

  // The three constructors delegate to the fully-specified one.
  HostDataToTargetTy() : HostDataToTargetTy(0, 0, 0, 0, 0) {}
  HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TB)
      : HostDataToTargetTy(BP, B, E, TB, 1) {}
  HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TB,
                     long RF)
      : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E), TgtPtrBegin(TB),
        RefCount(RF) {}
};

typedef std::list<HostDataToTargetTy> HostDataToTargetListTy;
70
71struct LookupResult {
72 struct {
73 unsigned IsContained : 1;
74 unsigned ExtendsBefore : 1;
75 unsigned ExtendsAfter : 1;
76 } Flags;
77
78 HostDataToTargetListTy::iterator Entry;
79
Jonas Hahnfeldcfe5ef52017-01-27 11:03:33 +000080 LookupResult() : Flags({0,0,0}), Entry() {}
George Rokos2467df62017-01-25 21:27:24 +000081};
82
/// Map for shadow pointers
/// NOTE(review): field semantics inferred from names only — confirm against
/// the users of DeviceTy::ShadowPtrMap elsewhere in the runtime.
struct ShadowPtrValTy {
  void *HstPtrVal;  // pointer value as seen on the host
  void *TgtPtrAddr; // target address holding the pointer cell
  void *TgtPtrVal;  // translated pointer value on the target
};
// Keyed by the host address that contains the pointer.
typedef std::map<void *, ShadowPtrValTy> ShadowPtrListTy;
90
/// Per-library lists of constructor/destructor entry points still pending
/// execution, keyed by the library's binary descriptor.
struct PendingCtorDtorListsTy {
  std::list<void *> PendingCtors;
  std::list<void *> PendingDtors;
};
typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy>
    PendingCtorsDtorsPerLibrary;
98
/// Per-device state: the RTL that drives the device, its mapping tables and
/// the synchronization objects guarding them.
struct DeviceTy {
  int32_t DeviceID;    // OpenMP (global) device ID.
  RTLInfoTy *RTL;      // Plugin serving this device.
  int32_t RTLDeviceID; // Device ID local to the owning RTL.

  bool IsInit;             // Set by init() when the RTL reports success.
  std::once_flag InitFlag; // Makes initOnce() run init() at most once.
  bool HasPendingGlobals;

  HostDataToTargetListTy HostDataToTargetMap; // Active host<->target mappings.
  PendingCtorsDtorsPerLibrary PendingCtorsDtors;

  ShadowPtrListTy ShadowPtrMap;

  // Guard HostDataToTargetMap, the pending-globals state and ShadowPtrMap,
  // respectively.
  std::mutex DataMapMtx, PendingGlobalsMtx, ShadowMtx;

  // NOTE(review): presumably the trip count forwarded to run_team_region();
  // confirm against the target() implementation outside this chunk.
  uint64_t loopTripCnt;

  DeviceTy(RTLInfoTy *RTL)
      : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(),
        HasPendingGlobals(false), HostDataToTargetMap(),
        PendingCtorsDtors(), ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(),
        ShadowMtx(), loopTripCnt(0) {}

  // The existence of mutexes makes DeviceTy non-copyable. We need to
  // provide a copy constructor and an assignment operator explicitly.
  // Note: the copy gets fresh mutexes and a fresh InitFlag; only data
  // members are copied.
  DeviceTy(const DeviceTy &d)
      : DeviceID(d.DeviceID), RTL(d.RTL), RTLDeviceID(d.RTLDeviceID),
        IsInit(d.IsInit), InitFlag(), HasPendingGlobals(d.HasPendingGlobals),
        HostDataToTargetMap(d.HostDataToTargetMap),
        PendingCtorsDtors(d.PendingCtorsDtors), ShadowPtrMap(d.ShadowPtrMap),
        DataMapMtx(), PendingGlobalsMtx(),
        ShadowMtx(), loopTripCnt(d.loopTripCnt) {}

  DeviceTy& operator=(const DeviceTy &d) {
    DeviceID = d.DeviceID;
    RTL = d.RTL;
    RTLDeviceID = d.RTLDeviceID;
    IsInit = d.IsInit;
    HasPendingGlobals = d.HasPendingGlobals;
    HostDataToTargetMap = d.HostDataToTargetMap;
    PendingCtorsDtors = d.PendingCtorsDtors;
    ShadowPtrMap = d.ShadowPtrMap;
    loopTripCnt = d.loopTripCnt;

    return *this;
  }

  // Mapping-table operations; all lock DataMapMtx internally except
  // lookupMapping (callers hold the lock) and the 2-argument
  // getTgtPtrBegin (documented lock-free at its definition).
  long getMapEntryRefCnt(void *HstPtrBegin);
  LookupResult lookupMapping(void *HstPtrBegin, int64_t Size);
  void *getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size,
      bool &IsNew, bool IsImplicit, bool UpdateRefCount = true);
  void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size);
  void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
      bool UpdateRefCount);
  // NOTE(review): the definition names this parameter HstPtrBegin and looks
  // it up as a host address — the declaration's TgtPtrBegin is misleading.
  int deallocTgtPtr(void *TgtPtrBegin, int64_t Size, bool ForceDelete);
  int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
  int disassociatePtr(void *HstPtrBegin);

  // calls to RTL
  int32_t initOnce();
  __tgt_target_table *load_binary(void *Img);

  int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size);
  int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);

  int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr, int32_t TgtVarsSize);
  int32_t run_team_region(void *TgtEntryPtr, void **TgtVarsPtr,
      int32_t TgtVarsSize, int32_t NumTeams, int32_t ThreadLimit,
      uint64_t LoopTripCount);

private:
  // Call to RTL
  void init(); // To be called only via DeviceTy::initOnce()
};
174
/// Map between Device ID (i.e. openmp device id) and its DeviceTy.
typedef std::vector<DeviceTy> DevicesTy;
// Indexed by OpenMP device ID. Per device_is_ready(), the size changes only
// while a new library is being registered (guarded by RTLsMtx).
static DevicesTy Devices;
178
/// Descriptor of one offloading plugin: its dlopen() handle, the resolved
/// __tgt_rtl_* entry points, and the devices it exposes.
struct RTLInfoTy {
  // Signatures of the interface functions every plugin must export
  // (resolved by dlsym in RTLsTy::LoadRTLs).
  typedef int32_t(is_valid_binary_ty)(void *);
  typedef int32_t(number_of_devices_ty)();
  typedef int32_t(init_device_ty)(int32_t);
  typedef __tgt_target_table *(load_binary_ty)(int32_t, void *);
  typedef void *(data_alloc_ty)(int32_t, int64_t);
  typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t);
  typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t);
  typedef int32_t(data_delete_ty)(int32_t, void *);
  typedef int32_t(run_region_ty)(int32_t, void *, void **, int32_t);
  typedef int32_t(run_team_region_ty)(int32_t, void *, void **, int32_t,
      int32_t, int32_t, uint64_t);

  int32_t Idx; // RTL index, index is the number of devices
               // of other RTLs that were registered before,
               // i.e. the OpenMP index of the first device
               // to be registered with this RTL.
  int32_t NumberOfDevices; // Number of devices this RTL deals with.
  std::vector<DeviceTy *> Devices; // one per device (NumberOfDevices in total).

  void *LibraryHandler; // Handle returned by dlopen() for this plugin.

#ifdef OMPTARGET_DEBUG
  std::string RTLName; // Plugin file name, kept for debug messages only.
#endif

  // Functions implemented in the RTL.
  is_valid_binary_ty *is_valid_binary;
  number_of_devices_ty *number_of_devices;
  init_device_ty *init_device;
  load_binary_ty *load_binary;
  data_alloc_ty *data_alloc;
  data_submit_ty *data_submit;
  data_retrieve_ty *data_retrieve;
  data_delete_ty *data_delete;
  run_region_ty *run_region;
  run_team_region_ty *run_team_region;

  // Are there images associated with this RTL.
  bool isUsed;

  // Mutex for thread-safety when calling RTL interface functions.
  // It is easier to enforce thread-safety at the libomptarget level,
  // so that developers of new RTLs do not have to worry about it.
  std::mutex Mtx;

  // The existence of the mutex above makes RTLInfoTy non-copyable.
  // We need to provide a copy constructor explicitly.
  RTLInfoTy()
      : Idx(-1), NumberOfDevices(-1), Devices(), LibraryHandler(0),
#ifdef OMPTARGET_DEBUG
        RTLName(),
#endif
        is_valid_binary(0), number_of_devices(0), init_device(0),
        load_binary(0), data_alloc(0), data_submit(0), data_retrieve(0),
        data_delete(0), run_region(0), run_team_region(0), isUsed(false),
        Mtx() {}

  // Copies all data members; the copy gets a fresh (unlocked) mutex.
  RTLInfoTy(const RTLInfoTy &r) : Mtx() {
    Idx = r.Idx;
    NumberOfDevices = r.NumberOfDevices;
    Devices = r.Devices;
    LibraryHandler = r.LibraryHandler;
#ifdef OMPTARGET_DEBUG
    RTLName = r.RTLName;
#endif
    is_valid_binary = r.is_valid_binary;
    number_of_devices = r.number_of_devices;
    init_device = r.init_device;
    load_binary = r.load_binary;
    data_alloc = r.data_alloc;
    data_submit = r.data_submit;
    data_retrieve = r.data_retrieve;
    data_delete = r.data_delete;
    run_region = r.run_region;
    run_team_region = r.run_team_region;
    isUsed = r.isUsed;
  }
};
258
/// RTLs identified in the system.
class RTLsTy {
private:
  // Mutex-like object to guarantee thread-safety and unique initialization
  // (i.e. the library attempts to load the RTLs (plugins) only once).
  std::once_flag initFlag;
  void LoadRTLs(); // not thread-safe; call only through LoadRTLsOnce()

public:
  // List of the detected runtime libraries.
  std::list<RTLInfoTy> AllRTLs;

  // Array of pointers to the detected runtime libraries that have compatible
  // binaries.
  // NOTE(review): populated outside this chunk; presumably points into
  // AllRTLs — confirm at the registration code.
  std::vector<RTLInfoTy *> UsedRTLs;

  explicit RTLsTy() {}

  // Load all the runtime libraries (plugins) if not done before.
  void LoadRTLsOnce();
};
280
281void RTLsTy::LoadRTLs() {
282 // Parse environment variable OMP_TARGET_OFFLOAD (if set)
283 char *envStr = getenv("OMP_TARGET_OFFLOAD");
284 if (envStr && !strcmp(envStr, "DISABLED")) {
285 DP("Target offloading disabled by environment\n");
286 return;
287 }
288
289 DP("Loading RTLs...\n");
290
291 // Attempt to open all the plugins and, if they exist, check if the interface
292 // is correct and if they are supporting any devices.
293 for (auto *Name : RTLNames) {
294 DP("Loading library '%s'...\n", Name);
295 void *dynlib_handle = dlopen(Name, RTLD_NOW);
296
297 if (!dynlib_handle) {
298 // Library does not exist or cannot be found.
299 DP("Unable to load library '%s': %s!\n", Name, dlerror());
300 continue;
301 }
302
303 DP("Successfully loaded library '%s'!\n", Name);
304
305 // Retrieve the RTL information from the runtime library.
306 RTLInfoTy R;
307
308 R.LibraryHandler = dynlib_handle;
309 R.isUsed = false;
310
311#ifdef OMPTARGET_DEBUG
312 R.RTLName = Name;
313#endif
314
315 if (!(R.is_valid_binary = (RTLInfoTy::is_valid_binary_ty *)dlsym(
316 dynlib_handle, "__tgt_rtl_is_valid_binary")))
317 continue;
318 if (!(R.number_of_devices = (RTLInfoTy::number_of_devices_ty *)dlsym(
319 dynlib_handle, "__tgt_rtl_number_of_devices")))
320 continue;
321 if (!(R.init_device = (RTLInfoTy::init_device_ty *)dlsym(
322 dynlib_handle, "__tgt_rtl_init_device")))
323 continue;
324 if (!(R.load_binary = (RTLInfoTy::load_binary_ty *)dlsym(
325 dynlib_handle, "__tgt_rtl_load_binary")))
326 continue;
327 if (!(R.data_alloc = (RTLInfoTy::data_alloc_ty *)dlsym(
328 dynlib_handle, "__tgt_rtl_data_alloc")))
329 continue;
330 if (!(R.data_submit = (RTLInfoTy::data_submit_ty *)dlsym(
331 dynlib_handle, "__tgt_rtl_data_submit")))
332 continue;
333 if (!(R.data_retrieve = (RTLInfoTy::data_retrieve_ty *)dlsym(
334 dynlib_handle, "__tgt_rtl_data_retrieve")))
335 continue;
336 if (!(R.data_delete = (RTLInfoTy::data_delete_ty *)dlsym(
337 dynlib_handle, "__tgt_rtl_data_delete")))
338 continue;
339 if (!(R.run_region = (RTLInfoTy::run_region_ty *)dlsym(
340 dynlib_handle, "__tgt_rtl_run_target_region")))
341 continue;
342 if (!(R.run_team_region = (RTLInfoTy::run_team_region_ty *)dlsym(
343 dynlib_handle, "__tgt_rtl_run_target_team_region")))
344 continue;
345
346 // No devices are supported by this RTL?
347 if (!(R.NumberOfDevices = R.number_of_devices())) {
348 DP("No devices supported in this RTL\n");
349 continue;
350 }
351
352 DP("Registering RTL %s supporting %d devices!\n",
353 R.RTLName.c_str(), R.NumberOfDevices);
354
355 // The RTL is valid! Will save the information in the RTLs list.
356 AllRTLs.push_back(R);
357 }
358
359 DP("RTLs loaded!\n");
360
361 return;
362}
363
364void RTLsTy::LoadRTLsOnce() {
365 // RTL.LoadRTLs() is called only once in a thread-safe fashion.
366 std::call_once(initFlag, &RTLsTy::LoadRTLs, this);
367}
368
static RTLsTy RTLs;        // Process-wide plugin registry.
static std::mutex RTLsMtx; // Guards changes to Devices' size during library
                           // registration (see device_is_ready).
371
/// Map between the host entry begin and the translation table. Each
/// registered library gets one TranslationTable. Use the map from
/// __tgt_offload_entry so that we may quickly determine whether we
/// are trying to (re)register an existing lib or really have a new one.
struct TranslationTable {
  __tgt_target_table HostTable;

  // Image assigned to a given device.
  std::vector<__tgt_device_image *> TargetsImages; // One image per device ID.

  // Table of entry points or NULL if it was not already computed.
  std::vector<__tgt_target_table *> TargetsTable; // One table per device ID.
};
typedef std::map<__tgt_offload_entry *, TranslationTable>
    HostEntriesBeginToTransTableTy;
static HostEntriesBeginToTransTableTy HostEntriesBeginToTransTable;
// NOTE(review): presumably guards HostEntriesBeginToTransTable — its use
// sites are outside this chunk.
static std::mutex TrlTblMtx;
389
390/// Map between the host ptr and a table index
391struct TableMap {
392 TranslationTable *Table; // table associated with the host ptr.
393 uint32_t Index; // index in which the host ptr translated entry is found.
394 TableMap() : Table(0), Index(0) {}
395 TableMap(TranslationTable *table, uint32_t index)
396 : Table(table), Index(index) {}
397};
398typedef std::map<void *, TableMap> HostPtrToTableMapTy;
399static HostPtrToTableMapTy HostPtrToTableMap;
400static std::mutex TblMapMtx;
401
402/// Check whether a device has an associated RTL and initialize it if it's not
403/// already initialized.
404static bool device_is_ready(int device_num) {
405 DP("Checking whether device %d is ready.\n", device_num);
406 // Devices.size() can only change while registering a new
407 // library, so try to acquire the lock of RTLs' mutex.
408 RTLsMtx.lock();
409 size_t Devices_size = Devices.size();
410 RTLsMtx.unlock();
411 if (Devices_size <= (size_t)device_num) {
412 DP("Device ID %d does not have a matching RTL\n", device_num);
413 return false;
414 }
415
416 // Get device info
417 DeviceTy &Device = Devices[device_num];
418
419 DP("Is the device %d (local ID %d) initialized? %d\n", device_num,
420 Device.RTLDeviceID, Device.IsInit);
421
422 // Init the device if not done before
423 if (!Device.IsInit && Device.initOnce() != OFFLOAD_SUCCESS) {
424 DP("Failed to init device %d\n", device_num);
425 return false;
426 }
427
428 DP("Device %d is ready to use.\n", device_num);
429
430 return true;
431}
432
433////////////////////////////////////////////////////////////////////////////////
434// Target API functions
435//
436EXTERN int omp_get_num_devices(void) {
437 RTLsMtx.lock();
438 size_t Devices_size = Devices.size();
439 RTLsMtx.unlock();
440
441 DP("Call to omp_get_num_devices returning %zd\n", Devices_size);
442
443 return Devices_size;
444}
445
/// The initial device is always the host, identified by HOST_DEVICE.
EXTERN int omp_get_initial_device(void) {
  DP("Call to omp_get_initial_device returning %d\n", HOST_DEVICE);
  return HOST_DEVICE;
}
450
451EXTERN void *omp_target_alloc(size_t size, int device_num) {
452 DP("Call to omp_target_alloc for device %d requesting %zu bytes\n",
453 device_num, size);
454
455 if (size <= 0) {
456 DP("Call to omp_target_alloc with non-positive length\n");
457 return NULL;
458 }
459
460 void *rc = NULL;
461
462 if (device_num == omp_get_initial_device()) {
463 rc = malloc(size);
464 DP("omp_target_alloc returns host ptr " DPxMOD "\n", DPxPTR(rc));
465 return rc;
466 }
467
468 if (!device_is_ready(device_num)) {
469 DP("omp_target_alloc returns NULL ptr\n");
470 return NULL;
471 }
472
473 DeviceTy &Device = Devices[device_num];
474 rc = Device.RTL->data_alloc(Device.RTLDeviceID, size);
475 DP("omp_target_alloc returns device ptr " DPxMOD "\n", DPxPTR(rc));
476 return rc;
477}
478
479EXTERN void omp_target_free(void *device_ptr, int device_num) {
480 DP("Call to omp_target_free for device %d and address " DPxMOD "\n",
481 device_num, DPxPTR(device_ptr));
482
483 if (!device_ptr) {
484 DP("Call to omp_target_free with NULL ptr\n");
485 return;
486 }
487
488 if (device_num == omp_get_initial_device()) {
489 free(device_ptr);
490 DP("omp_target_free deallocated host ptr\n");
491 return;
492 }
493
494 if (!device_is_ready(device_num)) {
495 DP("omp_target_free returns, nothing to do\n");
496 return;
497 }
498
499 DeviceTy &Device = Devices[device_num];
500 Device.RTL->data_delete(Device.RTLDeviceID, (void *)device_ptr);
501 DP("omp_target_free deallocated device ptr\n");
502}
503
504EXTERN int omp_target_is_present(void *ptr, int device_num) {
505 DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n",
506 device_num, DPxPTR(ptr));
507
508 if (!ptr) {
509 DP("Call to omp_target_is_present with NULL ptr, returning false\n");
510 return false;
511 }
512
513 if (device_num == omp_get_initial_device()) {
514 DP("Call to omp_target_is_present on host, returning true\n");
515 return true;
516 }
517
518 RTLsMtx.lock();
519 size_t Devices_size = Devices.size();
520 RTLsMtx.unlock();
521 if (Devices_size <= (size_t)device_num) {
522 DP("Call to omp_target_is_present with invalid device ID, returning "
523 "false\n");
524 return false;
525 }
526
527 DeviceTy& Device = Devices[device_num];
528 bool IsLast; // not used
529 int rc = (Device.getTgtPtrBegin(ptr, 0, IsLast, false) != NULL);
530 DP("Call to omp_target_is_present returns %d\n", rc);
531 return rc;
532}
533
534EXTERN int omp_target_memcpy(void *dst, void *src, size_t length,
535 size_t dst_offset, size_t src_offset, int dst_device, int src_device) {
536 DP("Call to omp_target_memcpy, dst device %d, src device %d, "
537 "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, "
538 "src offset %zu, length %zu\n", dst_device, src_device, DPxPTR(dst),
539 DPxPTR(src), dst_offset, src_offset, length);
540
541 if (!dst || !src || length <= 0) {
542 DP("Call to omp_target_memcpy with invalid arguments\n");
543 return OFFLOAD_FAIL;
544 }
545
546 if (src_device != omp_get_initial_device() && !device_is_ready(src_device)) {
547 DP("omp_target_memcpy returns OFFLOAD_FAIL\n");
548 return OFFLOAD_FAIL;
549 }
550
551 if (dst_device != omp_get_initial_device() && !device_is_ready(dst_device)) {
552 DP("omp_target_memcpy returns OFFLOAD_FAIL\n");
553 return OFFLOAD_FAIL;
554 }
555
556 int rc = OFFLOAD_SUCCESS;
557 void *srcAddr = (char *)src + src_offset;
558 void *dstAddr = (char *)dst + dst_offset;
559
560 if (src_device == omp_get_initial_device() &&
561 dst_device == omp_get_initial_device()) {
562 DP("copy from host to host\n");
563 const void *p = memcpy(dstAddr, srcAddr, length);
564 if (p == NULL)
565 rc = OFFLOAD_FAIL;
566 } else if (src_device == omp_get_initial_device()) {
567 DP("copy from host to device\n");
568 DeviceTy& DstDev = Devices[dst_device];
569 rc = DstDev.data_submit(dstAddr, srcAddr, length);
570 } else if (dst_device == omp_get_initial_device()) {
571 DP("copy from device to host\n");
572 DeviceTy& SrcDev = Devices[src_device];
573 rc = SrcDev.data_retrieve(dstAddr, srcAddr, length);
574 } else {
575 DP("copy from device to device\n");
576 void *buffer = malloc(length);
577 DeviceTy& SrcDev = Devices[src_device];
578 DeviceTy& DstDev = Devices[dst_device];
579 rc = SrcDev.data_retrieve(buffer, srcAddr, length);
580 if (rc == OFFLOAD_SUCCESS)
581 rc = DstDev.data_submit(dstAddr, buffer, length);
582 }
583
584 DP("omp_target_memcpy returns %d\n", rc);
585 return rc;
586}
587
588EXTERN int omp_target_memcpy_rect(void *dst, void *src, size_t element_size,
589 int num_dims, const size_t *volume, const size_t *dst_offsets,
590 const size_t *src_offsets, const size_t *dst_dimensions,
591 const size_t *src_dimensions, int dst_device, int src_device) {
592 DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, "
593 "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", "
594 "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", "
595 "volume " DPxMOD ", element size %zu, num_dims %d\n", dst_device,
596 src_device, DPxPTR(dst), DPxPTR(src), DPxPTR(dst_offsets),
597 DPxPTR(src_offsets), DPxPTR(dst_dimensions), DPxPTR(src_dimensions),
598 DPxPTR(volume), element_size, num_dims);
599
600 if (!(dst || src)) {
601 DP("Call to omp_target_memcpy_rect returns max supported dimensions %d\n",
602 INT_MAX);
603 return INT_MAX;
604 }
605
606 if (!dst || !src || element_size < 1 || num_dims < 1 || !volume ||
607 !dst_offsets || !src_offsets || !dst_dimensions || !src_dimensions) {
608 DP("Call to omp_target_memcpy_rect with invalid arguments\n");
609 return OFFLOAD_FAIL;
610 }
611
612 int rc;
613 if (num_dims == 1) {
614 rc = omp_target_memcpy(dst, src, element_size * volume[0],
615 element_size * dst_offsets[0], element_size * src_offsets[0],
616 dst_device, src_device);
617 } else {
618 size_t dst_slice_size = element_size;
619 size_t src_slice_size = element_size;
620 for (int i=1; i<num_dims; ++i) {
621 dst_slice_size *= dst_dimensions[i];
622 src_slice_size *= src_dimensions[i];
623 }
624
625 size_t dst_off = dst_offsets[0] * dst_slice_size;
626 size_t src_off = src_offsets[0] * src_slice_size;
627 for (size_t i=0; i<volume[0]; ++i) {
628 rc = omp_target_memcpy_rect((char *) dst + dst_off + dst_slice_size * i,
629 (char *) src + src_off + src_slice_size * i, element_size,
630 num_dims - 1, volume + 1, dst_offsets + 1, src_offsets + 1,
631 dst_dimensions + 1, src_dimensions + 1, dst_device, src_device);
632
633 if (rc) {
634 DP("Recursive call to omp_target_memcpy_rect returns unsuccessfully\n");
635 return rc;
636 }
637 }
638 }
639
640 DP("omp_target_memcpy_rect returns %d\n", rc);
641 return rc;
642}
643
644EXTERN int omp_target_associate_ptr(void *host_ptr, void *device_ptr,
645 size_t size, size_t device_offset, int device_num) {
646 DP("Call to omp_target_associate_ptr with host_ptr " DPxMOD ", "
647 "device_ptr " DPxMOD ", size %zu, device_offset %zu, device_num %d\n",
648 DPxPTR(host_ptr), DPxPTR(device_ptr), size, device_offset, device_num);
649
650 if (!host_ptr || !device_ptr || size <= 0) {
651 DP("Call to omp_target_associate_ptr with invalid arguments\n");
652 return OFFLOAD_FAIL;
653 }
654
655 if (device_num == omp_get_initial_device()) {
656 DP("omp_target_associate_ptr: no association possible on the host\n");
657 return OFFLOAD_FAIL;
658 }
659
660 if (!device_is_ready(device_num)) {
661 DP("omp_target_associate_ptr returns OFFLOAD_FAIL\n");
662 return OFFLOAD_FAIL;
663 }
664
665 DeviceTy& Device = Devices[device_num];
666 void *device_addr = (void *)((uint64_t)device_ptr + (uint64_t)device_offset);
667 int rc = Device.associatePtr(host_ptr, device_addr, size);
668 DP("omp_target_associate_ptr returns %d\n", rc);
669 return rc;
670}
671
672EXTERN int omp_target_disassociate_ptr(void *host_ptr, int device_num) {
673 DP("Call to omp_target_disassociate_ptr with host_ptr " DPxMOD ", "
674 "device_num %d\n", DPxPTR(host_ptr), device_num);
675
676 if (!host_ptr) {
677 DP("Call to omp_target_associate_ptr with invalid host_ptr\n");
678 return OFFLOAD_FAIL;
679 }
680
681 if (device_num == omp_get_initial_device()) {
682 DP("omp_target_disassociate_ptr: no association possible on the host\n");
683 return OFFLOAD_FAIL;
684 }
685
686 if (!device_is_ready(device_num)) {
687 DP("omp_target_disassociate_ptr returns OFFLOAD_FAIL\n");
688 return OFFLOAD_FAIL;
689 }
690
691 DeviceTy& Device = Devices[device_num];
692 int rc = Device.disassociatePtr(host_ptr);
693 DP("omp_target_disassociate_ptr returns %d\n", rc);
694 return rc;
695}
696
697////////////////////////////////////////////////////////////////////////////////
698// functionality for device
699
700int DeviceTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size) {
701 DataMapMtx.lock();
702
703 // Check if entry exists
704 for (auto &HT : HostDataToTargetMap) {
705 if ((uintptr_t)HstPtrBegin == HT.HstPtrBegin) {
706 // Mapping already exists
707 bool isValid = HT.HstPtrBegin == (uintptr_t) HstPtrBegin &&
708 HT.HstPtrEnd == (uintptr_t) HstPtrBegin + Size &&
709 HT.TgtPtrBegin == (uintptr_t) TgtPtrBegin;
710 DataMapMtx.unlock();
711 if (isValid) {
712 DP("Attempt to re-associate the same device ptr+offset with the same "
713 "host ptr, nothing to do\n");
714 return OFFLOAD_SUCCESS;
715 } else {
716 DP("Not allowed to re-associate a different device ptr+offset with the "
717 "same host ptr\n");
718 return OFFLOAD_FAIL;
719 }
720 }
721 }
722
723 // Mapping does not exist, allocate it
724 HostDataToTargetTy newEntry;
725
726 // Set up missing fields
727 newEntry.HstPtrBase = (uintptr_t) HstPtrBegin;
728 newEntry.HstPtrBegin = (uintptr_t) HstPtrBegin;
729 newEntry.HstPtrEnd = (uintptr_t) HstPtrBegin + Size;
730 newEntry.TgtPtrBegin = (uintptr_t) TgtPtrBegin;
731 // refCount must be infinite
732 newEntry.RefCount = INF_REF_CNT;
733
734 DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", HstEnd="
735 DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(newEntry.HstPtrBase),
736 DPxPTR(newEntry.HstPtrBegin), DPxPTR(newEntry.HstPtrEnd),
737 DPxPTR(newEntry.TgtPtrBegin));
738 HostDataToTargetMap.push_front(newEntry);
739
740 DataMapMtx.unlock();
741
742 return OFFLOAD_SUCCESS;
743}
744
745int DeviceTy::disassociatePtr(void *HstPtrBegin) {
746 DataMapMtx.lock();
747
748 // Check if entry exists
749 for (HostDataToTargetListTy::iterator ii = HostDataToTargetMap.begin();
750 ii != HostDataToTargetMap.end(); ++ii) {
751 if ((uintptr_t)HstPtrBegin == ii->HstPtrBegin) {
752 // Mapping exists
753 if (CONSIDERED_INF(ii->RefCount)) {
754 DP("Association found, removing it\n");
755 HostDataToTargetMap.erase(ii);
756 DataMapMtx.unlock();
757 return OFFLOAD_SUCCESS;
758 } else {
759 DP("Trying to disassociate a pointer which was not mapped via "
760 "omp_target_associate_ptr\n");
761 break;
762 }
763 }
764 }
765
766 // Mapping not found
767 DataMapMtx.unlock();
768 DP("Association not found\n");
769 return OFFLOAD_FAIL;
770}
771
772// Get ref count of map entry containing HstPtrBegin
773long DeviceTy::getMapEntryRefCnt(void *HstPtrBegin) {
774 uintptr_t hp = (uintptr_t)HstPtrBegin;
775 long RefCnt = -1;
776
777 DataMapMtx.lock();
778 for (auto &HT : HostDataToTargetMap) {
779 if (hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd) {
780 DP("DeviceTy::getMapEntry: requested entry found\n");
781 RefCnt = HT.RefCount;
782 break;
783 }
784 }
785 DataMapMtx.unlock();
786
787 if (RefCnt < 0) {
788 DP("DeviceTy::getMapEntry: requested entry not found\n");
789 }
790
791 return RefCnt;
792}
793
794LookupResult DeviceTy::lookupMapping(void *HstPtrBegin, int64_t Size) {
795 uintptr_t hp = (uintptr_t)HstPtrBegin;
796 LookupResult lr;
797
798 DP("Looking up mapping(HstPtrBegin=" DPxMOD ", Size=%ld)...\n", DPxPTR(hp),
799 Size);
800 for (lr.Entry = HostDataToTargetMap.begin();
801 lr.Entry != HostDataToTargetMap.end(); ++lr.Entry) {
802 auto &HT = *lr.Entry;
803 // Is it contained?
804 lr.Flags.IsContained = hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd &&
805 (hp+Size) <= HT.HstPtrEnd;
806 // Does it extend into an already mapped region?
807 lr.Flags.ExtendsBefore = hp < HT.HstPtrBegin && (hp+Size) > HT.HstPtrBegin;
808 // Does it extend beyond the mapped region?
809 lr.Flags.ExtendsAfter = hp < HT.HstPtrEnd && (hp+Size) > HT.HstPtrEnd;
810
811 if (lr.Flags.IsContained || lr.Flags.ExtendsBefore ||
812 lr.Flags.ExtendsAfter) {
813 break;
814 }
815 }
816
817 if (lr.Flags.ExtendsBefore) {
818 DP("WARNING: Pointer is not mapped but section extends into already "
819 "mapped data\n");
820 }
821 if (lr.Flags.ExtendsAfter) {
822 DP("WARNING: Pointer is already mapped but section extends beyond mapped "
823 "region\n");
824 }
825
826 return lr;
827}
828
829// Used by target_data_begin
830// Return the target pointer begin (where the data will be moved).
831// Allocate memory if this is the first occurrence if this mapping.
832// Increment the reference counter.
833// If NULL is returned, then either data allocation failed or the user tried
834// to do an illegal mapping.
835void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase,
836 int64_t Size, bool &IsNew, bool IsImplicit, bool UpdateRefCount) {
837 void *rc = NULL;
838 DataMapMtx.lock();
839 LookupResult lr = lookupMapping(HstPtrBegin, Size);
840
841 // Check if the pointer is contained.
842 if (lr.Flags.IsContained ||
843 ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && IsImplicit)) {
844 auto &HT = *lr.Entry;
845 IsNew = false;
846
847 if (UpdateRefCount)
848 ++HT.RefCount;
849
850 uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin);
851 DP("Mapping exists%s with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", "
852 "Size=%ld,%s RefCount=%s\n", (IsImplicit ? " (implicit)" : ""),
853 DPxPTR(HstPtrBegin), DPxPTR(tp), Size,
854 (UpdateRefCount ? " updated" : ""),
855 (CONSIDERED_INF(HT.RefCount)) ? "INF" :
856 std::to_string(HT.RefCount).c_str());
857 rc = (void *)tp;
858 } else if ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && !IsImplicit) {
859 // Explicit extension of mapped data - not allowed.
860 DP("Explicit extension of mapping is not allowed.\n");
861 } else if (Size) {
862 // If it is not contained and Size > 0 we should create a new entry for it.
863 IsNew = true;
864 uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size);
865 DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", "
866 "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase),
867 DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp));
868 HostDataToTargetMap.push_front(HostDataToTargetTy((uintptr_t)HstPtrBase,
869 (uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, tp));
870 rc = (void *)tp;
871 }
872
873 DataMapMtx.unlock();
874 return rc;
875}
876
877// Used by target_data_begin, target_data_end, target_data_update and target.
878// Return the target pointer begin (where the data will be moved).
879// Decrement the reference counter if called from target_data_end.
880void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
881 bool UpdateRefCount) {
882 void *rc = NULL;
883 DataMapMtx.lock();
884 LookupResult lr = lookupMapping(HstPtrBegin, Size);
885
886 if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) {
887 auto &HT = *lr.Entry;
888 IsLast = !(HT.RefCount > 1);
889
890 if (HT.RefCount > 1 && UpdateRefCount)
891 --HT.RefCount;
892
893 uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin);
894 DP("Mapping exists with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", "
895 "Size=%ld,%s RefCount=%s\n", DPxPTR(HstPtrBegin), DPxPTR(tp), Size,
896 (UpdateRefCount ? " updated" : ""),
897 (CONSIDERED_INF(HT.RefCount)) ? "INF" :
898 std::to_string(HT.RefCount).c_str());
899 rc = (void *)tp;
900 } else {
901 IsLast = false;
902 }
903
904 DataMapMtx.unlock();
905 return rc;
906}
907
908// Return the target pointer begin (where the data will be moved).
George Rokosd57681b2017-04-22 11:45:03 +0000909// Lock-free version called when loading global symbols from the fat binary.
George Rokos2467df62017-01-25 21:27:24 +0000910void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size) {
911 uintptr_t hp = (uintptr_t)HstPtrBegin;
912 LookupResult lr = lookupMapping(HstPtrBegin, Size);
913 if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) {
914 auto &HT = *lr.Entry;
915 uintptr_t tp = HT.TgtPtrBegin + (hp - HT.HstPtrBegin);
916 return (void *)tp;
917 }
918
919 return NULL;
920}
921
922int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete) {
923 // Check if the pointer is contained in any sub-nodes.
924 int rc;
925 DataMapMtx.lock();
926 LookupResult lr = lookupMapping(HstPtrBegin, Size);
927 if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) {
928 auto &HT = *lr.Entry;
929 if (ForceDelete)
930 HT.RefCount = 1;
931 if (--HT.RefCount <= 0) {
932 assert(HT.RefCount == 0 && "did not expect a negative ref count");
933 DP("Deleting tgt data " DPxMOD " of size %ld\n",
934 DPxPTR(HT.TgtPtrBegin), Size);
935 RTL->data_delete(RTLDeviceID, (void *)HT.TgtPtrBegin);
936 DP("Removing%s mapping with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD
937 ", Size=%ld\n", (ForceDelete ? " (forced)" : ""),
938 DPxPTR(HT.HstPtrBegin), DPxPTR(HT.TgtPtrBegin), Size);
939 HostDataToTargetMap.erase(lr.Entry);
940 }
941 rc = OFFLOAD_SUCCESS;
942 } else {
943 DP("Section to delete (hst addr " DPxMOD ") does not exist in the allocated"
944 " memory\n", DPxPTR(HstPtrBegin));
945 rc = OFFLOAD_FAIL;
946 }
947
948 DataMapMtx.unlock();
949 return rc;
950}
951
952/// Init device, should not be called directly.
953void DeviceTy::init() {
954 int32_t rc = RTL->init_device(RTLDeviceID);
955 if (rc == OFFLOAD_SUCCESS) {
956 IsInit = true;
957 }
958}
959
960/// Thread-safe method to initialize the device only once.
961int32_t DeviceTy::initOnce() {
962 std::call_once(InitFlag, &DeviceTy::init, this);
963
964 // At this point, if IsInit is true, then either this thread or some other
965 // thread in the past successfully initialized the device, so we can return
966 // OFFLOAD_SUCCESS. If this thread executed init() via call_once() and it
967 // failed, return OFFLOAD_FAIL. If call_once did not invoke init(), it means
968 // that some other thread already attempted to execute init() and if IsInit
969 // is still false, return OFFLOAD_FAIL.
970 if (IsInit)
971 return OFFLOAD_SUCCESS;
972 else
973 return OFFLOAD_FAIL;
974}
975
976// Load binary to device.
977__tgt_target_table *DeviceTy::load_binary(void *Img) {
978 RTL->Mtx.lock();
979 __tgt_target_table *rc = RTL->load_binary(RTLDeviceID, Img);
980 RTL->Mtx.unlock();
981 return rc;
982}
983
984// Submit data to device.
985int32_t DeviceTy::data_submit(void *TgtPtrBegin, void *HstPtrBegin,
986 int64_t Size) {
987 return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size);
988}
989
990// Retrieve data from device.
991int32_t DeviceTy::data_retrieve(void *HstPtrBegin, void *TgtPtrBegin,
992 int64_t Size) {
993 return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size);
994}
995
996// Run region on device
997int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr,
998 int32_t TgtVarsSize) {
999 return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtVarsSize);
1000}
1001
1002// Run team region on device.
1003int32_t DeviceTy::run_team_region(void *TgtEntryPtr, void **TgtVarsPtr,
1004 int32_t TgtVarsSize, int32_t NumTeams, int32_t ThreadLimit,
1005 uint64_t LoopTripCount) {
1006 return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtVarsSize,
1007 NumTeams, ThreadLimit, LoopTripCount);
1008}
1009
1010////////////////////////////////////////////////////////////////////////////////
1011// Functionality for registering libs
1012
1013static void RegisterImageIntoTranslationTable(TranslationTable &TT,
1014 RTLInfoTy &RTL, __tgt_device_image *image) {
1015
1016 // same size, as when we increase one, we also increase the other.
1017 assert(TT.TargetsTable.size() == TT.TargetsImages.size() &&
1018 "We should have as many images as we have tables!");
1019
1020 // Resize the Targets Table and Images to accommodate the new targets if
1021 // required
1022 unsigned TargetsTableMinimumSize = RTL.Idx + RTL.NumberOfDevices;
1023
1024 if (TT.TargetsTable.size() < TargetsTableMinimumSize) {
1025 TT.TargetsImages.resize(TargetsTableMinimumSize, 0);
1026 TT.TargetsTable.resize(TargetsTableMinimumSize, 0);
1027 }
1028
1029 // Register the image in all devices for this target type.
1030 for (int32_t i = 0; i < RTL.NumberOfDevices; ++i) {
1031 // If we are changing the image we are also invalidating the target table.
1032 if (TT.TargetsImages[RTL.Idx + i] != image) {
1033 TT.TargetsImages[RTL.Idx + i] = image;
1034 TT.TargetsTable[RTL.Idx + i] = 0; // lazy initialization of target table.
1035 }
1036 }
1037}
1038
1039////////////////////////////////////////////////////////////////////////////////
1040// Functionality for registering Ctors/Dtors
1041
// For every device managed by RTL, queue the image's ctor/dtor entries so
// they run once the device is initialized (ctors) or the library is
// unregistered (dtors). Also marks the device as having pending globals.
static void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc,
    __tgt_device_image *img, RTLInfoTy *RTL) {

  for (int32_t i = 0; i < RTL->NumberOfDevices; ++i) {
    DeviceTy &Device = Devices[RTL->Idx + i];
    // PendingGlobalsMtx guards both HasPendingGlobals and PendingCtorsDtors.
    Device.PendingGlobalsMtx.lock();
    Device.HasPendingGlobals = true;
    for (__tgt_offload_entry *entry = img->EntriesBegin;
        entry != img->EntriesEnd; ++entry) {
      if (entry->flags & OMP_DECLARE_TARGET_CTOR) {
        DP("Adding ctor " DPxMOD " to the pending list.\n",
            DPxPTR(entry->addr));
        Device.PendingCtorsDtors[desc].PendingCtors.push_back(entry->addr);
      } else if (entry->flags & OMP_DECLARE_TARGET_DTOR) {
        // Dtors are pushed in reverse order so they are executed from end
        // to beginning when unregistering the library!
        DP("Adding dtor " DPxMOD " to the pending list.\n",
            DPxPTR(entry->addr));
        Device.PendingCtorsDtors[desc].PendingDtors.push_front(entry->addr);
      }

      if (entry->flags & OMP_DECLARE_TARGET_LINK) {
        DP("The \"link\" attribute is not yet supported!\n");
      }
    }
    Device.PendingGlobalsMtx.unlock();
  }
}
1070
1071////////////////////////////////////////////////////////////////////////////////
/// adds a target shared library to the target execution image
/// For each device image in the descriptor, find the first RTL that accepts
/// it, lazily initialize that RTL's devices on first use, register the image
/// in the library's translation table, and queue its ctors/dtors.
EXTERN void __tgt_register_lib(__tgt_bin_desc *desc) {

  // Attempt to load all plugins available in the system.
  RTLs.LoadRTLsOnce();

  RTLsMtx.lock();
  // Register the images with the RTLs that understand them, if any.
  for (int32_t i = 0; i < desc->NumDeviceImages; ++i) {
    // Obtain the image.
    __tgt_device_image *img = &desc->DeviceImages[i];

    RTLInfoTy *FoundRTL = NULL;

    // Scan the RTLs that have associated images until we find one that supports
    // the current image.
    for (auto &R : RTLs.AllRTLs) {
      if (!R.is_valid_binary(img)) {
        DP("Image " DPxMOD " is NOT compatible with RTL %s!\n",
            DPxPTR(img->ImageStart), R.RTLName.c_str());
        continue;
      }

      DP("Image " DPxMOD " is compatible with RTL %s!\n",
          DPxPTR(img->ImageStart), R.RTLName.c_str());

      // If this RTL is not already in use, initialize it.
      if (!R.isUsed) {
        // Initialize the device information for the RTL we are about to use.
        DeviceTy device(&R);

        // Append one DeviceTy per device of this RTL to the global Devices
        // vector; global IDs are assigned contiguously from `start`.
        size_t start = Devices.size();
        Devices.resize(start + R.NumberOfDevices, device);
        for (int32_t device_id = 0; device_id < R.NumberOfDevices;
            device_id++) {
          // global device ID
          Devices[start + device_id].DeviceID = start + device_id;
          // RTL local device ID
          Devices[start + device_id].RTLDeviceID = device_id;

          // Save pointer to device in RTL in case we want to unregister the RTL
          R.Devices.push_back(&Devices[start + device_id]);
        }

        // Initialize the index of this RTL and save it in the used RTLs.
        R.Idx = (RTLs.UsedRTLs.empty())
                    ? 0
                    : RTLs.UsedRTLs.back()->Idx +
                          RTLs.UsedRTLs.back()->NumberOfDevices;
        assert((size_t) R.Idx == start &&
            "RTL index should equal the number of devices used so far.");
        R.isUsed = true;
        RTLs.UsedRTLs.push_back(&R);

        DP("RTL " DPxMOD " has index %d!\n", DPxPTR(R.LibraryHandler), R.Idx);
      }

      // Initialize (if necessary) translation table for this library.
      // The table is keyed by the library's HostEntriesBegin pointer.
      TrlTblMtx.lock();
      if(!HostEntriesBeginToTransTable.count(desc->HostEntriesBegin)){
        TranslationTable &tt =
            HostEntriesBeginToTransTable[desc->HostEntriesBegin];
        tt.HostTable.EntriesBegin = desc->HostEntriesBegin;
        tt.HostTable.EntriesEnd = desc->HostEntriesEnd;
      }

      // Retrieve translation table for this library.
      TranslationTable &TransTable =
          HostEntriesBeginToTransTable[desc->HostEntriesBegin];

      DP("Registering image " DPxMOD " with RTL %s!\n",
          DPxPTR(img->ImageStart), R.RTLName.c_str());
      RegisterImageIntoTranslationTable(TransTable, R, img);
      TrlTblMtx.unlock();
      FoundRTL = &R;

      // Load ctors/dtors for static objects
      RegisterGlobalCtorsDtorsForImage(desc, img, FoundRTL);

      // if an RTL was found we are done - proceed to register the next image
      break;
    }

    if (!FoundRTL) {
      DP("No RTL found for image " DPxMOD "!\n", DPxPTR(img->ImageStart));
    }
  }
  RTLsMtx.unlock();


  DP("Done registering entries!\n");
}
1164
1165////////////////////////////////////////////////////////////////////////////////
/// unloads a target shared library
/// For each device image, find the used RTL that accepts it, run any pending
/// dtors for this descriptor on each of that RTL's devices, then drop the
/// descriptor's entries from the host-pointer map and translation table.
EXTERN void __tgt_unregister_lib(__tgt_bin_desc *desc) {
  DP("Unloading target library!\n");

  RTLsMtx.lock();
  // Find which RTL understands each image, if any.
  for (int32_t i = 0; i < desc->NumDeviceImages; ++i) {
    // Obtain the image.
    __tgt_device_image *img = &desc->DeviceImages[i];

    RTLInfoTy *FoundRTL = NULL;

    // Scan the RTLs that have associated images until we find one that supports
    // the current image. We only need to scan RTLs that are already being used.
    for (auto *R : RTLs.UsedRTLs) {

      assert(R->isUsed && "Expecting used RTLs.");

      if (!R->is_valid_binary(img)) {
        DP("Image " DPxMOD " is NOT compatible with RTL " DPxMOD "!\n",
            DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler));
        continue;
      }

      DP("Image " DPxMOD " is compatible with RTL " DPxMOD "!\n",
          DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler));

      FoundRTL = R;

      // Execute dtors for static objects if the device has been used, i.e.
      // if its PendingCtors list has been emptied.
      for (int32_t i = 0; i < FoundRTL->NumberOfDevices; ++i) {
        DeviceTy &Device = Devices[FoundRTL->Idx + i];
        Device.PendingGlobalsMtx.lock();
        if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) {
          // Dtors were queued in reverse order, so iterating front-to-back
          // runs them opposite to construction order.
          for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) {
            int rc = target(Device.DeviceID, dtor, 0, NULL, NULL, NULL, NULL, 1,
                1, true /*team*/);
            if (rc != OFFLOAD_SUCCESS) {
              DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor));
            }
          }
          // Remove this library's entry from PendingCtorsDtors
          Device.PendingCtorsDtors.erase(desc);
        }
        Device.PendingGlobalsMtx.unlock();
      }

      DP("Unregistered image " DPxMOD " from RTL " DPxMOD "!\n",
          DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler));

      break;
    }

    // if no RTL was found proceed to unregister the next image
    if (!FoundRTL){
      DP("No RTLs in use support the image " DPxMOD "!\n",
          DPxPTR(img->ImageStart));
    }
  }
  RTLsMtx.unlock();
  DP("Done unregistering images!\n");

  // Remove entries from HostPtrToTableMap
  TblMapMtx.lock();
  for (__tgt_offload_entry *cur = desc->HostEntriesBegin;
      cur < desc->HostEntriesEnd; ++cur) {
    HostPtrToTableMap.erase(cur->addr);
  }

  // Remove translation table for this descriptor.
  auto tt = HostEntriesBeginToTransTable.find(desc->HostEntriesBegin);
  if (tt != HostEntriesBeginToTransTable.end()) {
    DP("Removing translation table for descriptor " DPxMOD "\n",
        DPxPTR(desc->HostEntriesBegin));
    HostEntriesBeginToTransTable.erase(tt);
  } else {
    DP("Translation table for descriptor " DPxMOD " cannot be found, probably "
        "it has been already removed.\n", DPxPTR(desc->HostEntriesBegin));
  }

  TblMapMtx.unlock();

  // TODO: Remove RTL and the devices it manages if it's not used anymore?
  // TODO: Write some RTL->unload_image(...) function?

  DP("Done unregistering library!\n");
}
1254
/// Map global data and execute pending ctors
/// For each registered library: load its image on this device, verify the
/// host/target entry tables match, map every global data entry (with an
/// "infinite" refcount so it is never unmapped by data-end), then run the
/// queued ctors. Returns OFFLOAD_SUCCESS or OFFLOAD_FAIL.
/// Lock order used here: PendingGlobalsMtx -> TrlTblMtx -> DataMapMtx.
static int InitLibrary(DeviceTy& Device) {
  /*
   * Map global data
   */
  int32_t device_id = Device.DeviceID;
  int rc = OFFLOAD_SUCCESS;

  Device.PendingGlobalsMtx.lock();
  TrlTblMtx.lock();
  for (HostEntriesBeginToTransTableTy::iterator
      ii = HostEntriesBeginToTransTable.begin();
      ii != HostEntriesBeginToTransTable.end(); ++ii) {
    TranslationTable *TransTable = &ii->second;
    if (TransTable->TargetsTable[device_id] != 0) {
      // Library entries have already been processed
      continue;
    }

    // 1) get image.
    assert(TransTable->TargetsImages.size() > (size_t)device_id &&
           "Not expecting a device ID outside the table's bounds!");
    __tgt_device_image *img = TransTable->TargetsImages[device_id];
    if (!img) {
      DP("No image loaded for device id %d.\n", device_id);
      rc = OFFLOAD_FAIL;
      break;
    }
    // 2) load image into the target table.
    __tgt_target_table *TargetTable =
        TransTable->TargetsTable[device_id] = Device.load_binary(img);
    // Unable to get table for this image: invalidate image and fail.
    if (!TargetTable) {
      DP("Unable to generate entries table for device id %d.\n", device_id);
      TransTable->TargetsImages[device_id] = 0;
      rc = OFFLOAD_FAIL;
      break;
    }

    // Verify whether the two table sizes match.
    size_t hsize =
        TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin;
    size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin;

    // Invalid image for these host entries!
    if (hsize != tsize) {
      DP("Host and Target tables mismatch for device id %d [%zx != %zx].\n",
         device_id, hsize, tsize);
      TransTable->TargetsImages[device_id] = 0;
      TransTable->TargetsTable[device_id] = 0;
      rc = OFFLOAD_FAIL;
      break;
    }

    // process global data that needs to be mapped.
    // Host and target tables are walked in lockstep: entry k of one
    // corresponds to entry k of the other (sizes verified above).
    Device.DataMapMtx.lock();
    __tgt_target_table *HostTable = &TransTable->HostTable;
    for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin,
        *CurrHostEntry = HostTable->EntriesBegin,
        *EntryDeviceEnd = TargetTable->EntriesEnd;
        CurrDeviceEntry != EntryDeviceEnd;
        CurrDeviceEntry++, CurrHostEntry++) {
      if (CurrDeviceEntry->size != 0) {
        // has data.
        assert(CurrDeviceEntry->size == CurrHostEntry->size &&
               "data size mismatch");

        // Fortran may use multiple weak declarations for the same symbol,
        // therefore we must allow for multiple weak symbols to be loaded from
        // the fat binary. Treat these mappings as any other "regular" mapping.
        // Add entry to map.
        if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size))
          continue;
        DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu"
            "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr),
            CurrDeviceEntry->size);
        // INF_REF_CNT keeps global data mapped for the device's lifetime.
        Device.HostDataToTargetMap.push_front(HostDataToTargetTy(
            (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/,
            (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/,
            (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/,
            (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/,
            INF_REF_CNT /*RefCount*/));
      }
    }
    Device.DataMapMtx.unlock();
  }
  TrlTblMtx.unlock();

  if (rc != OFFLOAD_SUCCESS) {
    Device.PendingGlobalsMtx.unlock();
    return rc;
  }

  /*
   * Run ctors for static objects
   */
  if (!Device.PendingCtorsDtors.empty()) {
    // Call all ctors for all libraries registered so far
    for (auto &lib : Device.PendingCtorsDtors) {
      if (!lib.second.PendingCtors.empty()) {
        DP("Has pending ctors... call now\n");
        for (auto &entry : lib.second.PendingCtors) {
          void *ctor = entry;
          int rc = target(device_id, ctor, 0, NULL, NULL, NULL,
                          NULL, 1, 1, true /*team*/);
          if (rc != OFFLOAD_SUCCESS) {
            DP("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor));
            Device.PendingGlobalsMtx.unlock();
            return OFFLOAD_FAIL;
          }
        }
        // Clear the list to indicate that this device has been used
        lib.second.PendingCtors.clear();
        DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first));
      }
    }
  }
  Device.HasPendingGlobals = false;
  Device.PendingGlobalsMtx.unlock();

  return OFFLOAD_SUCCESS;
}
1377
1378// Check whether a device has been initialized, global ctors have been
1379// executed and global data has been mapped; do so if not already done.
1380static int CheckDevice(int32_t device_id) {
1381 // Is device ready?
1382 if (!device_is_ready(device_id)) {
1383 DP("Device %d is not ready.\n", device_id);
1384 return OFFLOAD_FAIL;
1385 }
1386
1387 // Get device info.
1388 DeviceTy &Device = Devices[device_id];
1389
1390 // Check whether global data has been mapped for this device
1391 Device.PendingGlobalsMtx.lock();
1392 bool hasPendingGlobals = Device.HasPendingGlobals;
1393 Device.PendingGlobalsMtx.unlock();
1394 if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) {
1395 DP("Failed to init globals on device %d\n", device_id);
1396 return OFFLOAD_FAIL;
1397 }
1398
1399 return OFFLOAD_SUCCESS;
1400}
1401
// Following datatypes and functions (tgt_oldmap_type, combined_entry_t,
// translate_map, cleanup_map) will be removed once the compiler starts using
// the new map types.

// Old map types
// Bit flags emitted by older compilers for each map-clause entry; they are
// converted to the new OMP_TGT_MAPTYPE_* encoding by translate_map().
enum tgt_oldmap_type {
  OMP_TGT_OLDMAPTYPE_TO = 0x001, // copy data from host to device
  OMP_TGT_OLDMAPTYPE_FROM = 0x002, // copy data from device to host
  OMP_TGT_OLDMAPTYPE_ALWAYS = 0x004, // copy regardless of the ref. count
  OMP_TGT_OLDMAPTYPE_DELETE = 0x008, // force unmapping of data
  OMP_TGT_OLDMAPTYPE_MAP_PTR = 0x010, // map pointer as well as pointee
  OMP_TGT_OLDMAPTYPE_FIRST_MAP = 0x020, // first occurrence of mapped variable
  OMP_TGT_OLDMAPTYPE_RETURN_PTR = 0x040, // return TgtBase addr of mapped data
  OMP_TGT_OLDMAPTYPE_PRIVATE_PTR = 0x080, // private variable - not mapped
  OMP_TGT_OLDMAPTYPE_PRIVATE_VAL = 0x100 // copy by value - not mapped
};
1418
// Temporary functions for map translation and cleanup
struct combined_entry_t {
  int num_members; // number of members in combined entry
  void *base_addr; // base address of combined entry
  void *begin_addr; // begin address of combined entry
  void *end_addr; // end address of combined entry (non-inclusive)
};
1426
// Translate old-style map entries to the new format: detect entries that
// share a base address (or are pointers to other entries), merge them into
// combined "parent" entries, and emit a new argument list where members
// carry a MEMBER_OF reference to their parent. The new arrays are
// malloc'ed; cleanup_map() must be called afterwards to free them and
// restore args_base.
static void translate_map(int32_t arg_num, void **args_base, void **args,
    int64_t *arg_sizes, int32_t *arg_types, int32_t &new_arg_num,
    void **&new_args_base, void **&new_args, int64_t *&new_arg_sizes,
    int64_t *&new_arg_types, bool is_target_construct) {
  if (arg_num <= 0) {
    DP("Nothing to translate\n");
    new_arg_num = 0;
    return;
  }

  // array of combined entries
  combined_entry_t *cmb_entries =
      (combined_entry_t *) alloca(arg_num * sizeof(combined_entry_t));
  // number of combined entries
  long num_combined = 0;
  // old entry is MAP_PTR?
  bool *is_ptr_old = (bool *) alloca(arg_num * sizeof(bool));
  // old entry is member of member_of[old] cmb_entry
  int *member_of = (int *) alloca(arg_num * sizeof(int));
  // temporary storage for modifications of the original arg_types
  int32_t *mod_arg_types = (int32_t *) alloca(arg_num *sizeof(int32_t));

  DP("Translating %d map entries\n", arg_num);
  for (int i = 0; i < arg_num; ++i) {
    member_of[i] = -1;
    is_ptr_old[i] = false;
    mod_arg_types[i] = arg_types[i];
    // Scan previous entries to see whether this entry shares the same base
    for (int j = 0; j < i; ++j) {
      void *new_begin_addr = NULL;
      void *new_end_addr = NULL;

      if (mod_arg_types[i] & OMP_TGT_OLDMAPTYPE_MAP_PTR) {
        if (args_base[i] == args[j]) {
          if (!(mod_arg_types[j] & OMP_TGT_OLDMAPTYPE_MAP_PTR)) {
            DP("Entry %d has the same base as entry %d's begin address\n", i,
                j);
            new_begin_addr = args_base[i];
            new_end_addr = (char *)args_base[i] + sizeof(void *);
            assert(arg_sizes[j] == sizeof(void *));
            is_ptr_old[j] = true;
          } else {
            DP("Entry %d has the same base as entry %d's begin address, but "
                "%d's base was a MAP_PTR too\n", i, j, j);
            int32_t to_from_always_delete =
                OMP_TGT_OLDMAPTYPE_TO | OMP_TGT_OLDMAPTYPE_FROM |
                OMP_TGT_OLDMAPTYPE_ALWAYS | OMP_TGT_OLDMAPTYPE_DELETE;
            if (mod_arg_types[j] & to_from_always_delete) {
              DP("Resetting to/from/always/delete flags for entry %d because "
                  "it is only a pointer to pointer\n", j);
              mod_arg_types[j] &= ~to_from_always_delete;
            }
          }
        }
      } else {
        if (!(mod_arg_types[i] & OMP_TGT_OLDMAPTYPE_FIRST_MAP) &&
            args_base[i] == args_base[j]) {
          DP("Entry %d has the same base address as entry %d\n", i, j);
          new_begin_addr = args[i];
          new_end_addr = (char *)args[i] + arg_sizes[i];
        }
      }

      // If we have combined the entry with a previous one
      if (new_begin_addr) {
        int id;
        if(member_of[j] == -1) {
          // We have a new entry
          id = num_combined++;
          DP("Creating new combined entry %d for old entry %d\n", id, j);
          // Initialize new entry
          cmb_entries[id].num_members = 1;
          cmb_entries[id].base_addr = args_base[j];
          if (mod_arg_types[j] & OMP_TGT_OLDMAPTYPE_MAP_PTR) {
            cmb_entries[id].begin_addr = args_base[j];
            cmb_entries[id].end_addr = (char *)args_base[j] + arg_sizes[j];
          } else {
            cmb_entries[id].begin_addr = args[j];
            cmb_entries[id].end_addr = (char *)args[j] + arg_sizes[j];
          }
          member_of[j] = id;
        } else {
          // Reuse existing combined entry
          DP("Reusing existing combined entry %d\n", member_of[j]);
          id = member_of[j];
        }

        // Update combined entry
        // Widen the combined region to cover this member as well.
        DP("Adding entry %d to combined entry %d\n", i, id);
        cmb_entries[id].num_members++;
        // base_addr stays the same
        cmb_entries[id].begin_addr =
            std::min(cmb_entries[id].begin_addr, new_begin_addr);
        cmb_entries[id].end_addr =
            std::max(cmb_entries[id].end_addr, new_end_addr);
        member_of[i] = id;
        break;
      }
    }
  }

  DP("New entries: %ld combined + %d original\n", num_combined, arg_num);
  new_arg_num = arg_num + num_combined;
  new_args_base = (void **) malloc(new_arg_num * sizeof(void *));
  new_args = (void **) malloc(new_arg_num * sizeof(void *));
  new_arg_sizes = (int64_t *) malloc(new_arg_num * sizeof(int64_t));
  new_arg_types = (int64_t *) malloc(new_arg_num * sizeof(int64_t));

  const int64_t alignment = 8;

  int next_id = 0; // next ID
  int next_cid = 0; // next combined ID
  int *combined_to_new_id = (int *) alloca(num_combined * sizeof(int));
  for (int i = 0; i < arg_num; ++i) {
    // It is member_of
    // Each combined entry is emitted right before its first member, so the
    // comparison with next_cid fires exactly once per combined entry.
    if (member_of[i] == next_cid) {
      int cid = next_cid++; // ID of this combined entry
      int nid = next_id++; // ID of the new (global) entry
      combined_to_new_id[cid] = nid;
      DP("Combined entry %3d will become new entry %3d\n", cid, nid);

      // Align the combined region's begin address down to `alignment`.
      int64_t padding = (int64_t)cmb_entries[cid].begin_addr % alignment;
      if (padding) {
        DP("Using a padding of %" PRId64 " for begin address " DPxMOD "\n",
            padding, DPxPTR(cmb_entries[cid].begin_addr));
        cmb_entries[cid].begin_addr =
            (char *)cmb_entries[cid].begin_addr - padding;
      }

      new_args_base[nid] = cmb_entries[cid].base_addr;
      new_args[nid] = cmb_entries[cid].begin_addr;
      new_arg_sizes[nid] = (int64_t) ((char *)cmb_entries[cid].end_addr -
          (char *)cmb_entries[cid].begin_addr);
      new_arg_types[nid] = OMP_TGT_MAPTYPE_TARGET_PARAM;
      DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", "
          "size %" PRId64 ", type 0x%" PRIx64 "\n", nid,
          DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid],
          new_arg_types[nid]);
    } else if (member_of[i] != -1) {
      DP("Combined entry %3d has been encountered before, do nothing\n",
          member_of[i]);
    }

    // Now that the combined entry (the one the old entry was a member of) has
    // been inserted into the new arguments list, proceed with the old entry.
    int nid = next_id++;
    DP("Old entry %3d will become new entry %3d\n", i, nid);

    new_args_base[nid] = args_base[i];
    new_args[nid] = args[i];
    new_arg_sizes[nid] = arg_sizes[i];
    int64_t old_type = mod_arg_types[i];

    if (is_ptr_old[i]) {
      // Reset TO and FROM flags
      old_type &= ~(OMP_TGT_OLDMAPTYPE_TO | OMP_TGT_OLDMAPTYPE_FROM);
    }

    if (member_of[i] == -1) {
      if (!is_target_construct)
        old_type &= ~OMP_TGT_MAPTYPE_TARGET_PARAM;
      new_arg_types[nid] = old_type;
      DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", size %" PRId64
          ", type 0x%" PRIx64 " (old entry %d not MEMBER_OF)\n", nid,
          DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid],
          new_arg_types[nid], i);
    } else {
      // Old entry is not FIRST_MAP
      old_type &= ~OMP_TGT_OLDMAPTYPE_FIRST_MAP;
      // Add MEMBER_OF
      // The MEMBER_OF field stores parent index + 1 in bits 48..63.
      int new_member_of = combined_to_new_id[member_of[i]];
      old_type |= ((int64_t)new_member_of + 1) << 48;
      new_arg_types[nid] = old_type;
      DP("Entry %3d: base_addr " DPxMOD ", begin_addr " DPxMOD ", size %" PRId64
          ", type 0x%" PRIx64 " (old entry %d MEMBER_OF %d)\n", nid,
          DPxPTR(new_args_base[nid]), DPxPTR(new_args[nid]), new_arg_sizes[nid],
          new_arg_types[nid], i, new_member_of);
    }
  }
}
1607
// Free the temporary argument arrays created by translate_map and restore
// the original base addresses, which occupy the tail of new_args_base.
static void cleanup_map(int32_t new_arg_num, void **new_args_base,
    void **new_args, int64_t *new_arg_sizes, int64_t *new_arg_types,
    int32_t arg_num, void **args_base) {
  if (new_arg_num <= 0)
    return;

  // The original entries are the last arg_num slots of the new arrays.
  int32_t offset = new_arg_num - arg_num;
  for (int32_t i = 0; i < arg_num; ++i)
    args_base[i] = new_args_base[i + offset];

  free(new_args_base);
  free(new_args);
  free(new_arg_sizes);
  free(new_arg_types);
}
1623
1624static short member_of(int64_t type) {
1625 return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1;
1626}
1627
/// Internal function to do the mapping and transfer the data to the device
/// For each map entry: allocate/look up device memory (and the pointer's own
/// device copy for PTR_AND_OBJ entries), copy host data for TO entries, and
/// patch device-side pointers, recording them in the shadow-pointer map.
/// Returns OFFLOAD_SUCCESS, or OFFLOAD_FAIL if any transfer failed.
static int target_data_begin(DeviceTy &Device, int32_t arg_num,
    void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) {
  // process each input.
  int rc = OFFLOAD_SUCCESS;
  for (int32_t i = 0; i < arg_num; ++i) {
    // Ignore private variables and arrays - there is no mapping for them.
    if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
        (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
      continue;

    void *HstPtrBegin = args[i];
    void *HstPtrBase = args_base[i];
    // Address of pointer on the host and device, respectively.
    void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin;
    bool IsNew, Pointer_IsNew;
    bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
    // MEMBER_OF entries must not bump the refcount themselves; the parent
    // entry owns the reference.
    bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF);
    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
      DP("Has a pointer entry: \n");
      // base is address of pointer.
      Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase,
          sizeof(void *), Pointer_IsNew, IsImplicit, UpdateRef);
      if (!Pointer_TgtPtrBegin) {
        DP("Call to getOrAllocTgtPtr returned null pointer (device failure or "
            "illegal mapping).\n");
      }
      DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new"
          "\n", sizeof(void *), DPxPTR(Pointer_TgtPtrBegin),
          (Pointer_IsNew ? "" : " not"));
      Pointer_HstPtrBegin = HstPtrBase;
      // modify current entry.
      // From here on, HstPtrBase is the pointee's base (the pointer's value).
      HstPtrBase = *(void **)HstPtrBase;
      UpdateRef = true; // subsequently update ref count of pointee
    }

    void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase,
        arg_sizes[i], IsNew, IsImplicit, UpdateRef);
    if (!TgtPtrBegin && arg_sizes[i]) {
      // If arg_sizes[i]==0, then the argument is a pointer to NULL, so
      // getOrAlloc() returning NULL is not an error.
      DP("Call to getOrAllocTgtPtr returned null pointer (device failure or "
          "illegal mapping).\n");
    }
    DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
        " - is%s new\n", arg_sizes[i], DPxPTR(TgtPtrBegin),
        (IsNew ? "" : " not"));

    if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) {
      // Report the device address back to the caller through args_base.
      void *ret_ptr;
      if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)
        ret_ptr = Pointer_TgtPtrBegin;
      else {
        bool IsLast; // not used
        ret_ptr = Device.getTgtPtrBegin(HstPtrBegin, 0, IsLast, false);
      }

      DP("Returning device pointer " DPxMOD "\n", DPxPTR(ret_ptr));
      args_base[i] = ret_ptr;
    }

    if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
      bool copy = false;
      if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) {
        copy = true;
      } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) {
        // Copy data only if the "parent" struct has RefCount==1.
        short parent_idx = member_of(arg_types[i]);
        long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
        assert(parent_rc > 0 && "parent struct not found");
        if (parent_rc == 1) {
          copy = true;
        }
      }

      if (copy) {
        DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
            arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
        int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]);
        if (rt != OFFLOAD_SUCCESS) {
          DP("Copying data to device failed.\n");
          rc = OFFLOAD_FAIL;
        }
      }
    }

    if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
      // Make the device-side pointer point at the pointee's device copy.
      DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
          DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin));
      uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
      void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta);
      int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase,
          sizeof(void *));
      if (rt != OFFLOAD_SUCCESS) {
        DP("Copying data to device failed.\n");
        rc = OFFLOAD_FAIL;
      }
      // create shadow pointers for this entry
      Device.ShadowMtx.lock();
      Device.ShadowPtrMap[Pointer_HstPtrBegin] = {HstPtrBase,
          Pointer_TgtPtrBegin, TgtPtrBase};
      Device.ShadowMtx.unlock();
    }
  }

  return rc;
}
1735
1736EXTERN void __tgt_target_data_begin_nowait(int32_t device_id, int32_t arg_num,
1737 void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types,
1738 int32_t depNum, void *depList, int32_t noAliasDepNum,
1739 void *noAliasDepList) {
1740 if (depNum + noAliasDepNum > 0)
1741 __kmpc_omp_taskwait(NULL, 0);
1742
1743 __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes,
1744 arg_types);
1745}
1746
1747/// creates host-to-target data mapping, stores it in the
1748/// libomptarget.so internal structure (an entry in a stack of data maps)
1749/// and passes the data to the device.
1750EXTERN void __tgt_target_data_begin(int32_t device_id, int32_t arg_num,
1751 void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) {
1752 DP("Entering data begin region for device %d with %d mappings\n", device_id,
1753 arg_num);
1754
1755 // No devices available?
1756 if (device_id == OFFLOAD_DEVICE_DEFAULT) {
1757 device_id = omp_get_default_device();
1758 DP("Use default device id %d\n", device_id);
1759 }
1760
1761 if (CheckDevice(device_id) != OFFLOAD_SUCCESS) {
1762 DP("Failed to get device %d ready\n", device_id);
1763 return;
1764 }
1765
1766 DeviceTy& Device = Devices[device_id];
1767
1768 // Translate maps
1769 int32_t new_arg_num;
1770 void **new_args_base;
1771 void **new_args;
1772 int64_t *new_arg_sizes;
1773 int64_t *new_arg_types;
1774 translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num,
1775 new_args_base, new_args, new_arg_sizes, new_arg_types, false);
1776
1777 //target_data_begin(Device, arg_num, args_base, args, arg_sizes, arg_types);
1778 target_data_begin(Device, new_arg_num, new_args_base, new_args, new_arg_sizes,
1779 new_arg_types);
1780
1781 // Cleanup translation memory
1782 cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes,
1783 new_arg_types, arg_num, args_base);
1784}
1785
1786/// Internal function to undo the mapping and retrieve the data from the device.
1787static int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base,
1788 void **args, int64_t *arg_sizes, int64_t *arg_types) {
1789 int rc = OFFLOAD_SUCCESS;
1790 // process each input.
1791 for (int32_t i = arg_num - 1; i >= 0; --i) {
1792 // Ignore private variables and arrays - there is no mapping for them.
1793 // Also, ignore the use_device_ptr directive, it has no effect here.
1794 if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
1795 (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
1796 continue;
1797
1798 void *HstPtrBegin = args[i];
1799 bool IsLast;
1800 bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
1801 (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ);
1802 bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE;
1803
1804 // If PTR_AND_OBJ, HstPtrBegin is address of pointee
1805 void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast,
1806 UpdateRef);
1807 DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
1808 " - is%s last\n", arg_sizes[i], DPxPTR(TgtPtrBegin),
1809 (IsLast ? "" : " not"));
1810
George Rokos15a6e7d2017-02-15 20:45:37 +00001811 bool DelEntry = IsLast || ForceDelete;
1812
George Rokos2467df62017-01-25 21:27:24 +00001813 if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
1814 !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
George Rokos15a6e7d2017-02-15 20:45:37 +00001815 DelEntry = false; // protect parent struct from being deallocated
George Rokos2467df62017-01-25 21:27:24 +00001816 }
1817
George Rokos2467df62017-01-25 21:27:24 +00001818 if ((arg_types[i] & OMP_TGT_MAPTYPE_FROM) || DelEntry) {
1819 // Move data back to the host
1820 if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
1821 bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
1822 bool CopyMember = false;
1823 if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
1824 !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
1825 // Copy data only if the "parent" struct has RefCount==1.
1826 short parent_idx = member_of(arg_types[i]);
1827 long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
1828 assert(parent_rc > 0 && "parent struct not found");
1829 if (parent_rc == 1) {
1830 CopyMember = true;
1831 }
1832 }
1833
1834 if (DelEntry || Always || CopyMember) {
1835 DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
1836 arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
1837 int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, arg_sizes[i]);
1838 if (rt != OFFLOAD_SUCCESS) {
1839 DP("Copying data from device failed.\n");
1840 rc = OFFLOAD_FAIL;
1841 }
1842 }
1843 }
1844
1845 // If we copied back to the host a struct/array containing pointers, we
1846 // need to restore the original host pointer values from their shadow
1847 // copies. If the struct is going to be deallocated, remove any remaining
1848 // shadow pointer entries for this struct.
1849 uintptr_t lb = (uintptr_t) HstPtrBegin;
1850 uintptr_t ub = (uintptr_t) HstPtrBegin + arg_sizes[i];
1851 Device.ShadowMtx.lock();
1852 for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
1853 it != Device.ShadowPtrMap.end(); ++it) {
1854 void **ShadowHstPtrAddr = (void**) it->first;
1855
1856 // An STL map is sorted on its keys; use this property
1857 // to quickly determine when to break out of the loop.
1858 if ((uintptr_t) ShadowHstPtrAddr < lb)
1859 continue;
1860 if ((uintptr_t) ShadowHstPtrAddr >= ub)
1861 break;
1862
1863 // If we copied the struct to the host, we need to restore the pointer.
1864 if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
1865 DP("Restoring original host pointer value " DPxMOD " for host "
1866 "pointer " DPxMOD "\n", DPxPTR(it->second.HstPtrVal),
1867 DPxPTR(ShadowHstPtrAddr));
1868 *ShadowHstPtrAddr = it->second.HstPtrVal;
1869 }
1870 // If the struct is to be deallocated, remove the shadow entry.
1871 if (DelEntry) {
1872 DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr));
1873 Device.ShadowPtrMap.erase(it);
1874 }
1875 }
1876 Device.ShadowMtx.unlock();
1877
1878 // Deallocate map
1879 if (DelEntry) {
1880 int rt = Device.deallocTgtPtr(HstPtrBegin, arg_sizes[i], ForceDelete);
1881 if (rt != OFFLOAD_SUCCESS) {
1882 DP("Deallocating data from device failed.\n");
1883 rc = OFFLOAD_FAIL;
1884 }
1885 }
1886 }
1887 }
1888
1889 return rc;
1890}
1891
1892/// passes data from the target, releases target memory and destroys
1893/// the host-target mapping (top entry from the stack of data maps)
1894/// created by the last __tgt_target_data_begin.
1895EXTERN void __tgt_target_data_end(int32_t device_id, int32_t arg_num,
1896 void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) {
1897 DP("Entering data end region with %d mappings\n", arg_num);
1898
1899 // No devices available?
1900 if (device_id == OFFLOAD_DEVICE_DEFAULT) {
1901 device_id = omp_get_default_device();
1902 }
1903
1904 RTLsMtx.lock();
1905 size_t Devices_size = Devices.size();
1906 RTLsMtx.unlock();
1907 if (Devices_size <= (size_t)device_id) {
1908 DP("Device ID %d does not have a matching RTL.\n", device_id);
1909 return;
1910 }
1911
1912 DeviceTy &Device = Devices[device_id];
1913 if (!Device.IsInit) {
1914 DP("uninit device: ignore");
1915 return;
1916 }
1917
1918 // Translate maps
1919 int32_t new_arg_num;
1920 void **new_args_base;
1921 void **new_args;
1922 int64_t *new_arg_sizes;
1923 int64_t *new_arg_types;
1924 translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num,
1925 new_args_base, new_args, new_arg_sizes, new_arg_types, false);
1926
1927 //target_data_end(Device, arg_num, args_base, args, arg_sizes, arg_types);
1928 target_data_end(Device, new_arg_num, new_args_base, new_args, new_arg_sizes,
1929 new_arg_types);
1930
1931 // Cleanup translation memory
1932 cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes,
1933 new_arg_types, arg_num, args_base);
1934}
1935
1936EXTERN void __tgt_target_data_end_nowait(int32_t device_id, int32_t arg_num,
1937 void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types,
1938 int32_t depNum, void *depList, int32_t noAliasDepNum,
1939 void *noAliasDepList) {
1940 if (depNum + noAliasDepNum > 0)
1941 __kmpc_omp_taskwait(NULL, 0);
1942
1943 __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes,
1944 arg_types);
1945}
1946
1947/// passes data to/from the target.
1948EXTERN void __tgt_target_data_update(int32_t device_id, int32_t arg_num,
1949 void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) {
1950 DP("Entering data update with %d mappings\n", arg_num);
1951
1952 // No devices available?
1953 if (device_id == OFFLOAD_DEVICE_DEFAULT) {
1954 device_id = omp_get_default_device();
1955 }
1956
1957 if (CheckDevice(device_id) != OFFLOAD_SUCCESS) {
1958 DP("Failed to get device %d ready\n", device_id);
1959 return;
1960 }
1961
1962 DeviceTy& Device = Devices[device_id];
1963
1964 // process each input.
1965 for (int32_t i = 0; i < arg_num; ++i) {
1966 if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) ||
1967 (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE))
1968 continue;
1969
1970 void *HstPtrBegin = args[i];
1971 int64_t MapSize = arg_sizes[i];
1972 bool IsLast;
1973 void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast,
1974 false);
1975
1976 if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
1977 DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
1978 arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
1979 Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize);
1980
1981 uintptr_t lb = (uintptr_t) HstPtrBegin;
1982 uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize;
1983 Device.ShadowMtx.lock();
1984 for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
1985 it != Device.ShadowPtrMap.end(); ++it) {
1986 void **ShadowHstPtrAddr = (void**) it->first;
1987 if ((uintptr_t) ShadowHstPtrAddr < lb)
1988 continue;
1989 if ((uintptr_t) ShadowHstPtrAddr >= ub)
1990 break;
1991 DP("Restoring original host pointer value " DPxMOD " for host pointer "
1992 DPxMOD "\n", DPxPTR(it->second.HstPtrVal),
1993 DPxPTR(ShadowHstPtrAddr));
1994 *ShadowHstPtrAddr = it->second.HstPtrVal;
1995 }
1996 Device.ShadowMtx.unlock();
1997 }
1998
1999 if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
2000 DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
2001 arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
2002 Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize);
2003
2004 uintptr_t lb = (uintptr_t) HstPtrBegin;
2005 uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize;
2006 Device.ShadowMtx.lock();
2007 for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin();
2008 it != Device.ShadowPtrMap.end(); ++it) {
2009 void **ShadowHstPtrAddr = (void**) it->first;
2010 if ((uintptr_t) ShadowHstPtrAddr < lb)
2011 continue;
2012 if ((uintptr_t) ShadowHstPtrAddr >= ub)
2013 break;
2014 DP("Restoring original target pointer value " DPxMOD " for target "
2015 "pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal),
2016 DPxPTR(it->second.TgtPtrAddr));
2017 Device.data_submit(it->second.TgtPtrAddr,
2018 &it->second.TgtPtrVal, sizeof(void *));
2019 }
2020 Device.ShadowMtx.unlock();
2021 }
2022 }
2023}
2024
2025EXTERN void __tgt_target_data_update_nowait(
2026 int32_t device_id, int32_t arg_num, void **args_base, void **args,
2027 int64_t *arg_sizes, int32_t *arg_types, int32_t depNum, void *depList,
2028 int32_t noAliasDepNum, void *noAliasDepList) {
2029 if (depNum + noAliasDepNum > 0)
2030 __kmpc_omp_taskwait(NULL, 0);
2031
2032 __tgt_target_data_update(device_id, arg_num, args_base, args, arg_sizes,
2033 arg_types);
2034}
2035
/// performs the same actions as data_begin in case arg_num is
/// non-zero and initiates run of the offloaded region on the target platform;
/// if arg_num is non-zero after the region execution is done it also
/// performs the same action as data_update and data_end above. This function
/// returns 0 if it was able to transfer the execution to a target and an
/// integer different from zero otherwise.
///
/// \param device_id    already-resolved device index into Devices.
/// \param host_ptr     host address identifying the target region entry.
/// \param arg_num      number of map entries in the parallel arrays below.
/// \param IsTeamConstruct non-zero selects run_team_region (teams launch).
static int target(int32_t device_id, void *host_ptr, int32_t arg_num,
    void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types,
    int32_t team_num, int32_t thread_limit, int IsTeamConstruct) {
  DeviceTy &Device = Devices[device_id];

  // Find the table information in the map or look it up in the translation
  // tables. The HostPtrToTableMap acts as a cache over the linear search
  // below.
  TableMap *TM = 0;
  TblMapMtx.lock();
  HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap.find(host_ptr);
  if (TableMapIt == HostPtrToTableMap.end()) {
    // We don't have a map. So search all the registered libraries.
    TrlTblMtx.lock();
    for (HostEntriesBeginToTransTableTy::iterator
             ii = HostEntriesBeginToTransTable.begin(),
             ie = HostEntriesBeginToTransTable.end();
         !TM && ii != ie; ++ii) {
      // get the translation table (which contains all the good info).
      TranslationTable *TransTable = &ii->second;
      // iterate over all the host table entries to see if we can locate the
      // host_ptr.
      __tgt_offload_entry *begin = TransTable->HostTable.EntriesBegin;
      __tgt_offload_entry *end = TransTable->HostTable.EntriesEnd;
      __tgt_offload_entry *cur = begin;
      for (uint32_t i = 0; cur < end; ++cur, ++i) {
        if (cur->addr != host_ptr)
          continue;
        // we got a match, now fill the HostPtrToTableMap so that we
        // may avoid this search next time.
        TM = &HostPtrToTableMap[host_ptr];
        TM->Table = TransTable;
        TM->Index = i;
        break;
      }
    }
    TrlTblMtx.unlock();
  } else {
    TM = &TableMapIt->second;
  }
  TblMapMtx.unlock();

  // No map for this host pointer found!
  if (!TM) {
    DP("Host ptr " DPxMOD " does not have a matching target pointer.\n",
        DPxPTR(host_ptr));
    return OFFLOAD_FAIL;
  }

  // get target table.
  TrlTblMtx.lock();
  assert(TM->Table->TargetsTable.size() > (size_t)device_id &&
      "Not expecting a device ID outside the table's bounds!");
  __tgt_target_table *TargetTable = TM->Table->TargetsTable[device_id];
  TrlTblMtx.unlock();
  // NOTE(review): in release builds this assert compiles away and TargetTable
  // is dereferenced below — confirm it can never be null once the device's
  // binary has been loaded.
  assert(TargetTable && "Global data has not been mapped\n");

  // Move data to device.
  int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes,
      arg_types);

  if (rc != OFFLOAD_SUCCESS) {
    DP("Call to target_data_begin failed, skipping target execution.\n");
    // Call target_data_end to dealloc whatever target_data_begin allocated
    // and return OFFLOAD_FAIL.
    target_data_end(Device, arg_num, args_base, args, arg_sizes, arg_types);
    return OFFLOAD_FAIL;
  }

  // Kernel argument vector, built in map-entry order.
  std::vector<void *> tgt_args;

  // List of (first-)private arrays allocated for this target region
  // (deallocated unconditionally after the launch attempt).
  std::vector<void *> fpArrays;

  for (int32_t i = 0; i < arg_num; ++i) {
    if (!(arg_types[i] & OMP_TGT_MAPTYPE_TARGET_PARAM)) {
      // This is not a target parameter, do not push it into tgt_args.
      continue;
    }
    void *HstPtrBegin = args[i];
    void *HstPtrBase = args_base[i];
    void *TgtPtrBase;
    bool IsLast; // unused.
    if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) {
      // Scalar firstprivate: the "pointer" IS the value; pass it through.
      DP("Forwarding first-private value " DPxMOD " to the target construct\n",
          DPxPTR(HstPtrBase));
      TgtPtrBase = HstPtrBase;
    } else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) {
      // Allocate memory for (first-)private array
      void *TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID,
          arg_sizes[i]);
      if (!TgtPtrBegin) {
        DP ("Data allocation for %sprivate array " DPxMOD " failed\n",
            (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""),
            DPxPTR(HstPtrBegin));
        rc = OFFLOAD_FAIL;
        // On failure, break out; cleanup of fpArrays and mapped data still
        // runs below.
        break;
      } else {
        fpArrays.push_back(TgtPtrBegin);
        // Preserve the begin/base offset of the host argument on the device.
        uint64_t PtrDelta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
        TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - PtrDelta);
        DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for "
            "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n",
            arg_sizes[i], DPxPTR(TgtPtrBegin),
            (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""),
            DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase));
        // If first-private, copy data from host
        if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
          int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i]);
          if (rt != OFFLOAD_SUCCESS) {
            DP ("Copying data to device failed.\n");
            rc = OFFLOAD_FAIL;
            break;
          }
        }
      }
    } else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
      // The argument is the pointer itself; look up its device copy by base.
      void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *),
          IsLast, false);
      TgtPtrBase = TgtPtrBegin; // no offset for ptrs.
      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to "
          "object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase),
          DPxPTR(HstPtrBase));
    } else {
      // Regular mapped argument: translate begin and rebase to the host's
      // begin/base offset.
      void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i],
          IsLast, false);
      uint64_t PtrDelta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
      TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - PtrDelta);
      DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n",
          DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin));
    }
    tgt_args.push_back(TgtPtrBase);
  }
  // Push omp handle.
  tgt_args.push_back((void *)0);

  // Pop loop trip count: stashed by __kmpc_push_target_tripcount, consumed
  // and reset here (not thread-safe, see the note on that function).
  uint64_t ltc = Device.loopTripCnt;
  Device.loopTripCnt = 0;

  // Launch device execution.
  if (rc == OFFLOAD_SUCCESS) {
    DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n",
        TargetTable->EntriesBegin[TM->Index].name,
        DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index);
    if (IsTeamConstruct) {
      rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr,
          &tgt_args[0], tgt_args.size(), team_num, thread_limit, ltc);
    } else {
      rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr,
          &tgt_args[0], tgt_args.size());
    }
  } else {
    DP("Errors occurred while obtaining target arguments, skipping kernel "
        "execution\n");
  }

  // Deallocate (first-)private arrays
  for (auto it : fpArrays) {
    int rt = Device.RTL->data_delete(Device.RTLDeviceID, it);
    if (rt != OFFLOAD_SUCCESS) {
      DP("Deallocation of (first-)private arrays failed.\n");
      rc = OFFLOAD_FAIL;
    }
  }

  // Move data from device.
  int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes,
      arg_types);

  if (rt != OFFLOAD_SUCCESS) {
    DP("Call to target_data_end failed.\n");
    rc = OFFLOAD_FAIL;
  }

  return rc;
}
2218
2219EXTERN int __tgt_target(int32_t device_id, void *host_ptr, int32_t arg_num,
2220 void **args_base, void **args, int64_t *arg_sizes, int32_t *arg_types) {
George Rokos2467df62017-01-25 21:27:24 +00002221 DP("Entering target region with entry point " DPxMOD " and device Id %d\n",
2222 DPxPTR(host_ptr), device_id);
2223
2224 if (device_id == OFFLOAD_DEVICE_DEFAULT) {
2225 device_id = omp_get_default_device();
2226 }
2227
2228 if (CheckDevice(device_id) != OFFLOAD_SUCCESS) {
2229 DP("Failed to get device %d ready\n", device_id);
2230 return OFFLOAD_FAIL;
2231 }
2232
2233 // Translate maps
2234 int32_t new_arg_num;
2235 void **new_args_base;
2236 void **new_args;
2237 int64_t *new_arg_sizes;
2238 int64_t *new_arg_types;
2239 translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num,
2240 new_args_base, new_args, new_arg_sizes, new_arg_types, true);
2241
2242 //return target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
2243 // arg_types, 0, 0, false /*team*/, false /*recursive*/);
2244 int rc = target(device_id, host_ptr, new_arg_num, new_args_base, new_args,
2245 new_arg_sizes, new_arg_types, 0, 0, false /*team*/);
2246
2247 // Cleanup translation memory
2248 cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes,
2249 new_arg_types, arg_num, args_base);
2250
2251 return rc;
2252}
2253
2254EXTERN int __tgt_target_nowait(int32_t device_id, void *host_ptr,
2255 int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes,
2256 int32_t *arg_types, int32_t depNum, void *depList, int32_t noAliasDepNum,
2257 void *noAliasDepList) {
2258 if (depNum + noAliasDepNum > 0)
2259 __kmpc_omp_taskwait(NULL, 0);
2260
2261 return __tgt_target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
2262 arg_types);
2263}
2264
2265EXTERN int __tgt_target_teams(int32_t device_id, void *host_ptr,
2266 int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes,
2267 int32_t *arg_types, int32_t team_num, int32_t thread_limit) {
George Rokos2467df62017-01-25 21:27:24 +00002268 DP("Entering target region with entry point " DPxMOD " and device Id %d\n",
2269 DPxPTR(host_ptr), device_id);
2270
2271 if (device_id == OFFLOAD_DEVICE_DEFAULT) {
2272 device_id = omp_get_default_device();
2273 }
2274
2275 if (CheckDevice(device_id) != OFFLOAD_SUCCESS) {
2276 DP("Failed to get device %d ready\n", device_id);
2277 return OFFLOAD_FAIL;
2278 }
2279
2280 // Translate maps
2281 int32_t new_arg_num;
2282 void **new_args_base;
2283 void **new_args;
2284 int64_t *new_arg_sizes;
2285 int64_t *new_arg_types;
2286 translate_map(arg_num, args_base, args, arg_sizes, arg_types, new_arg_num,
2287 new_args_base, new_args, new_arg_sizes, new_arg_types, true);
2288
2289 //return target(device_id, host_ptr, arg_num, args_base, args, arg_sizes,
2290 // arg_types, team_num, thread_limit, true /*team*/,
2291 // false /*recursive*/);
2292 int rc = target(device_id, host_ptr, new_arg_num, new_args_base, new_args,
2293 new_arg_sizes, new_arg_types, team_num, thread_limit, true /*team*/);
2294
2295 // Cleanup translation memory
2296 cleanup_map(new_arg_num, new_args_base, new_args, new_arg_sizes,
2297 new_arg_types, arg_num, args_base);
2298
2299 return rc;
2300}
2301
2302EXTERN int __tgt_target_teams_nowait(int32_t device_id, void *host_ptr,
2303 int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes,
2304 int32_t *arg_types, int32_t team_num, int32_t thread_limit, int32_t depNum,
2305 void *depList, int32_t noAliasDepNum, void *noAliasDepList) {
2306 if (depNum + noAliasDepNum > 0)
2307 __kmpc_omp_taskwait(NULL, 0);
2308
2309 return __tgt_target_teams(device_id, host_ptr, arg_num, args_base, args,
2310 arg_sizes, arg_types, team_num, thread_limit);
2311}
2312
2313
2314// The trip count mechanism will be revised - this scheme is not thread-safe.
2315EXTERN void __kmpc_push_target_tripcount(int32_t device_id,
2316 uint64_t loop_tripcount) {
2317 if (device_id == OFFLOAD_DEVICE_DEFAULT) {
2318 device_id = omp_get_default_device();
2319 }
2320
2321 if (CheckDevice(device_id) != OFFLOAD_SUCCESS) {
2322 DP("Failed to get device %d ready\n", device_id);
2323 return;
2324 }
2325
2326 DP("__kmpc_push_target_tripcount(%d, %" PRIu64 ")\n", device_id,
2327 loop_tripcount);
2328 Devices[device_id].loopTripCnt = loop_tripcount;
2329}
2330