/*
 * kmp_taskdeps.cpp
 * $Revision: 42539 $
 * $Date: 2013-07-17 11:20:01 -0500 (Wed, 17 Jul 2013) $
 */


//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


//#define KMP_SUPPORT_GRAPH_OUTPUT 1

#include "kmp.h"
#include "kmp_io.h"

#if OMP_40_ENABLED

//TODO: Improve memory allocation? keep a list of pre-allocated structures? allocate in blocks? re-use finished list entries?
//TODO: don't use atomic ref counters for stack-allocated nodes.
//TODO: find an alternative to atomic refs for heap-allocated nodes?
//TODO: Finish graph output support
//TODO: kmp_lock_t seems a tad too big (and heavyweight) for this. Check other runtime locks
//TODO: Any ITT support needed?

#ifdef KMP_SUPPORT_GRAPH_OUTPUT
static kmp_int32 kmp_node_id_seed = 0;
#endif

static void
__kmp_init_node ( kmp_depnode_t *node )
{
    node->dn.task = NULL; // set to null initially, it will point to the right task once dependences have been processed
    node->dn.successors = NULL;
    __kmp_init_lock(&node->dn.lock);
    node->dn.nrefs = 1; // init creates the first reference to the node
#ifdef KMP_SUPPORT_GRAPH_OUTPUT
    node->dn.id = KMP_TEST_THEN_INC32(&kmp_node_id_seed);
#endif
}

static inline kmp_depnode_t *
__kmp_node_ref ( kmp_depnode_t *node )
{
    KMP_TEST_THEN_INC32(&node->dn.nrefs);
    return node;
}

static inline void
__kmp_node_deref ( kmp_info_t *thread, kmp_depnode_t *node )
{
    if (!node) return;

    kmp_int32 n = KMP_TEST_THEN_DEC32(&node->dn.nrefs) - 1;
    if ( n == 0 ) {
        KMP_ASSERT(node->dn.nrefs == 0);
#if USE_FAST_MEMORY
        __kmp_fast_free(thread,node);
#else
        __kmp_thread_free(thread,node);
#endif
    }
}

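// Reference-counting sketch (derived from the code above, not a normative
// statement of the design): __kmp_init_node() gives the creating task the
// single initial reference (nrefs == 1). Every place that records the node --
// a dephash entry's last_out, a last_ins list entry, or another node's
// successors list -- takes an extra reference through __kmp_node_ref(), and
// __kmp_node_deref() frees the node only when the last reference is dropped.
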
#define KMP_ACQUIRE_DEPNODE(gtid,n) __kmp_acquire_lock(&(n)->dn.lock,(gtid))
#define KMP_RELEASE_DEPNODE(gtid,n) __kmp_release_lock(&(n)->dn.lock,(gtid))

static void
__kmp_depnode_list_free ( kmp_info_t *thread, kmp_depnode_list *list );

static const kmp_int32 kmp_dephash_log2 = 6;
static const kmp_int32 kmp_dephash_size = (1 << kmp_dephash_log2);

static inline kmp_int32
__kmp_dephash_hash ( kmp_intptr_t addr )
{
    //TODO alternate to try: set = (((Addr64)(addrUsefulBits * 9.618)) % m_num_sets );
    return ((addr >> kmp_dephash_log2) ^ addr) % kmp_dephash_size;
}

static kmp_dephash_t *
__kmp_dephash_create ( kmp_info_t *thread )
{
    kmp_dephash_t *h;

    kmp_int32 size = kmp_dephash_size * sizeof(kmp_dephash_entry_t) + sizeof(kmp_dephash_t);

#if USE_FAST_MEMORY
    h = (kmp_dephash_t *) __kmp_fast_allocate( thread, size );
#else
    h = (kmp_dephash_t *) __kmp_thread_malloc( thread, size );
#endif

#ifdef KMP_DEBUG
    h->nelements = 0;
    h->nconflicts = 0; // initialize the debug conflict counter used in __kmp_dephash_find
#endif
    h->buckets = (kmp_dephash_entry **)(h+1);

    for ( kmp_int32 i = 0; i < kmp_dephash_size; i++ )
        h->buckets[i] = 0;

    return h;
}

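// Memory layout sketch (illustrative, derived from __kmp_dephash_create above):
// the hash table is a single allocation that holds the kmp_dephash_t header
// followed immediately by the bucket array, so h->buckets points just past the
// header:
//
//   [ kmp_dephash_t ][ bucket 0 | bucket 1 | ... | bucket kmp_dephash_size-1 ]
//   ^ h              ^ h->buckets == (kmp_dephash_entry **)(h + 1)
//
// Note that the size computation reserves kmp_dephash_size full entry structs
// even though each bucket only stores a pointer, so the allocation is larger
// than strictly necessary.
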
static void
__kmp_dephash_free ( kmp_info_t *thread, kmp_dephash_t *h )
{
    for ( kmp_int32 i=0; i < kmp_dephash_size; i++ ) {
        if ( h->buckets[i] ) {
            kmp_dephash_entry_t *next;
            for ( kmp_dephash_entry_t *entry = h->buckets[i]; entry; entry = next ) {
                next = entry->next_in_bucket;
                __kmp_depnode_list_free(thread,entry->last_ins);
                __kmp_node_deref(thread,entry->last_out);
#if USE_FAST_MEMORY
                __kmp_fast_free(thread,entry);
#else
                __kmp_thread_free(thread,entry);
#endif
            }
        }
    }
#if USE_FAST_MEMORY
    __kmp_fast_free(thread,h);
#else
    __kmp_thread_free(thread,h);
#endif
}

static kmp_dephash_entry *
__kmp_dephash_find ( kmp_info_t *thread, kmp_dephash_t *h, kmp_intptr_t addr )
{
    kmp_int32 bucket = __kmp_dephash_hash(addr);

    kmp_dephash_entry_t *entry;
    for ( entry = h->buckets[bucket]; entry; entry = entry->next_in_bucket )
        if ( entry->addr == addr ) break;

    if ( entry == NULL ) {
        // create entry. This is only done by one thread so no locking required
#if USE_FAST_MEMORY
        entry = (kmp_dephash_entry_t *) __kmp_fast_allocate( thread, sizeof(kmp_dephash_entry_t) );
#else
        entry = (kmp_dephash_entry_t *) __kmp_thread_malloc( thread, sizeof(kmp_dephash_entry_t) );
#endif
        entry->addr = addr;
        entry->last_out = NULL;
        entry->last_ins = NULL;
        entry->next_in_bucket = h->buckets[bucket];
        h->buckets[bucket] = entry;
#ifdef KMP_DEBUG
        h->nelements++;
        if ( entry->next_in_bucket ) h->nconflicts++;
#endif
    }
    return entry;
}

static kmp_depnode_list_t *
__kmp_add_node ( kmp_info_t *thread, kmp_depnode_list_t *list, kmp_depnode_t *node )
{
    kmp_depnode_list_t *new_head;

#if USE_FAST_MEMORY
    new_head = (kmp_depnode_list_t *) __kmp_fast_allocate(thread,sizeof(kmp_depnode_list_t));
#else
    new_head = (kmp_depnode_list_t *) __kmp_thread_malloc(thread,sizeof(kmp_depnode_list_t));
#endif

    new_head->node = __kmp_node_ref(node);
    new_head->next = list;

    return new_head;
}

static void
__kmp_depnode_list_free ( kmp_info_t *thread, kmp_depnode_list *list )
{
    kmp_depnode_list *next;

    for ( ; list ; list = next ) {
        next = list->next;

        __kmp_node_deref(thread,list->node);
#if USE_FAST_MEMORY
        __kmp_fast_free(thread,list);
#else
        __kmp_thread_free(thread,list);
#endif
    }
}

static inline void
__kmp_track_dependence ( kmp_depnode_t *source, kmp_depnode_t *sink )
{
#ifdef KMP_SUPPORT_GRAPH_OUTPUT
    kmp_taskdata_t * task_source = KMP_TASK_TO_TASKDATA(source->dn.task);
    kmp_taskdata_t * task_sink = KMP_TASK_TO_TASKDATA(sink->dn.task); // this can be NULL when if(0) ...

    __kmp_printf("%d(%s) -> %d(%s)\n", source->dn.id, task_source->td_ident->psource, sink->dn.id, task_sink->td_ident->psource);
#endif
}

template< bool filter >
static inline kmp_int32
__kmp_process_deps ( kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *hash,
                     bool dep_barrier, kmp_int32 ndeps, kmp_depend_info_t *dep_list)
{
    kmp_info_t *thread = __kmp_threads[ gtid ];
    kmp_int32 npredecessors=0;
    for ( kmp_int32 i = 0; i < ndeps ; i++ ) {
        const kmp_depend_info_t * dep = &dep_list[i];

        KMP_DEBUG_ASSERT(dep->flags.in);

        if ( filter && dep->base_addr == 0 ) continue; // skip filtered entries

        kmp_dephash_entry_t *info = __kmp_dephash_find(thread,hash,dep->base_addr);
        kmp_depnode_t *last_out = info->last_out;

        if ( dep->flags.out && info->last_ins ) {
            for ( kmp_depnode_list_t * p = info->last_ins; p; p = p->next ) {
                kmp_depnode_t * indep = p->node;
                if ( indep->dn.task ) {
                    KMP_ACQUIRE_DEPNODE(gtid,indep);
                    if ( indep->dn.task ) {
                        __kmp_track_dependence(indep,node);
                        indep->dn.successors = __kmp_add_node(thread, indep->dn.successors, node);
                        npredecessors++;
                    }
                    KMP_RELEASE_DEPNODE(gtid,indep);
                }
            }

            __kmp_depnode_list_free(thread,info->last_ins);
            info->last_ins = NULL;

        } else if ( last_out && last_out->dn.task ) {
            KMP_ACQUIRE_DEPNODE(gtid,last_out);
            if ( last_out->dn.task ) {
                __kmp_track_dependence(last_out,node);
                last_out->dn.successors = __kmp_add_node(thread, last_out->dn.successors, node);
                npredecessors++;
            }
            KMP_RELEASE_DEPNODE(gtid,last_out);
        }

        if ( dep_barrier ) {
            // if this is a sync point in the serial sequence, all previous outputs are guaranteed to have completed
            // once this point is passed, so the previous output nodes can be cleared.
            __kmp_node_deref(thread,last_out);
            info->last_out = NULL;
        } else {
            if ( dep->flags.out ) {
                __kmp_node_deref(thread,last_out);
                info->last_out = __kmp_node_ref(node);
            } else
                info->last_ins = __kmp_add_node(thread, info->last_ins, node);
        }

    }
    return npredecessors;
}

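// Dependence rules implemented by __kmp_process_deps (a sketch that follows
// the code above rather than quoting the OpenMP specification):
//   - an out/inout dependence waits for every task in last_ins (all readers
//     registered since the previous writer) or, if there are none, for
//     last_out (the previous writer);
//   - an in dependence waits only for last_out;
//   - afterwards the new node becomes last_out (for out/inout) or is appended
//     to last_ins (for in). So, for example, two tasks with depend(in: x) may
//     run concurrently, while a later depend(out: x) task waits for both.
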
#define NO_DEP_BARRIER (false)
#define DEP_BARRIER (true)

// returns true if the task has any outstanding dependence
static bool
__kmp_check_deps ( kmp_int32 gtid, kmp_depnode_t *node, kmp_task_t *task, kmp_dephash_t *hash, bool dep_barrier,
                   kmp_int32 ndeps, kmp_depend_info_t *dep_list,
                   kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list )
{
    int i;

    // Filter deps in dep_list
    // TODO: Different algorithm for large dep_list ( > 10 ? )
    for ( i = 0; i < ndeps; i ++ ) {
        if ( dep_list[i].base_addr != 0 )
            for ( int j = i+1; j < ndeps; j++ )
                if ( dep_list[i].base_addr == dep_list[j].base_addr ) {
                    dep_list[i].flags.in |= dep_list[j].flags.in;
                    dep_list[i].flags.out |= dep_list[j].flags.out;
                    dep_list[j].base_addr = 0; // Mark j element as void
                }
    }

    // doesn't need to be atomic as no other thread is going to be accessing this node just yet
    // npredecessors is set to 1 to ensure that none of the releasing tasks queues this task before we have finished processing all the dependencies
    node->dn.npredecessors = 1;

    // used to pack all npredecessors additions into a single atomic operation at the end
    int npredecessors;

    npredecessors = __kmp_process_deps<true>(gtid, node, hash, dep_barrier, ndeps, dep_list);
    npredecessors += __kmp_process_deps<false>(gtid, node, hash, dep_barrier, ndeps_noalias, noalias_dep_list);

    KMP_TEST_THEN_ADD32(&node->dn.npredecessors, npredecessors);

    // Remove the fake predecessor and find out if there's any outstanding dependence (some tasks may have finished while we processed the dependencies)
    node->dn.task = task;
    KMP_MB();
    npredecessors = KMP_TEST_THEN_DEC32(&node->dn.npredecessors) - 1;

    // beyond this point the task could be queued (and executed) by a releasing task...
    return npredecessors > 0 ? true : false;
}

void
__kmp_release_deps ( kmp_int32 gtid, kmp_taskdata_t *task )
{
    kmp_info_t *thread = __kmp_threads[ gtid ];
    kmp_depnode_t *node = task->td_depnode;

    if ( task->td_dephash )
        __kmp_dephash_free(thread,task->td_dephash);

    if ( !node ) return;

    KMP_ACQUIRE_DEPNODE(gtid,node);
    node->dn.task = NULL; // mark this task as finished, so no new dependencies are generated
    KMP_RELEASE_DEPNODE(gtid,node);

    kmp_depnode_list_t *next;
    for ( kmp_depnode_list_t *p = node->dn.successors; p; p = next ) {
        kmp_depnode_t *successor = p->node;
        kmp_int32 npredecessors = KMP_TEST_THEN_DEC32(&successor->dn.npredecessors) - 1;

        // successor task can be NULL for wait_depends or because deps are still being processed
        if ( npredecessors == 0 ) {
            KMP_MB();
            if ( successor->dn.task )
                // loc_ref was already stored in successor's task_data
                __kmpc_omp_task(NULL,gtid,successor->dn.task);
        }

        next = p->next;
        __kmp_node_deref(thread,p->node);
#if USE_FAST_MEMORY
        __kmp_fast_free(thread,p);
#else
        __kmp_thread_free(thread,p);
#endif
    }

    __kmp_node_deref(thread,node);
}

/*!
@ingroup TASKING
@param loc_ref location of the original task directive
@param gtid Global Thread ID of encountering thread
@param new_task task thunk allocated by __kmp_omp_task_alloc() for the ''new task''
@param ndeps Number of depend items with possible aliasing
@param dep_list List of depend items with possible aliasing
@param ndeps_noalias Number of depend items with no aliasing
@param noalias_dep_list List of depend items with no aliasing

@return Returns either TASK_CURRENT_NOT_QUEUED if the current task was not suspended and queued, or TASK_CURRENT_QUEUED if it was suspended and queued

Schedule a non-thread-switchable task with dependences for execution
*/
kmp_int32
__kmpc_omp_task_with_deps( ident_t *loc_ref, kmp_int32 gtid, kmp_task_t * new_task,
                           kmp_int32 ndeps, kmp_depend_info_t *dep_list,
                           kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list )
{
    kmp_info_t *thread = __kmp_threads[ gtid ];
    kmp_taskdata_t * current_task = thread->th.th_current_task;

    bool serial = current_task->td_flags.team_serial || current_task->td_flags.tasking_ser || current_task->td_flags.final;

    if ( !serial && ( ndeps > 0 || ndeps_noalias > 0 )) {
        /* if no dependencies have been tracked yet, create the dependence hash */
        if ( current_task->td_dephash == NULL )
            current_task->td_dephash = __kmp_dephash_create(thread);

#if USE_FAST_MEMORY
        kmp_depnode_t *node = (kmp_depnode_t *) __kmp_fast_allocate(thread,sizeof(kmp_depnode_t));
#else
        kmp_depnode_t *node = (kmp_depnode_t *) __kmp_thread_malloc(thread,sizeof(kmp_depnode_t));
#endif

        __kmp_init_node(node);
        KMP_TASK_TO_TASKDATA(new_task)->td_depnode = node;

        if ( __kmp_check_deps( gtid, node, new_task, current_task->td_dephash, NO_DEP_BARRIER,
                               ndeps, dep_list, ndeps_noalias, noalias_dep_list ) )
            return TASK_CURRENT_NOT_QUEUED;
    }

    return __kmpc_omp_task(loc_ref,gtid,new_task);
}

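/*
 * Illustrative usage (a sketch of what a compiler might emit, not taken from
 * any particular compiler's output): for user code such as
 *
 *   #pragma omp task depend(out: x)
 *   { x = produce(); }
 *
 * the front end would allocate the task thunk with __kmpc_omp_task_alloc(),
 * fill one kmp_depend_info_t with base_addr = (kmp_intptr_t)&x,
 * len = sizeof(x) and both flags.in and flags.out set (this file asserts that
 * every dependence has flags.in set), and then call
 *
 *   __kmpc_omp_task_with_deps(loc, gtid, new_task, 1, &dep_info, 0, NULL);
 *
 * instead of the plain __kmpc_omp_task() entry point.
 */
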
/*!
@ingroup TASKING
@param loc_ref location of the original task directive
@param gtid Global Thread ID of encountering thread
@param ndeps Number of depend items with possible aliasing
@param dep_list List of depend items with possible aliasing
@param ndeps_noalias Number of depend items with no aliasing
@param noalias_dep_list List of depend items with no aliasing

Blocks the current task until all specified dependencies have been fulfilled.
*/
void
__kmpc_omp_wait_deps ( ident_t *loc_ref, kmp_int32 gtid, kmp_int32 ndeps, kmp_depend_info_t *dep_list,
                       kmp_int32 ndeps_noalias, kmp_depend_info_t *noalias_dep_list )
{
    if ( ndeps == 0 && ndeps_noalias == 0 ) return;

    kmp_info_t *thread = __kmp_threads[ gtid ];
    kmp_taskdata_t * current_task = thread->th.th_current_task;

    // dependences are not computed in serial teams
    if ( current_task->td_flags.team_serial || current_task->td_flags.tasking_ser || current_task->td_flags.final)
        return;

    // if the dephash is not yet created it means we have nothing to wait for
    if ( current_task->td_dephash == NULL ) return;

    kmp_depnode_t node;
    __kmp_init_node(&node);

    if (!__kmp_check_deps( gtid, &node, NULL, current_task->td_dephash, DEP_BARRIER,
                           ndeps, dep_list, ndeps_noalias, noalias_dep_list ))
        return;

    int thread_finished = FALSE;
    while ( node.dn.npredecessors > 0 ) {
        __kmp_execute_tasks( thread, gtid, (volatile kmp_uint32 *)&(node.dn.npredecessors),
                             0, FALSE, &thread_finished,
#if USE_ITT_BUILD
                             NULL,
#endif
                             __kmp_task_stealing_constraint );
    }

}

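/*
 * Illustrative usage (again only a sketch, not any specific compiler's
 * lowering): for an undeferred task with dependences, e.g.
 *
 *   #pragma omp task if(0) depend(in: x)
 *   { consume(x); }
 *
 * a compiler can first call
 *
 *   __kmpc_omp_wait_deps(loc, gtid, 1, &dep_info, 0, NULL);
 *
 * so the encountering thread blocks (executing other tasks in the meantime)
 * until the dependences on x are fulfilled, and only then run the task body
 * in place.
 */
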
#endif /* OMP_40_ENABLED */