1/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3 */
4
5
6//===----------------------------------------------------------------------===//
7//
8// The LLVM Compiler Infrastructure
9//
10// This file is dual licensed under the MIT and the University of Illinois Open
11// Source Licenses. See LICENSE.txt for details.
12//
13//===----------------------------------------------------------------------===//
14
15
16/*
17 * Dynamic scheduling initialization and dispatch.
18 *
19 * NOTE: __kmp_nth is a constant inside any dispatch loop; however,
20 * it may change values between parallel regions. __kmp_max_nth
21 * is the largest value __kmp_nth may take, 1 is the smallest.
22 *
23 */
24
25/* ------------------------------------------------------------------------ */
26/* ------------------------------------------------------------------------ */
27
28#include "kmp.h"
29#include "kmp_i18n.h"
30#include "kmp_itt.h"
31#include "kmp_str.h"
32#include "kmp_error.h"
33#include "kmp_stats.h"
34#if KMP_OS_WINDOWS && KMP_ARCH_X86
35 #include <float.h>
36#endif
37
38/* ------------------------------------------------------------------------ */
39/* ------------------------------------------------------------------------ */
40
41// template for type limits
42template< typename T >
43struct i_maxmin {
44 static const T mx;
45 static const T mn;
46};
47template<>
48struct i_maxmin< int > {
49 static const int mx = 0x7fffffff;
50 static const int mn = 0x80000000;
51};
52template<>
53struct i_maxmin< unsigned int > {
54 static const unsigned int mx = 0xffffffff;
55 static const unsigned int mn = 0x00000000;
56};
57template<>
58struct i_maxmin< long long > {
59 static const long long mx = 0x7fffffffffffffffLL;
60 static const long long mn = 0x8000000000000000LL;
61};
62template<>
63struct i_maxmin< unsigned long long > {
64 static const unsigned long long mx = 0xffffffffffffffffLL;
65 static const unsigned long long mn = 0x0000000000000000LL;
66};
67//-------------------------------------------------------------------------
68
69#ifdef KMP_STATIC_STEAL_ENABLED
70
71 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
72 template< typename T >
73 struct dispatch_private_infoXX_template {
74 typedef typename traits_t< T >::unsigned_t UT;
75 typedef typename traits_t< T >::signed_t ST;
76 UT count; // unsigned
77 T ub;
78 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
79 T lb;
80 ST st; // signed
81 UT tc; // unsigned
82 T static_steal_counter; // for static_steal only; maybe better to put after ub
83
84 /* parm[1-4] are used in different ways by different scheduling algorithms */
85
86 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
87 // a) parm3 is properly aligned and
88 // b) all parm1-4 are in the same cache line.
89 // Because parm1-4 are used together, performance seems to be better
90 // if they are in the same line (not measured though).
91
92 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
93 T parm1;
94 T parm2;
95 T parm3;
96 T parm4;
97 };
98
99 UT ordered_lower; // unsigned
100 UT ordered_upper; // unsigned
101 #if KMP_OS_WINDOWS
102 T last_upper;
103 #endif /* KMP_OS_WINDOWS */
104 };
105
106#else /* KMP_STATIC_STEAL_ENABLED */
107
108 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
109 template< typename T >
110 struct dispatch_private_infoXX_template {
111 typedef typename traits_t< T >::unsigned_t UT;
112 typedef typename traits_t< T >::signed_t ST;
113 T lb;
114 T ub;
115 ST st; // signed
116 UT tc; // unsigned
117
118 T parm1;
119 T parm2;
120 T parm3;
121 T parm4;
122
123 UT count; // unsigned
124
125 UT ordered_lower; // unsigned
126 UT ordered_upper; // unsigned
127 #if KMP_OS_WINDOWS
128 T last_upper;
129 #endif /* KMP_OS_WINDOWS */
130 };
131
132#endif /* KMP_STATIC_STEAL_ENABLED */
133
134// replaces dispatch_private_info structure and dispatch_private_info_t type
135template< typename T >
136struct KMP_ALIGN_CACHE dispatch_private_info_template {
137 // duplicate alignment here, otherwise size of structure is not correct in our compiler
138 union KMP_ALIGN_CACHE private_info_tmpl {
139 dispatch_private_infoXX_template< T > p;
140 dispatch_private_info64_t p64;
141 } u;
142 enum sched_type schedule; /* scheduling algorithm */
143 kmp_uint32 ordered; /* ordered clause specified */
144 kmp_uint32 ordered_bumped;
145 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
146 dispatch_private_info * next; /* stack of buffers for nest of serial regions */
147 kmp_uint32 nomerge; /* don't merge iters if serialized */
148 kmp_uint32 type_size;
149 enum cons_type pushed_ws;
150};
151
152
153// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
154template< typename UT >
155struct dispatch_shared_infoXX_template {
156 /* chunk index under dynamic, number of idle threads under static-steal;
157 iteration index otherwise */
158 volatile UT iteration;
159 volatile UT num_done;
160 volatile UT ordered_iteration;
161 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
162};
163
164// replaces dispatch_shared_info structure and dispatch_shared_info_t type
165template< typename UT >
166struct dispatch_shared_info_template {
167 // we need union here to keep the structure size
168 union shared_info_tmpl {
169 dispatch_shared_infoXX_template< UT > s;
170 dispatch_shared_info64_t s64;
171 } u;
172 volatile kmp_uint32 buffer_index;
173};
174
175/* ------------------------------------------------------------------------ */
176/* ------------------------------------------------------------------------ */
177
178#undef USE_TEST_LOCKS
179
180// test_then_add template (general template should NOT be used)
181template< typename T >
182static __forceinline T
183test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
184
185template<>
186__forceinline kmp_int32
187test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
188{
189 kmp_int32 r;
190 r = KMP_TEST_THEN_ADD32( p, d );
191 return r;
192}
193
194template<>
195__forceinline kmp_int64
196test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
197{
198 kmp_int64 r;
199 r = KMP_TEST_THEN_ADD64( p, d );
200 return r;
201}
202
203// test_then_inc_acq template (general template should NOT be used)
204template< typename T >
205static __forceinline T
206test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
207
208template<>
209__forceinline kmp_int32
210test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
211{
212 kmp_int32 r;
213 r = KMP_TEST_THEN_INC_ACQ32( p );
214 return r;
215}
216
217template<>
218__forceinline kmp_int64
219test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
220{
221 kmp_int64 r;
222 r = KMP_TEST_THEN_INC_ACQ64( p );
223 return r;
224}
225
226// test_then_inc template (general template should NOT be used)
227template< typename T >
228static __forceinline T
229test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
230
231template<>
232__forceinline kmp_int32
233test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
234{
235 kmp_int32 r;
236 r = KMP_TEST_THEN_INC32( p );
237 return r;
238}
239
240template<>
241__forceinline kmp_int64
242test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
243{
244 kmp_int64 r;
245 r = KMP_TEST_THEN_INC64( p );
246 return r;
247}
248
249// compare_and_swap template (general template should NOT be used)
250template< typename T >
251static __forceinline kmp_int32
252compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
253
254template<>
255__forceinline kmp_int32
256compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
257{
258 return KMP_COMPARE_AND_STORE_REL32( p, c, s );
259}
260
261template<>
262__forceinline kmp_int32
263compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
264{
265 return KMP_COMPARE_AND_STORE_REL64( p, c, s );
266}
267
268/*
269 Spin wait loop that first does pause, then yield.
270 Waits until function returns non-zero when called with *spinner and check.
271 Does NOT put threads to sleep.
272#if USE_ITT_BUILD
273 Arguments:
274 obj -- the higher-level synchronization object to report to ittnotify. It is used to report
275 locks consistently. For example, if a lock is acquired immediately, its address is
276 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
277 immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
278 address, not the address of the low-level spinner.
279#endif // USE_ITT_BUILD
280*/
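/*
   A minimal usage sketch (illustrative only, not part of the build): callers spin on a
   shared counter until the predicate holds, e.g. the ordered-iteration handshake used
   later in this file:

       __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
                               USE_ITT_BUILD_ARG( NULL ) );

   The predicate is one of the comparison helpers defined below (__kmp_eq, __kmp_ge, ...)
   and is invoked as pred( *spinner, checker ).
*/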
281template< typename UT >
282// ToDo: make inline function (move to header file for icl)
283static UT // unsigned 4- or 8-byte type
284__kmp_wait_yield( volatile UT * spinner,
285 UT checker,
286 kmp_uint32 (* pred)( UT, UT )
287 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
288 )
289{
290 // note: we may not belong to a team at this point
291 register volatile UT * spin = spinner;
292 register UT check = checker;
293 register kmp_uint32 spins;
294 register kmp_uint32 (*f) ( UT, UT ) = pred;
295 register UT r;
296
297 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
298 KMP_INIT_YIELD( spins );
299 // main wait spin loop
300 while(!f(r = *spin, check))
301 {
302 KMP_FSYNC_SPIN_PREPARE( obj );
303 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
304 It causes problems with infinite recursion because of exit lock */
305 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
306 __kmp_abort_thread(); */
307
308 // if we are oversubscribed,
309 // or have waited a bit (and KMP_LIBRARY=throughput), then yield;
310 // the pause is in the following code
311 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
312 KMP_YIELD_SPIN( spins );
313 }
314 KMP_FSYNC_SPIN_ACQUIRED( obj );
315 return r;
316}
317
318template< typename UT >
319static kmp_uint32 __kmp_eq( UT value, UT checker) {
320 return value == checker;
321}
322
323template< typename UT >
324static kmp_uint32 __kmp_neq( UT value, UT checker) {
325 return value != checker;
326}
327
328template< typename UT >
329static kmp_uint32 __kmp_lt( UT value, UT checker) {
330 return value < checker;
331}
332
333template< typename UT >
334static kmp_uint32 __kmp_ge( UT value, UT checker) {
335 return value >= checker;
336}
337
338template< typename UT >
339static kmp_uint32 __kmp_le( UT value, UT checker) {
340 return value <= checker;
341}
342
343
344/* ------------------------------------------------------------------------ */
345/* ------------------------------------------------------------------------ */
346
347static void
348__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
349{
350 kmp_info_t *th;
351
352 KMP_DEBUG_ASSERT( gtid_ref );
353
354 if ( __kmp_env_consistency_check ) {
355 th = __kmp_threads[*gtid_ref];
356 if ( th -> th.th_root -> r.r_active
357 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
358 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
359 }
360 }
361}
362
363template< typename UT >
364static void
365__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
366{
367 typedef typename traits_t< UT >::signed_t ST;
368 dispatch_private_info_template< UT > * pr;
369
370 int gtid = *gtid_ref;
371// int cid = *cid_ref;
372 kmp_info_t *th = __kmp_threads[ gtid ];
373 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
374
375 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
376 if ( __kmp_env_consistency_check ) {
377 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
378 ( th -> th.th_dispatch -> th_dispatch_pr_current );
379 if ( pr -> pushed_ws != ct_none ) {
380 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
381 }
382 }
383
384 if ( ! th -> th.th_team -> t.t_serialized ) {
385 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
386 ( th -> th.th_dispatch -> th_dispatch_sh_current );
387 UT lower;
388
389 if ( ! __kmp_env_consistency_check ) {
390 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
391 ( th -> th.th_dispatch -> th_dispatch_pr_current );
392 }
393 lower = pr->u.p.ordered_lower;
394
395 #if ! defined( KMP_GOMP_COMPAT )
396 if ( __kmp_env_consistency_check ) {
397 if ( pr->ordered_bumped ) {
398 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
399 __kmp_error_construct2(
400 kmp_i18n_msg_CnsMultipleNesting,
401 ct_ordered_in_pdo, loc_ref,
402 & p->stack_data[ p->w_top ]
403 );
404 }
405 }
406 #endif /* !defined(KMP_GOMP_COMPAT) */
407
408 KMP_MB();
409 #ifdef KMP_DEBUG
410 {
411 const char * buff;
412 // create format specifiers before the debug output
413 buff = __kmp_str_format(
414 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
415 traits_t< UT >::spec, traits_t< UT >::spec );
416 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
417 __kmp_str_free( &buff );
418 }
419 #endif
420
421 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
422 USE_ITT_BUILD_ARG( NULL )
423 );
424 KMP_MB(); /* is this necessary? */
425 #ifdef KMP_DEBUG
426 {
427 const char * buff;
428 // create format specifiers before the debug output
429 buff = __kmp_str_format(
430 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
431 traits_t< UT >::spec, traits_t< UT >::spec );
432 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
433 __kmp_str_free( &buff );
434 }
435 #endif
436 }
437 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
438}
439
440static void
441__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
442{
443 kmp_info_t *th;
444
445 if ( __kmp_env_consistency_check ) {
446 th = __kmp_threads[*gtid_ref];
447 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
448 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
449 }
450 }
451}
452
453template< typename UT >
454static void
455__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
456{
457 typedef typename traits_t< UT >::signed_t ST;
458 dispatch_private_info_template< UT > * pr;
459
460 int gtid = *gtid_ref;
461// int cid = *cid_ref;
462 kmp_info_t *th = __kmp_threads[ gtid ];
463 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
464
465 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
466 if ( __kmp_env_consistency_check ) {
467 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
468 ( th -> th.th_dispatch -> th_dispatch_pr_current );
469 if ( pr -> pushed_ws != ct_none ) {
470 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
471 }
472 }
473
474 if ( ! th -> th.th_team -> t.t_serialized ) {
475 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
476 ( th -> th.th_dispatch -> th_dispatch_sh_current );
477
478 if ( ! __kmp_env_consistency_check ) {
479 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
480 ( th -> th.th_dispatch -> th_dispatch_pr_current );
481 }
482
483 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
484 #if ! defined( KMP_GOMP_COMPAT )
485 if ( __kmp_env_consistency_check ) {
486 if ( pr->ordered_bumped != 0 ) {
487 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
488 /* How to test it? - OM */
489 __kmp_error_construct2(
490 kmp_i18n_msg_CnsMultipleNesting,
491 ct_ordered_in_pdo, loc_ref,
492 & p->stack_data[ p->w_top ]
493 );
494 }
495 }
496 #endif /* !defined(KMP_GOMP_COMPAT) */
497
498 KMP_MB(); /* Flush all pending memory write invalidates. */
499
500 pr->ordered_bumped += 1;
501
502 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
503 gtid, pr->ordered_bumped ) );
504
505 KMP_MB(); /* Flush all pending memory write invalidates. */
506
507 /* TODO use general release procedure? */
508 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
509
510 KMP_MB(); /* Flush all pending memory write invalidates. */
511 }
512 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
513}
514
515/* Computes and returns x to the power of y, where y must be a non-negative integer */
516template< typename UT >
517static __forceinline long double
518__kmp_pow(long double x, UT y) {
519 long double s=1.0L;
520
521 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
522 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
523 while(y) {
524 if ( y & 1 )
525 s *= x;
526 x *= x;
527 y >>= 1;
528 }
529 return s;
530}
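/* Worked example (illustrative only, values assumed): __kmp_pow(0.5, 5) walks the bits of
   y = 5 = 0b101 from least to most significant:
       y = 5 (odd)  : s = 1.0 * 0.5    = 0.5     ; x -> 0.25
       y = 2 (even) :                              x -> 0.0625
       y = 1 (odd)  : s = 0.5 * 0.0625 = 0.03125 ; x -> x*x
   and returns 0.03125 == 0.5^5, i.e. binary exponentiation in O(log y) multiplications. */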
531
532/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
533 (the total number of unassigned iterations in chunks with index greater than or equal to idx).
534 __forceinline seems to be broken here: if this function is __forceinline'd, the behavior is wrong
535 (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
536*/
537template< typename T >
538static __inline typename traits_t< T >::unsigned_t
539__kmp_dispatch_guided_remaining(
540 T tc,
541 typename traits_t< T >::floating_t base,
542 typename traits_t< T >::unsigned_t idx
543) {
544 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
545 least for ICL 8.1, long double arithmetic may not really have
546 long double precision, even with /Qlong_double. Currently, we
547 workaround that in the caller code, by manipulating the FPCW for
548 Windows* OS on IA-32 architecture. The lack of precision is not
549 expected to be a correctness issue, though.
550 */
551 typedef typename traits_t< T >::unsigned_t UT;
552
553 long double x = tc * __kmp_pow< UT >(base, idx);
554 UT r = (UT) x;
555 if ( x == r )
556 return r;
557 return r + 1;
558}
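/* Worked example (illustrative only, values assumed): with tc = 1000 total iterations,
   nproc = 4 and base = 1.0 - 0.5/nproc = 0.875 (see the guided parameters below), after
   idx = 2 chunks have been handed out:
       tc * base^idx = 1000 * 0.765625 = 765.625  ->  returns 766 (rounded up),
   i.e. the number of iterations still unassigned in chunks idx, idx+1, ... */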
559
560// Parameters of the guided-iterative algorithm:
561// p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
562// p3 = 1 / ( n * nproc ) // remaining iterations multiplier
563// by default n = 2. For example, with n = 3 the chunk distribution will be flatter.
564// With n = 1 the first chunk is the same as for the static schedule, i.e. trip / nproc.
565static int guided_int_param = 2;
566static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
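/* Worked example (illustrative only, values assumed): with the default n = 2, nproc = 4
   and chunk = 7:
       p2 = 2 * 4 * (7 + 1) = 64      // switch to dynamic once fewer than 64 iterations remain
       p3 = 1.0 / (2 * 4)   = 0.125   // each grab takes ~12.5% of the remaining iterations
   These values land in pr->u.p.parm2 and pr->u.p.parm3 in the
   kmp_sch_guided_iterative_chunked case of __kmp_dispatch_init() below. */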
567
568// UT - unsigned flavor of T, ST - signed flavor of T,
569// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
570template< typename T >
571static void
572__kmp_dispatch_init(
573 ident_t * loc,
574 int gtid,
575 enum sched_type schedule,
576 T lb,
577 T ub,
578 typename traits_t< T >::signed_t st,
579 typename traits_t< T >::signed_t chunk,
580 int push_ws
581) {
582 typedef typename traits_t< T >::unsigned_t UT;
583 typedef typename traits_t< T >::signed_t ST;
584 typedef typename traits_t< T >::floating_t DBL;
585 static const int ___kmp_size_type = sizeof( UT );
586
587 int active;
588 T tc;
589 kmp_info_t * th;
590 kmp_team_t * team;
591 kmp_uint32 my_buffer_index;
592 dispatch_private_info_template< T > * pr;
593 dispatch_shared_info_template< UT > volatile * sh;
594
595 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
596 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
597
598 if ( ! TCR_4( __kmp_init_parallel ) )
599 __kmp_parallel_initialize();
600
601#if INCLUDE_SSC_MARKS
602 SSC_MARK_DISPATCH_INIT();
603#endif
604 #ifdef KMP_DEBUG
605 {
606 const char * buff;
607 // create format specifiers before the debug output
608 buff = __kmp_str_format(
609 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
610 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
611 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
612 __kmp_str_free( &buff );
613 }
614 #endif
615 /* setup data */
616 th = __kmp_threads[ gtid ];
617 team = th -> th.th_team;
618 active = ! team -> t.t_serialized;
619 th->th.th_ident = loc;
620
621#if USE_ITT_BUILD
622 kmp_uint64 cur_chunk = chunk;
623#endif
624 if ( ! active ) {
625 pr = reinterpret_cast< dispatch_private_info_template< T >* >
626 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
627 } else {
628 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
629 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
630
631 my_buffer_index = th->th.th_dispatch->th_disp_index ++;
632
633 /* What happens when number of threads changes, need to resize buffer? */
634 pr = reinterpret_cast< dispatch_private_info_template< T > * >
635 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
636 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
637 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
638 }
639
640 /* Pick up the nomerge/ordered bits from the scheduling type */
641 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
642 pr->nomerge = TRUE;
643 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
644 } else {
645 pr->nomerge = FALSE;
646 }
647 pr->type_size = ___kmp_size_type; // remember the size of variables
648 if ( kmp_ord_lower & schedule ) {
649 pr->ordered = TRUE;
650 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
651 } else {
652 pr->ordered = FALSE;
653 }
654 if ( schedule == kmp_sch_static ) {
655 schedule = __kmp_static;
656 } else {
657 if ( schedule == kmp_sch_runtime ) {
658 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
659 schedule = team -> t.t_sched.r_sched_type;
660 // Detail the schedule if needed (global controls are differentiated appropriately)
661 if ( schedule == kmp_sch_guided_chunked ) {
662 schedule = __kmp_guided;
663 } else if ( schedule == kmp_sch_static ) {
664 schedule = __kmp_static;
665 }
666 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
667 chunk = team -> t.t_sched.chunk;
668
669 #ifdef KMP_DEBUG
670 {
671 const char * buff;
672 // create format specifiers before the debug output
673 buff = __kmp_str_format(
674 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
675 traits_t< ST >::spec );
676 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
677 __kmp_str_free( &buff );
678 }
679 #endif
680 } else {
681 if ( schedule == kmp_sch_guided_chunked ) {
682 schedule = __kmp_guided;
683 }
684 if ( chunk <= 0 ) {
685 chunk = KMP_DEFAULT_CHUNK;
686 }
687 }
688
689 if ( schedule == kmp_sch_auto ) {
690 // mapping and differentiation: in the __kmp_do_serial_initialize()
691 schedule = __kmp_auto;
692 #ifdef KMP_DEBUG
693 {
694 const char * buff;
695 // create format specifiers before the debug output
696 buff = __kmp_str_format(
697 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
698 traits_t< ST >::spec );
699 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
700 __kmp_str_free( &buff );
701 }
702 #endif
703 }
704
705 /* guided analytical not safe for too many threads */
706 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
707 schedule = kmp_sch_guided_iterative_chunked;
708 KMP_WARNING( DispatchManyThreads );
709 }
710 pr->u.p.parm1 = chunk;
711 }
712 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
713 "unknown scheduling type" );
714
715 pr->u.p.count = 0;
716
717 if ( __kmp_env_consistency_check ) {
718 if ( st == 0 ) {
719 __kmp_error_construct(
720 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
721 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
722 );
723 }
724 }
725
726 tc = ( ub - lb + st );
727 if ( st != 1 ) {
728 if ( st < 0 ) {
729 if ( lb < ub ) {
730 tc = 0; // zero-trip
731 } else { // lb >= ub
732 tc = (ST)tc / st; // convert to signed division
733 }
734 } else { // st > 0
735 if ( ub < lb ) {
736 tc = 0; // zero-trip
737 } else { // lb >= ub
738 tc /= st;
739 }
740 }
741 } else if ( ub < lb ) { // st == 1
742 tc = 0; // zero-trip
743 }
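    /* Worked examples of the trip-count computation above (illustrative only, values assumed):
           lb = 0,  ub = 9, st = 2  : tc = (9 - 0 + 2) / 2    = 5  -> iterations 0, 2, 4, 6, 8
           lb = 10, ub = 1, st = -3 : tc = (1 - 10 - 3) / -3  = 4  -> iterations 10, 7, 4, 1
       For unsigned types T, the negative-stride case relies on the (ST) cast above to make
       the division signed. */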
744
745 pr->u.p.lb = lb;
746 pr->u.p.ub = ub;
747 pr->u.p.st = st;
748 pr->u.p.tc = tc;
749
750 #if KMP_OS_WINDOWS
751 pr->u.p.last_upper = ub + st;
752 #endif /* KMP_OS_WINDOWS */
753
754 /* NOTE: only the active parallel region(s) have active ordered sections */
755
756 if ( active ) {
757 if ( pr->ordered == 0 ) {
758 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
759 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
760 } else {
761 pr->ordered_bumped = 0;
762
763 pr->u.p.ordered_lower = 1;
764 pr->u.p.ordered_upper = 0;
765
766 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
767 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
768 }
769 }
770
771 if ( __kmp_env_consistency_check ) {
772 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
773 if ( push_ws ) {
774 __kmp_push_workshare( gtid, ws, loc );
775 pr->pushed_ws = ws;
776 } else {
777 __kmp_check_workshare( gtid, ws, loc );
778 pr->pushed_ws = ct_none;
779 }
780 }
781
782 switch ( schedule ) {
783 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
784 case kmp_sch_static_steal:
785 {
786 T nproc = team->t.t_nproc;
787 T ntc, init;
788
789 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
790
791 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
792 if ( nproc > 1 && ntc >= nproc ) {
793 T id = __kmp_tid_from_gtid(gtid);
794 T small_chunk, extras;
795
796 small_chunk = ntc / nproc;
797 extras = ntc % nproc;
798
799 init = id * small_chunk + ( id < extras ? id : extras );
800 pr->u.p.count = init;
801 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
802
803 pr->u.p.parm2 = lb;
804 //pr->pfields.parm3 = 0; // it's not used in static_steal
805 pr->u.p.parm4 = id;
806 pr->u.p.st = st;
807 break;
808 } else {
809 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
810 gtid ) );
811 schedule = kmp_sch_static_balanced;
812 /* too few iterations: fall-through to kmp_sch_static_balanced */
813 } // if
814 /* FALL-THROUGH to static balanced */
815 } // case
816 #endif
817 case kmp_sch_static_balanced:
818 {
819 T nproc = team->t.t_nproc;
820 T init, limit;
821
822 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
823 gtid ) );
824
825 if ( nproc > 1 ) {
826 T id = __kmp_tid_from_gtid(gtid);
827
828 if ( tc < nproc ) {
829 if ( id < tc ) {
830 init = id;
831 limit = id;
832 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
833 } else {
834 pr->u.p.count = 1; /* means no more chunks to execute */
835 pr->u.p.parm1 = FALSE;
836 break;
837 }
838 } else {
839 T small_chunk = tc / nproc;
840 T extras = tc % nproc;
841 init = id * small_chunk + (id < extras ? id : extras);
842 limit = init + small_chunk - (id < extras ? 0 : 1);
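                    // Worked example (illustrative only, values assumed): tc = 10, nproc = 4
                    //   small_chunk = 2, extras = 2
                    //   id 0: init = 0, limit = 2    id 1: init = 3, limit = 5
                    //   id 2: init = 6, limit = 7    id 3: init = 8, limit = 9
                    // i.e. the first 'extras' threads each take one extra iteration.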
843 pr->u.p.parm1 = (id == nproc - 1);
844 }
845 } else {
846 if ( tc > 0 ) {
847 init = 0;
848 limit = tc - 1;
849 pr->u.p.parm1 = TRUE;
850 } else {
851 // zero trip count
852 pr->u.p.count = 1; /* means no more chunks to execute */
853 pr->u.p.parm1 = FALSE;
854 break;
855 }
856 }
857#if USE_ITT_BUILD
858 // Calculate chunk for metadata report
859 if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) {
860 cur_chunk = limit - init + 1;
861 }
862#endif
863 if ( st == 1 ) {
864 pr->u.p.lb = lb + init;
865 pr->u.p.ub = lb + limit;
866 } else {
867 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
868 pr->u.p.lb = lb + init * st;
869 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
870 if ( st > 0 ) {
871 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
872 } else {
873 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
874 }
875 }
876 if ( pr->ordered ) {
877 pr->u.p.ordered_lower = init;
878 pr->u.p.ordered_upper = limit;
879 }
880 break;
881 } // case
882 case kmp_sch_guided_iterative_chunked :
883 {
884 T nproc = team->t.t_nproc;
885 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
886
887 if ( nproc > 1 ) {
888 if ( (2L * chunk + 1 ) * nproc >= tc ) {
889 /* chunk size too large, switch to dynamic */
890 schedule = kmp_sch_dynamic_chunked;
891 } else {
892 // when remaining iters become less than parm2 - switch to dynamic
893 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
894 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
895 }
896 } else {
897 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
898 schedule = kmp_sch_static_greedy;
899 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
900 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
901 pr->u.p.parm1 = tc;
902 } // if
903 } // case
904 break;
905 case kmp_sch_guided_analytical_chunked:
906 {
907 T nproc = team->t.t_nproc;
908 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
909
910 if ( nproc > 1 ) {
911 if ( (2L * chunk + 1 ) * nproc >= tc ) {
912 /* chunk size too large, switch to dynamic */
913 schedule = kmp_sch_dynamic_chunked;
914 } else {
915 /* commonly used term: (2 nproc - 1)/(2 nproc) */
916 DBL x;
917
918 #if KMP_OS_WINDOWS && KMP_ARCH_X86
919 /* Linux* OS already has 64-bit computation by default for
920 long double, and on Windows* OS on Intel(R) 64,
921 /Qlong_double doesn't work. On Windows* OS
922 on IA-32 architecture, we need to set precision to
923 64-bit instead of the default 53-bit. Even though long
924 double doesn't work on Windows* OS on Intel(R) 64, the
925 resulting lack of precision is not expected to impact
926 the correctness of the algorithm, but this has not been
927 mathematically proven.
928 */
929 // save original FPCW and set precision to 64-bit, as
930 // Windows* OS on IA-32 architecture defaults to 53-bit
931 unsigned int oldFpcw = _control87(0,0);
932 _control87(_PC_64,_MCW_PC); // 0,0x30000
933 #endif
934 /* value used for comparison in solver for cross-over point */
935 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
936
937 /* crossover point--chunk indexes equal to or greater than
938 this point switch to dynamic-style scheduling */
939 UT cross;
940
941 /* commonly used term: (2 nproc - 1)/(2 nproc) */
942 x = (long double)1.0 - (long double)0.5 / nproc;
943
944 #ifdef KMP_DEBUG
945 { // test natural alignment
946 struct _test_a {
947 char a;
948 union {
949 char b;
950 DBL d;
951 };
952 } t;
953 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
954 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
955 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
956 }
957 #endif // KMP_DEBUG
958
959 /* save the term in thread private dispatch structure */
960 *(DBL*)&pr->u.p.parm3 = x;
961
962 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
963 {
964 UT left, right, mid;
965 long double p;
966
967 /* estimate initial upper and lower bound */
968
969 /* doesn't matter what value right is as long as it is positive, but
970 it affects performance of the solver
971 */
972 right = 229;
973 p = __kmp_pow< UT >(x,right);
974 if ( p > target ) {
975 do{
976 p *= p;
977 right <<= 1;
978 } while(p>target && right < (1<<27));
979 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
980 } else {
981 left = 0;
982 }
983
984 /* bisection root-finding method */
985 while ( left + 1 < right ) {
986 mid = (left + right) / 2;
987 if ( __kmp_pow< UT >(x,mid) > target ) {
988 left = mid;
989 } else {
990 right = mid;
991 }
992 } // while
993 cross = right;
994 }
995 /* assert sanity of computed crossover point */
996 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
997
998 /* save the crossover point in thread private dispatch structure */
999 pr->u.p.parm2 = cross;
1000
1001 // C75803
1002 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1003 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1004 #else
1005 #define GUIDED_ANALYTICAL_WORKAROUND (x)
1006 #endif
1007 /* dynamic-style scheduling offset */
1008 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1009 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1010 // restore FPCW
1011 _control87(oldFpcw,_MCW_PC);
1012 #endif
1013 } // if
1014 } else {
1015 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1016 gtid ) );
1017 schedule = kmp_sch_static_greedy;
1018 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1019 pr->u.p.parm1 = tc;
1020 } // if
1021 } // case
1022 break;
1023 case kmp_sch_static_greedy:
1024 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1025 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1026 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1027 tc;
1028 break;
1029 case kmp_sch_static_chunked :
1030 case kmp_sch_dynamic_chunked :
1031 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1032 break;
1033 case kmp_sch_trapezoidal :
1034 {
1035 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1036
1037 T parm1, parm2, parm3, parm4;
1038 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1039
1040 parm1 = chunk;
1041
1042 /* F : size of the first cycle */
1043 parm2 = ( tc / (2 * team->t.t_nproc) );
1044
1045 if ( parm2 < 1 ) {
1046 parm2 = 1;
1047 }
1048
1049 /* L : size of the last cycle. Make sure the last cycle
1050 * is not larger than the first cycle.
1051 */
1052 if ( parm1 < 1 ) {
1053 parm1 = 1;
1054 } else if ( parm1 > parm2 ) {
1055 parm1 = parm2;
1056 }
1057
1058 /* N : number of cycles */
1059 parm3 = ( parm2 + parm1 );
1060 parm3 = ( 2 * tc + parm3 - 1) / parm3;
1061
1062 if ( parm3 < 2 ) {
1063 parm3 = 2;
1064 }
1065
1066 /* sigma : decreasing incr of the trapezoid */
1067 parm4 = ( parm3 - 1 );
1068 parm4 = ( parm2 - parm1 ) / parm4;
1069
1070 // pointless check, because parm4 >= 0 always
1071 //if ( parm4 < 0 ) {
1072 // parm4 = 0;
1073 //}
1074
1075 pr->u.p.parm1 = parm1;
1076 pr->u.p.parm2 = parm2;
1077 pr->u.p.parm3 = parm3;
1078 pr->u.p.parm4 = parm4;
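            /* Worked example (illustrative only, values assumed): tc = 200, nproc = 4, chunk = 2:
                   parm1 (last/minimum chunk)  = 2
                   parm2 (first chunk)         = 200 / (2*4)           = 25
                   parm3 (number of cycles)    = (2*200 + 27 - 1) / 27 = 15
                   parm4 (per-cycle decrement) = (25 - 2) / 14         = 1
               so successive chunks are 25, 24, 23, ... shrinking by parm4 each cycle. */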
1079 } // case
1080 break;
1081
1082 default:
1083 {
1084 __kmp_msg(
1085 kmp_ms_fatal, // Severity
1086 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1087 KMP_HNT( GetNewerLibrary ), // Hint
1088 __kmp_msg_null // Variadic argument list terminator
1089 );
1090 }
1091 break;
1092 } // switch
1093 pr->schedule = schedule;
1094 if ( active ) {
1095 /* The name of this buffer should be my_buffer_index when it's free to use it */
1096
1097 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1098 gtid, my_buffer_index, sh->buffer_index) );
1099 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1100 USE_ITT_BUILD_ARG( NULL )
1101 );
1102 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1103 // *always* 32-bit integers.
1104 KMP_MB(); /* is this necessary? */
1105 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1106 gtid, my_buffer_index, sh->buffer_index) );
1107
1108 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1109 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1110#if USE_ITT_BUILD
1111 if ( pr->ordered ) {
1112 __kmp_itt_ordered_init( gtid );
1113 }; // if
1114#endif /* USE_ITT_BUILD */
1115 }; // if
1116
1117#if USE_ITT_BUILD
1118 // Report loop metadata
1119 if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) {
1120 kmp_uint32 tid = __kmp_tid_from_gtid( gtid );
1121 if (KMP_MASTER_TID(tid)) {
1122 kmp_uint64 schedtype = 0;
1123
1124 switch ( schedule ) {
1125 case kmp_sch_static_chunked:
1126 case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1127 break;
1128 case kmp_sch_static_greedy:
1129 cur_chunk = pr->u.p.parm1;
1130 break;
1131 case kmp_sch_dynamic_chunked:
1132 schedtype = 1;
1133 break;
1134 case kmp_sch_guided_iterative_chunked:
1135 case kmp_sch_guided_analytical_chunked:
1136 schedtype = 2;
1137 break;
1138 default:
1139// Should we put this case under "static"?
1140// case kmp_sch_static_steal:
1141 schedtype = 3;
1142 break;
1143 }
1144 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1145 }
1146 }
1147#endif /* USE_ITT_BUILD */
1148
1149 #ifdef KMP_DEBUG
1150 {
1151 const char * buff;
1152 // create format specifiers before the debug output
1153 buff = __kmp_str_format(
1154 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1155 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1156 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1157 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1158 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1159 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1160 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1161 KD_TRACE(10, ( buff,
1162 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1163 pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1164 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1165 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1166 __kmp_str_free( &buff );
1167 }
1168 #endif
1169 #if ( KMP_STATIC_STEAL_ENABLED )
1170 if ( ___kmp_size_type < 8 ) {
1171 // It cannot be guaranteed that after execution of a loop with some other schedule kind
1172 // all the parm3 variables will contain the same value.
1173 // Even if all parm3 values were the same, there would still be a bad case, such as using 0 and 1
1174 // rather than a program-lifetime increment.
1175 // So a dedicated variable is required; the 'static_steal_counter' is used.
1176 if( schedule == kmp_sch_static_steal ) {
1177 // Other threads will inspect this variable when searching for a victim.
1178 // This is a flag showing that other threads may steal from this thread since then.
1179 volatile T * p = &pr->u.p.static_steal_counter;
1180 *p = *p + 1;
1181 }
1182 }
1183 #endif // ( KMP_STATIC_STEAL_ENABLED )
1184}
1185
1186/*
1187 * For ordered loops, either __kmp_dispatch_finish() should be called after
1188 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1189 * every chunk of iterations. If the ordered section(s) were not executed
1190 * for this iteration (or every iteration in this chunk), we need to set the
1191 * ordered iteration counters so that the next thread can proceed.
1192 */
1193template< typename UT >
1194static void
1195__kmp_dispatch_finish( int gtid, ident_t *loc )
1196{
1197 typedef typename traits_t< UT >::signed_t ST;
1198 kmp_info_t *th = __kmp_threads[ gtid ];
1199
1200 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1201 if ( ! th -> th.th_team -> t.t_serialized ) {
1202
1203 dispatch_private_info_template< UT > * pr =
1204 reinterpret_cast< dispatch_private_info_template< UT >* >
1205 ( th->th.th_dispatch->th_dispatch_pr_current );
1206 dispatch_shared_info_template< UT > volatile * sh =
1207 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1208 ( th->th.th_dispatch->th_dispatch_sh_current );
1209 KMP_DEBUG_ASSERT( pr );
1210 KMP_DEBUG_ASSERT( sh );
1211 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1212 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1213
1214 if ( pr->ordered_bumped ) {
1215 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1216 gtid ) );
1217 pr->ordered_bumped = 0;
1218 } else {
1219 UT lower = pr->u.p.ordered_lower;
1220
1221 #ifdef KMP_DEBUG
1222 {
1223 const char * buff;
1224 // create format specifiers before the debug output
1225 buff = __kmp_str_format(
1226 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1227 traits_t< UT >::spec, traits_t< UT >::spec );
1228 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1229 __kmp_str_free( &buff );
1230 }
1231 #endif
1232
1233 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1234 USE_ITT_BUILD_ARG(NULL)
1235 );
1236 KMP_MB(); /* is this necessary? */
1237 #ifdef KMP_DEBUG
1238 {
1239 const char * buff;
1240 // create format specifiers before the debug output
1241 buff = __kmp_str_format(
1242 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1243 traits_t< UT >::spec, traits_t< UT >::spec );
1244 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1245 __kmp_str_free( &buff );
1246 }
1247 #endif
1248
1249 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1250 } // if
1251 } // if
1252 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1253}
1254
1255#ifdef KMP_GOMP_COMPAT
1256
1257template< typename UT >
1258static void
1259__kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1260{
1261 typedef typename traits_t< UT >::signed_t ST;
1262 kmp_info_t *th = __kmp_threads[ gtid ];
1263
1264 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1265 if ( ! th -> th.th_team -> t.t_serialized ) {
1266// int cid;
1267 dispatch_private_info_template< UT > * pr =
1268 reinterpret_cast< dispatch_private_info_template< UT >* >
1269 ( th->th.th_dispatch->th_dispatch_pr_current );
1270 dispatch_shared_info_template< UT > volatile * sh =
1271 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1272 ( th->th.th_dispatch->th_dispatch_sh_current );
1273 KMP_DEBUG_ASSERT( pr );
1274 KMP_DEBUG_ASSERT( sh );
1275 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1276 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1277
1278// for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1279 UT lower = pr->u.p.ordered_lower;
1280 UT upper = pr->u.p.ordered_upper;
1281 UT inc = upper - lower + 1;
1282
1283 if ( pr->ordered_bumped == inc ) {
1284 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1285 gtid ) );
1286 pr->ordered_bumped = 0;
1287 } else {
1288 inc -= pr->ordered_bumped;
1289
1290 #ifdef KMP_DEBUG
1291 {
1292 const char * buff;
1293 // create format specifiers before the debug output
1294 buff = __kmp_str_format(
1295 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1296 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1297 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1298 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1299 __kmp_str_free( &buff );
1300 }
1301 #endif
1302
1303 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1304 USE_ITT_BUILD_ARG(NULL)
1305 );
1306
1307 KMP_MB(); /* is this necessary? */
1308 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1309 gtid ) );
1310 pr->ordered_bumped = 0;
1311//!!!!! TODO check if the inc should be unsigned, or signed???
1312 #ifdef KMP_DEBUG
1313 {
1314 const char * buff;
1315 // create format specifiers before the debug output
1316 buff = __kmp_str_format(
1317 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1318 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1319 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1320 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1321 __kmp_str_free( &buff );
1322 }
1323 #endif
1324
1325 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1326 }
1327// }
1328 }
1329 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1330}
1331
1332#endif /* KMP_GOMP_COMPAT */
1333
1334template< typename T >
1335static int
1336__kmp_dispatch_next(
1337 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1338) {
1339
1340 typedef typename traits_t< T >::unsigned_t UT;
1341 typedef typename traits_t< T >::signed_t ST;
1342 typedef typename traits_t< T >::floating_t DBL;
1343 static const int ___kmp_size_type = sizeof( UT );
1344
1345 int status;
1346 dispatch_private_info_template< T > * pr;
1347 kmp_info_t * th = __kmp_threads[ gtid ];
1348 kmp_team_t * team = th -> th.th_team;
1349
1350 KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st ); // AC: these cannot be NULL
1351 #ifdef KMP_DEBUG
1352 {
1353 const char * buff;
1354 // create format specifiers before the debug output
1355 buff = __kmp_str_format(
1356 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1357 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1358 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1359 __kmp_str_free( &buff );
1360 }
1361 #endif
1362
1363 if ( team -> t.t_serialized ) {
1364 /* NOTE: serialize this dispatch because we are not at the active level */
1365 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1366 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1367 KMP_DEBUG_ASSERT( pr );
1368
1369 if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1370 *p_lb = 0;
1371 *p_ub = 0;
1372// if ( p_last != NULL )
1373// *p_last = 0;
1374 if ( p_st != NULL )
1375 *p_st = 0;
1376 if ( __kmp_env_consistency_check ) {
1377 if ( pr->pushed_ws != ct_none ) {
1378 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1379 }
1380 }
1381 } else if ( pr->nomerge ) {
1382 kmp_int32 last;
1383 T start;
1384 UT limit, trip, init;
1385 ST incr;
1386 T chunk = pr->u.p.parm1;
1387
1388 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1389
1390 init = chunk * pr->u.p.count++;
1391 trip = pr->u.p.tc - 1;
1392
1393 if ( (status = (init <= trip)) == 0 ) {
1394 *p_lb = 0;
1395 *p_ub = 0;
1396// if ( p_last != NULL )
1397// *p_last = 0;
1398 if ( p_st != NULL )
1399 *p_st = 0;
1400 if ( __kmp_env_consistency_check ) {
1401 if ( pr->pushed_ws != ct_none ) {
1402 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1403 }
1404 }
1405 } else {
1406 start = pr->u.p.lb;
1407 limit = chunk + init - 1;
1408 incr = pr->u.p.st;
1409
1410 if ( (last = (limit >= trip)) != 0 ) {
1411 limit = trip;
1412 #if KMP_OS_WINDOWS
1413 pr->u.p.last_upper = pr->u.p.ub;
1414 #endif /* KMP_OS_WINDOWS */
1415 }
1416 if ( p_last != NULL )
1417 *p_last = last;
1418 if ( p_st != NULL )
1419 *p_st = incr;
1420 if ( incr == 1 ) {
1421 *p_lb = start + init;
1422 *p_ub = start + limit;
1423 } else {
1424 *p_lb = start + init * incr;
1425 *p_ub = start + limit * incr;
1426 }
1427
1428 if ( pr->ordered ) {
1429 pr->u.p.ordered_lower = init;
1430 pr->u.p.ordered_upper = limit;
1431 #ifdef KMP_DEBUG
1432 {
1433 const char * buff;
1434 // create format specifiers before the debug output
1435 buff = __kmp_str_format(
1436 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1437 traits_t< UT >::spec, traits_t< UT >::spec );
1438 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1439 __kmp_str_free( &buff );
1440 }
1441 #endif
1442 } // if
1443 } // if
1444 } else {
1445 pr->u.p.tc = 0;
1446 *p_lb = pr->u.p.lb;
1447 *p_ub = pr->u.p.ub;
1448 #if KMP_OS_WINDOWS
1449 pr->u.p.last_upper = *p_ub;
1450 #endif /* KMP_OS_WINDOWS */
1451 if ( p_last != NULL )
1452 *p_last = TRUE;
1453 if ( p_st != NULL )
1454 *p_st = pr->u.p.st;
1455 } // if
1456 #ifdef KMP_DEBUG
1457 {
1458 const char * buff;
1459 // create format specifiers before the debug output
1460 buff = __kmp_str_format(
1461 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1462 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1463 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1464 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1465 __kmp_str_free( &buff );
1466 }
1467 #endif
1468#if INCLUDE_SSC_MARKS
1469 SSC_MARK_DISPATCH_NEXT();
1470#endif
1471 return status;
1472 } else {
1473 kmp_int32 last = 0;
1474 dispatch_shared_info_template< UT > *sh;
1475 T start;
1476 ST incr;
1477 UT limit, trip, init;
1478
1479 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1480 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1481
1482 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1483 ( th->th.th_dispatch->th_dispatch_pr_current );
1484 KMP_DEBUG_ASSERT( pr );
1485 sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1486 ( th->th.th_dispatch->th_dispatch_sh_current );
1487 KMP_DEBUG_ASSERT( sh );
1488
1489 if ( pr->u.p.tc == 0 ) {
1490 // zero trip count
1491 status = 0;
1492 } else {
1493 switch (pr->schedule) {
1494 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1495 case kmp_sch_static_steal:
1496 {
1497 T chunk = pr->u.p.parm1;
1498
1499 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1500
1501 trip = pr->u.p.tc - 1;
1502
1503 if ( ___kmp_size_type > 4 ) {
1504 // Other threads do not look into the data of this thread,
1505 // so a volatile cast is not necessary.
1506 init = ( pr->u.p.count )++;
1507 status = ( init < (UT)pr->u.p.ub );
1508 } else {
1509 typedef union {
1510 struct {
1511 UT count;
1512 T ub;
1513 } p;
1514 kmp_int64 b;
1515 } union_i4;
1516 // All operations on 'count' or 'ub' must be combined atomically together.
1517 // stealing implemented only for 4-byte indexes
1518 {
1519 union_i4 vold, vnew;
1520 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1521 vnew = vold;
1522 vnew.p.count++;
1523 while( ! KMP_COMPARE_AND_STORE_ACQ64(
1524 ( volatile kmp_int64* )&pr->u.p.count,
1525 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1526 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1527 KMP_CPU_PAUSE();
1528 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1529 vnew = vold;
1530 vnew.p.count++;
1531 }
1532 vnew = vold;
1533 init = vnew.p.count;
1534 status = ( init < (UT)vnew.p.ub ) ;
1535 }
1536
1537 if( !status ) {
1538 kmp_info_t **other_threads = team->t.t_threads;
1539 int while_limit = 10;
1540 int while_index = 0;
1541
1542 // TODO: algorithm of searching for a victim
1543 // should be cleaned up and measured
1544 while ( ( !status ) && ( while_limit != ++while_index ) ) {
1545 union_i4 vold, vnew;
1546 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1547 T victimIdx = pr->u.p.parm4;
1548 T oldVictimIdx = victimIdx;
1549 dispatch_private_info_template< T > * victim;
1550
1551 do {
1552 if( !victimIdx ) {
1553 victimIdx = team->t.t_nproc - 1;
1554 } else {
1555 --victimIdx;
1556 }
1557 victim = reinterpret_cast< dispatch_private_info_template< T >* >
1558 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1559 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1560 // TODO: think about a proper place of this test
1561 if ( ( !victim ) ||
1562 ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1563 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1564 // TODO: delay would be nice
1565 continue;
1566 // the victim is not ready yet to participate in stealing
1567 // because the victim is still in kmp_init_dispatch
1568 }
1569 if ( oldVictimIdx == victimIdx ) {
1570 break;
1571 }
1572 pr->u.p.parm4 = victimIdx;
1573
1574 while( 1 ) {
1575 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1576 vnew = vold;
1577
1578 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1579 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1580 break;
1581 }
1582 vnew.p.ub -= (remaining >> 2);
1583 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1584 #pragma warning( push )
1585 // disable warning on pointless comparison of unsigned with 0
1586 #pragma warning( disable: 186 )
1587 KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1588 #pragma warning( pop )
1589 // TODO: Should this be acquire or release?
1590 if ( KMP_COMPARE_AND_STORE_ACQ64(
1591 ( volatile kmp_int64 * )&victim->u.p.count,
1592 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1593 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1594 status = 1;
1595 while_index = 0;
1596 // now update own count and ub
1597 #if KMP_ARCH_X86
1598 // stealing executed on non-KMP_ARCH_X86 only
1599 // Atomic 64-bit write on ia32 is
1600 // unavailable, so we do this in steps.
1601 // This code is not tested.
1602 init = vold.p.count;
1603 pr->u.p.ub = 0;
1604 pr->u.p.count = init + 1;
1605 pr->u.p.ub = vnew.p.count;
1606 #else
1607 init = vnew.p.ub;
1608 vold.p.count = init + 1;
1609 // TODO: is it safe and enough?
1610 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1611 #endif // KMP_ARCH_X86
1612 break;
1613 } // if
1614 KMP_CPU_PAUSE();
1615 } // while (1)
1616 } // while
1617 } // if
1618 } // if
1619 if ( !status ) {
1620 *p_lb = 0;
1621 *p_ub = 0;
1622 if ( p_st != NULL ) *p_st = 0;
1623 } else {
1624 start = pr->u.p.parm2;
1625 init *= chunk;
1626 limit = chunk + init - 1;
1627 incr = pr->u.p.st;
1628
1629 KMP_DEBUG_ASSERT(init <= trip);
1630 if ( (last = (limit >= trip)) != 0 )
1631 limit = trip;
1632 if ( p_st != NULL ) *p_st = incr;
1633
1634 if ( incr == 1 ) {
1635 *p_lb = start + init;
1636 *p_ub = start + limit;
1637 } else {
1638 *p_lb = start + init * incr;
1639 *p_ub = start + limit * incr;
1640 }
1641
1642 if ( pr->ordered ) {
1643 pr->u.p.ordered_lower = init;
1644 pr->u.p.ordered_upper = limit;
1645 #ifdef KMP_DEBUG
1646 {
1647 const char * buff;
1648 // create format specifiers before the debug output
1649 buff = __kmp_str_format(
1650 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1651 traits_t< UT >::spec, traits_t< UT >::spec );
1652 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1653 __kmp_str_free( &buff );
1654 }
1655 #endif
1656 } // if
1657 } // if
1658 break;
1659 } // case
1660 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1661 case kmp_sch_static_balanced:
1662 {
1663 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1664 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1665 pr->u.p.count = 1;
1666 *p_lb = pr->u.p.lb;
1667 *p_ub = pr->u.p.ub;
1668 last = pr->u.p.parm1;
1669 if ( p_st != NULL )
1670 *p_st = pr->u.p.st;
1671 } else { /* no iterations to do */
1672 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1673 }
1674 if ( pr->ordered ) {
1675 #ifdef KMP_DEBUG
1676 {
1677 const char * buff;
1678 // create format specifiers before the debug output
1679 buff = __kmp_str_format(
1680 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1681 traits_t< UT >::spec, traits_t< UT >::spec );
1682 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1683 __kmp_str_free( &buff );
1684 }
1685 #endif
1686 } // if
1687 } // case
1688 break;
1689 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1690 case kmp_sch_static_chunked:
1691 {
1692 T parm1;
1693
1694            KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[greedy|chunked] case\n",
1695 gtid ) );
1696 parm1 = pr->u.p.parm1;
1697
1698 trip = pr->u.p.tc - 1;
1699 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1700
1701 if ( (status = (init <= trip)) != 0 ) {
1702 start = pr->u.p.lb;
1703 incr = pr->u.p.st;
1704 limit = parm1 + init - 1;
1705
1706 if ( (last = (limit >= trip)) != 0 )
1707 limit = trip;
1708
1709                if ( p_st != NULL ) *p_st = incr;
1710
1711 pr->u.p.count += team->t.t_nproc;
1712
1713 if ( incr == 1 ) {
1714 *p_lb = start + init;
1715 *p_ub = start + limit;
1716 }
1717 else {
1718 *p_lb = start + init * incr;
1719 *p_ub = start + limit * incr;
1720 }
1721
1722 if ( pr->ordered ) {
1723 pr->u.p.ordered_lower = init;
1724 pr->u.p.ordered_upper = limit;
1725 #ifdef KMP_DEBUG
1726 {
1727 const char * buff;
1728 // create format specifiers before the debug output
1729 buff = __kmp_str_format(
1730 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1731 traits_t< UT >::spec, traits_t< UT >::spec );
1732 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1733 __kmp_str_free( &buff );
1734 }
1735 #endif
1736 } // if
1737 } // if
1738 } // case
1739 break;
1740
1741 case kmp_sch_dynamic_chunked:
1742 {
1743 T chunk = pr->u.p.parm1;
1744
1745 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1746 gtid ) );
1747
1748 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1749 trip = pr->u.p.tc - 1;
1750
1751 if ( (status = (init <= trip)) == 0 ) {
1752 *p_lb = 0;
1753 *p_ub = 0;
1754                if ( p_st != NULL ) *p_st = 0;
1755            } else {
1756 start = pr->u.p.lb;
1757 limit = chunk + init - 1;
1758 incr = pr->u.p.st;
1759
1760 if ( (last = (limit >= trip)) != 0 )
1761 limit = trip;
1762
1763                if ( p_st != NULL ) *p_st = incr;
1764
1765 if ( incr == 1 ) {
1766 *p_lb = start + init;
1767 *p_ub = start + limit;
1768 } else {
1769 *p_lb = start + init * incr;
1770 *p_ub = start + limit * incr;
1771 }
1772
1773 if ( pr->ordered ) {
1774 pr->u.p.ordered_lower = init;
1775 pr->u.p.ordered_upper = limit;
1776 #ifdef KMP_DEBUG
1777 {
1778 const char * buff;
1779 // create format specifiers before the debug output
1780 buff = __kmp_str_format(
1781 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1782 traits_t< UT >::spec, traits_t< UT >::spec );
1783 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1784 __kmp_str_free( &buff );
1785 }
1786 #endif
1787 } // if
1788 } // if
1789 } // case
1790 break;
1791
1792 case kmp_sch_guided_iterative_chunked:
1793 {
1794 T chunkspec = pr->u.p.parm1;
1795 KD_TRACE(100,
1796 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1797 trip = pr->u.p.tc;
1798 // Start atomic part of calculations
1799 while(1) {
1800 ST remaining; // signed, because can be < 0
1801 init = sh->u.s.iteration; // shared value
1802 remaining = trip - init;
1803 if ( remaining <= 0 ) { // AC: need to compare with 0 first
1804 // nothing to do, don't try atomic op
1805 status = 0;
1806 break;
1807 }
1808 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1809                    // use dynamic-style schedule
1810                    // atomically increment iterations, get old value
1811 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1812 remaining = trip - init;
1813 if (remaining <= 0) {
1814 status = 0; // all iterations got by other threads
1815 } else {
1816 // got some iterations to work on
1817 status = 1;
1818 if ( (T)remaining > chunkspec ) {
1819 limit = init + chunkspec - 1;
1820 } else {
1821 last = 1; // the last chunk
1822 limit = init + remaining - 1;
1823 } // if
1824 } // if
1825 break;
1826 } // if
1827 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1828 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1829 // CAS was successful, chunk obtained
1830 status = 1;
1831 --limit;
1832 break;
1833 } // if
1834 } // while
1835 if ( status != 0 ) {
1836 start = pr->u.p.lb;
1837 incr = pr->u.p.st;
1838 if ( p_st != NULL )
1839 *p_st = incr;
1840            *p_lb = start + init * incr;
1841 *p_ub = start + limit * incr;
1842 if ( pr->ordered ) {
1843 pr->u.p.ordered_lower = init;
1844 pr->u.p.ordered_upper = limit;
1845 #ifdef KMP_DEBUG
1846 {
1847 const char * buff;
1848 // create format specifiers before the debug output
1849 buff = __kmp_str_format(
1850 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1851 traits_t< UT >::spec, traits_t< UT >::spec );
1852 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1853 __kmp_str_free( &buff );
1854 }
1855 #endif
1856 } // if
1857 } else {
1858 *p_lb = 0;
1859 *p_ub = 0;
1860 if ( p_st != NULL )
1861 *p_st = 0;
1862 } // if
1863 } // case
1864 break;
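        /*
         * Reader's note (added commentary, not from the original source): per the
         * in-line comments above, each successful CAS in the iterative guided case
         * claims roughly
         *     chunk_i = remaining_i * parm3,   with parm3 ~ 1/(K*nproc), K = 2,
         * i.e. about 1/(2*nproc) of the iterations still left, and the code falls
         * back to plain dynamic-style chunks of size chunkspec once fewer than
         * parm2 = K*nproc*(chunk+1) iterations remain.
         */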
1865
1866 case kmp_sch_guided_analytical_chunked:
1867 {
1868 T chunkspec = pr->u.p.parm1;
1869 UT chunkIdx;
1870 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1871 /* for storing original FPCW value for Windows* OS on
1872 IA-32 architecture 8-byte version */
1873 unsigned int oldFpcw;
1874            unsigned int fpcwSet = 0;
1875        #endif
1876 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1877 gtid ) );
1878
1879 trip = pr->u.p.tc;
1880
1881 KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1882 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1883
1884 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1885 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1886 if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1887 --trip;
1888 /* use dynamic-style scheduling */
1889 init = chunkIdx * chunkspec + pr->u.p.count;
1890 /* need to verify init > 0 in case of overflow in the above calculation */
1891 if ( (status = (init > 0 && init <= trip)) != 0 ) {
1892 limit = init + chunkspec -1;
1893
1894 if ( (last = (limit >= trip)) != 0 )
1895 limit = trip;
1896 }
1897 break;
1898 } else {
1899 /* use exponential-style scheduling */
1900                    /* The following check is to work around the lack of long double precision on Windows* OS.
1901 This check works around the possible effect that init != 0 for chunkIdx == 0.
1902 */
1903 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1904 /* If we haven't already done so, save original
1905 FPCW and set precision to 64-bit, as Windows* OS
1906 on IA-32 architecture defaults to 53-bit */
1907 if ( !fpcwSet ) {
1908                        oldFpcw = _control87(0,0);
1909                        _control87(_PC_64,_MCW_PC);
1910                        fpcwSet = 0x30000;
1911 }
1912 #endif
1913 if ( chunkIdx ) {
1914 init = __kmp_dispatch_guided_remaining< T >(
1915 trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1916 KMP_DEBUG_ASSERT(init);
1917 init = trip - init;
1918 } else
1919 init = 0;
1920 limit = trip - __kmp_dispatch_guided_remaining< T >(
1921 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1922 KMP_ASSERT(init <= limit);
1923 if ( init < limit ) {
1924 KMP_DEBUG_ASSERT(limit <= trip);
1925 --limit;
1926 status = 1;
1927 break;
1928 } // if
1929 } // if
1930 } // while (1)
1931 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1932            /* restore FPCW if necessary
1933 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1934 */
1935 if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1936 _control87(oldFpcw,_MCW_PC);
1937        #endif
1938 if ( status != 0 ) {
1939 start = pr->u.p.lb;
1940 incr = pr->u.p.st;
1941 if ( p_st != NULL )
1942 *p_st = incr;
1943            *p_lb = start + init * incr;
1944 *p_ub = start + limit * incr;
1945 if ( pr->ordered ) {
1946 pr->u.p.ordered_lower = init;
1947 pr->u.p.ordered_upper = limit;
1948 #ifdef KMP_DEBUG
1949 {
1950 const char * buff;
1951 // create format specifiers before the debug output
1952 buff = __kmp_str_format(
1953 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1954 traits_t< UT >::spec, traits_t< UT >::spec );
1955 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1956 __kmp_str_free( &buff );
1957 }
1958 #endif
1959 }
1960 } else {
1961 *p_lb = 0;
1962 *p_ub = 0;
1963 if ( p_st != NULL )
1964 *p_st = 0;
1965 }
1966 } // case
1967 break;
1968
1969 case kmp_sch_trapezoidal:
1970 {
1971 UT index;
1972 T parm2 = pr->u.p.parm2;
1973 T parm3 = pr->u.p.parm3;
1974 T parm4 = pr->u.p.parm4;
1975 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
1976 gtid ) );
1977
1978 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
1979
1980 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
1981 trip = pr->u.p.tc - 1;
1982
1983 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
1984 *p_lb = 0;
1985 *p_ub = 0;
1986                if ( p_st != NULL ) *p_st = 0;
1987            } else {
1988 start = pr->u.p.lb;
1989 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
1990 incr = pr->u.p.st;
1991
1992 if ( (last = (limit >= trip)) != 0 )
1993 limit = trip;
1994
1995                if ( p_st != NULL ) *p_st = incr;
1996
1997 if ( incr == 1 ) {
1998 *p_lb = start + init;
1999 *p_ub = start + limit;
2000 } else {
2001 *p_lb = start + init * incr;
2002 *p_ub = start + limit * incr;
2003 }
2004
2005 if ( pr->ordered ) {
2006 pr->u.p.ordered_lower = init;
2007 pr->u.p.ordered_upper = limit;
2008 #ifdef KMP_DEBUG
2009 {
2010 const char * buff;
2011 // create format specifiers before the debug output
2012 buff = __kmp_str_format(
2013 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2014 traits_t< UT >::spec, traits_t< UT >::spec );
2015 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2016 __kmp_str_free( &buff );
2017 }
2018 #endif
2019 } // if
2020 } // if
2021 } // case
2022 break;
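        /*
         * Reader's note (added commentary, not from the original source): the init
         * and limit expressions above are partial sums of an arithmetic series of
         * chunk sizes that shrink by parm4 per chunk, since
         *     sum_{k=0}^{index-1} (parm2 - k*parm4) = index*(2*parm2 - (index-1)*parm4)/2,
         * which is the linearly decreasing chunk pattern of trapezoid self-scheduling.
         */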
2023        default:
2024 {
2025 status = 0; // to avoid complaints on uninitialized variable use
2026 __kmp_msg(
2027 kmp_ms_fatal, // Severity
2028 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2029 KMP_HNT( GetNewerLibrary ), // Hint
2030 __kmp_msg_null // Variadic argument list terminator
2031 );
2032 }
2033 break;
2034        } // switch
2035 } // if tc == 0;
2036
2037 if ( status == 0 ) {
2038 UT num_done;
2039
2040 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2041 #ifdef KMP_DEBUG
2042 {
2043 const char * buff;
2044 // create format specifiers before the debug output
2045 buff = __kmp_str_format(
2046 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2047 traits_t< UT >::spec );
2048 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2049 __kmp_str_free( &buff );
2050 }
2051 #endif
2052
2053        if ( (ST)num_done == team->t.t_nproc-1 ) {
2054            /* NOTE: release this buffer to be reused */
2055
2056 KMP_MB(); /* Flush all pending memory write invalidates. */
2057
2058 sh->u.s.num_done = 0;
2059 sh->u.s.iteration = 0;
2060
2061 /* TODO replace with general release procedure? */
2062 if ( pr->ordered ) {
2063 sh->u.s.ordered_iteration = 0;
2064 }
2065
2066 KMP_MB(); /* Flush all pending memory write invalidates. */
2067
2068 sh -> buffer_index += KMP_MAX_DISP_BUF;
2069 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2070 gtid, sh->buffer_index) );
2071
2072 KMP_MB(); /* Flush all pending memory write invalidates. */
2073
2074 } // if
2075 if ( __kmp_env_consistency_check ) {
2076 if ( pr->pushed_ws != ct_none ) {
2077 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2078 }
2079 }
2080
2081 th -> th.th_dispatch -> th_deo_fcn = NULL;
2082 th -> th.th_dispatch -> th_dxo_fcn = NULL;
2083 th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2084 th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2085 } // if (status == 0)
2086#if KMP_OS_WINDOWS
2087 else if ( last ) {
2088 pr->u.p.last_upper = pr->u.p.ub;
2089 }
2090#endif /* KMP_OS_WINDOWS */
2091        if ( p_last != NULL && status != 0 )
2092            *p_last = last;
2093    } // if
2094
2095 #ifdef KMP_DEBUG
2096 {
2097 const char * buff;
2098 // create format specifiers before the debug output
2099 buff = __kmp_str_format(
2100 "__kmp_dispatch_next: T#%%d normal case: " \
2101 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2102 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2103 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2104 __kmp_str_free( &buff );
2105 }
2106 #endif
2107#if INCLUDE_SSC_MARKS
2108 SSC_MARK_DISPATCH_NEXT();
2109#endif
2110    return status;
2111}
2112
2113template< typename T >
2114static void
2115__kmp_dist_get_bounds(
2116 ident_t *loc,
2117 kmp_int32 gtid,
2118 kmp_int32 *plastiter,
2119 T *plower,
2120 T *pupper,
2121 typename traits_t< T >::signed_t incr
2122) {
2123 KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
2124 typedef typename traits_t< T >::unsigned_t UT;
2125 typedef typename traits_t< T >::signed_t ST;
2126 register kmp_uint32 team_id;
2127 register kmp_uint32 nteams;
2128 register UT trip_count;
2129 register kmp_team_t *team;
2130 kmp_info_t * th;
2131
2132 KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2133 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2134 #ifdef KMP_DEBUG
2135 {
2136 const char * buff;
2137 // create format specifiers before the debug output
2138 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2139 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2140 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2141 traits_t< T >::spec );
2142 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2143 __kmp_str_free( &buff );
2144 }
2145 #endif
2146
2147 if( __kmp_env_consistency_check ) {
2148 if( incr == 0 ) {
2149 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2150 }
2151 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2152 // The loop is illegal.
2153            // Some zero-trip loops are maintained by the compiler, e.g.:
2154 // for(i=10;i<0;++i) // lower >= upper - run-time check
2155 // for(i=0;i>10;--i) // lower <= upper - run-time check
2156 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2157 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2158 // Compiler does not check the following illegal loops:
2159 // for(i=0;i<10;i+=incr) // where incr<0
2160 // for(i=10;i>0;i-=incr) // where incr<0
2161 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2162 }
2163 }
2164 th = __kmp_threads[gtid];
2165 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2166 team = th->th.th_team;
2167 #if OMP_40_ENABLED
2168 nteams = th->th.th_teams_size.nteams;
2169 #endif
2170 team_id = team->t.t_master_tid;
2171 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2172
2173 // compute global trip count
2174 if( incr == 1 ) {
2175 trip_count = *pupper - *plower + 1;
2176 } else if(incr == -1) {
2177 trip_count = *plower - *pupper + 1;
2178 } else {
2179 trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2180 }
2181 if( trip_count <= nteams ) {
2182 KMP_DEBUG_ASSERT(
2183 __kmp_static == kmp_sch_static_greedy || \
2184 __kmp_static == kmp_sch_static_balanced
2185 ); // Unknown static scheduling type.
2186 // only some teams get single iteration, others get nothing
2187 if( team_id < trip_count ) {
2188 *pupper = *plower = *plower + team_id * incr;
2189 } else {
2190 *plower = *pupper + incr; // zero-trip loop
2191 }
2192 if( plastiter != NULL )
2193 *plastiter = ( team_id == trip_count - 1 );
2194 } else {
2195 if( __kmp_static == kmp_sch_static_balanced ) {
2196 register UT chunk = trip_count / nteams;
2197 register UT extras = trip_count % nteams;
2198 *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2199 *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2200 if( plastiter != NULL )
2201 *plastiter = ( team_id == nteams - 1 );
2202 } else {
2203 register T chunk_inc_count =
2204 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2205 register T upper = *pupper;
2206 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2207 // Unknown static scheduling type.
2208 *plower += team_id * chunk_inc_count;
2209 *pupper = *plower + chunk_inc_count - incr;
2210 // Check/correct bounds if needed
2211 if( incr > 0 ) {
2212 if( *pupper < *plower )
2213 *pupper = i_maxmin< T >::mx;
2214 if( plastiter != NULL )
2215 *plastiter = *plower <= upper && *pupper > upper - incr;
2216 if( *pupper > upper )
2217 *pupper = upper; // tracker C73258
2218 } else {
2219 if( *pupper > *plower )
2220 *pupper = i_maxmin< T >::mn;
2221 if( plastiter != NULL )
2222 *plastiter = *plower >= upper && *pupper < upper - incr;
2223 if( *pupper < upper )
2224 *pupper = upper; // tracker C73258
2225 }
2226 }
2227 }
2228}
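/*
 * Worked example (added for illustration, not from the original source),
 * assuming __kmp_static == kmp_sch_static_balanced, incr == 1, *plower == 0,
 * *pupper == 9 and nteams == 3: trip_count = 10, chunk = 3, extras = 1, so
 * team 0 is assigned [0,3], team 1 gets [4,6], team 2 gets [7,9], and only
 * team 2 (the last team) has *plastiter set.
 */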
2229
2230//-----------------------------------------------------------------------------------------
2231// Dispatch routines
2232// Transfer call to template< type T >
2233// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2234// T lb, T ub, ST st, ST chunk )
2235extern "C" {
2236
2237/*!
2238@ingroup WORK_SHARING
2239@{
2240@param loc Source location
2241@param gtid Global thread id
2242@param schedule Schedule type
2243@param lb Lower bound
2244@param ub Upper bound
2245@param st Step (or increment if you prefer)
2246@param chunk The chunk size to block with
2247
2248This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2249These functions are all identical apart from the types of the arguments.
2250*/
2251
2252void
2253__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2254 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2255{
2256    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2257    KMP_DEBUG_ASSERT( __kmp_init_serial );
2258 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2259}
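/*
 * Illustrative sketch (added commentary, not from the original source) of the
 * call sequence a compiler might emit for "#pragma omp for schedule(dynamic, 4)"
 * over 0 <= i < n, using the entry points in this file; 'loc', 'gtid', 'n' and
 * 'body' are assumed to be provided by the surrounding code generation:
 *
 *   kmp_int32 lb, ub, st, last;
 *   __kmpc_dispatch_init_4( &loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, 4 );
 *   while ( __kmpc_dispatch_next_4( &loc, gtid, &last, &lb, &ub, &st ) ) {
 *       for ( kmp_int32 i = lb; i <= ub; i += st )
 *           body( i );
 *   }
 */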
2260/*!
2261See @ref __kmpc_dispatch_init_4
2262*/
2263void
2264__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2265 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2266{
2267    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2268    KMP_DEBUG_ASSERT( __kmp_init_serial );
2269 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2270}
2271
2272/*!
2273See @ref __kmpc_dispatch_init_4
2274*/
2275void
2276__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2277 kmp_int64 lb, kmp_int64 ub,
2278 kmp_int64 st, kmp_int64 chunk )
2279{
2280    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2281    KMP_DEBUG_ASSERT( __kmp_init_serial );
2282 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2283}
2284
2285/*!
2286See @ref __kmpc_dispatch_init_4
2287*/
2288void
2289__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2290 kmp_uint64 lb, kmp_uint64 ub,
2291 kmp_int64 st, kmp_int64 chunk )
2292{
2293    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2294    KMP_DEBUG_ASSERT( __kmp_init_serial );
2295 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2296}
2297
2298/*!
2299See @ref __kmpc_dispatch_init_4
2300
2301These functions differ from the __kmpc_dispatch_init set in that they are called
2302for the composite distribute parallel for construct; thus, before dispatching the
2303regular iterations, the per-team iteration space must be computed.
2304
2305These functions are all identical apart from the types of the arguments.
2306*/
2307void
2308__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2309 kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2310{
2311 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2312 KMP_DEBUG_ASSERT( __kmp_init_serial );
2313 __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2314 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2315}
2316
2317void
2318__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2319 kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2320{
2321 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2322 KMP_DEBUG_ASSERT( __kmp_init_serial );
2323 __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2324 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2325}
2326
2327void
2328__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2329 kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2330{
2331 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2332 KMP_DEBUG_ASSERT( __kmp_init_serial );
2333 __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2334 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2335}
2336
2337void
2338__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2339 kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2340{
2341 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2342 KMP_DEBUG_ASSERT( __kmp_init_serial );
2343 __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2344 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2345}
2346
2347/*!
2348@param loc Source code location
2349@param gtid Global thread id
2350@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2351@param p_lb Pointer to the lower bound for the next chunk of work
2352@param p_ub Pointer to the upper bound for the next chunk of work
2353@param p_st Pointer to the stride for the next chunk of work
2354@return one if there is work to be done, zero otherwise
2355
2356Get the next dynamically allocated chunk of work for this thread.
2357If there is no more work, then the lb,ub and stride need not be modified.
2358*/
2359int
2360__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2361 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2362{
2363 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2364}
2365
2366/*!
2367See @ref __kmpc_dispatch_next_4
2368*/
2369int
2370__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2371 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2372{
2373 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2374}
2375
2376/*!
2377See @ref __kmpc_dispatch_next_4
2378*/
2379int
2380__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2381 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2382{
2383 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2384}
2385
2386/*!
2387See @ref __kmpc_dispatch_next_4
2388*/
2389int
2390__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2391 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2392{
2393 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2394}
2395
2396/*!
2397@param loc Source code location
2398@param gtid Global thread id
2399
2400Mark the end of a dynamic loop.
2401*/
2402void
2403__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2404{
2405 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2406}
2407
2408/*!
2409See @ref __kmpc_dispatch_fini_4
2410*/
2411void
2412__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2413{
2414 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2415}
2416
2417/*!
2418See @ref __kmpc_dispatch_fini_4
2419*/
2420void
2421__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2422{
2423 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2424}
2425
2426/*!
2427See @ref __kmpc_dispatch_fini_4
2428*/
2429void
2430__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2431{
2432 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2433}
2434/*! @} */
2435
2436//-----------------------------------------------------------------------------------------
2437// Non-template routines from kmp_dispatch.cpp used in other sources
2438
2439kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2440 return value == checker;
2441}
2442
2443kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2444 return value != checker;
2445}
2446
2447kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2448 return value < checker;
2449}
2450
2451kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2452 return value >= checker;
2453}
2454
2455kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2456 return value <= checker;
2457}
2458kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2459 return value == checker;
2460}
2461
2462kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2463 return value != checker;
2464}
2465
2466kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2467 return value < checker;
2468}
2469
2470kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2471 return value >= checker;
2472}
2473
2474kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2475 return value <= checker;
2476}
2477
2478kmp_uint32
2479__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2480 kmp_uint32 checker,
2481 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2482 , void * obj // Higher-level synchronization object, or NULL.
2483 )
2484{
2485 // note: we may not belong to a team at this point
2486 register volatile kmp_uint32 * spin = spinner;
2487 register kmp_uint32 check = checker;
2488 register kmp_uint32 spins;
2489 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2490 register kmp_uint32 r;
2491
2492 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2493 KMP_INIT_YIELD( spins );
2494 // main wait spin loop
2495 while(!f(r = TCR_4(*spin), check)) {
2496 KMP_FSYNC_SPIN_PREPARE( obj );
2497 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2498 It causes problems with infinite recursion because of exit lock */
2499 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2500 __kmp_abort_thread(); */
2501
2502        /* if we have waited a bit, or are oversubscribed, yield */
2503 /* pause is in the following code */
2504 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2505 KMP_YIELD_SPIN( spins );
2506 }
2507 KMP_FSYNC_SPIN_ACQUIRED( obj );
2508 return r;
2509}
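/*
 * Illustrative use (added commentary, not from the original source): spin until
 * another thread advances a 32-bit counter to at least an expected value,
 * re-using one of the predicate helpers above; 'flag' and 'expected' are
 * hypothetical names:
 *
 *   kmp_uint32 seen = __kmp_wait_yield_4( &flag, expected, __kmp_ge_4, NULL );
 */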
2510
2511kmp_uint64
2512__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2513 kmp_uint64 checker,
2514 kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2515 , void * obj // Higher-level synchronization object, or NULL.
2516 )
2517{
2518 // note: we may not belong to a team at this point
2519 register volatile kmp_uint64 * spin = spinner;
2520 register kmp_uint64 check = checker;
2521 register kmp_uint32 spins;
2522 register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2523 register kmp_uint64 r;
2524
2525 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2526 KMP_INIT_YIELD( spins );
2527 // main wait spin loop
2528 while(!f(r = *spin, check))
2529 {
2530 KMP_FSYNC_SPIN_PREPARE( obj );
2531 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2532 It causes problems with infinite recursion because of exit lock */
2533 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2534 __kmp_abort_thread(); */
2535
2536        // if we are oversubscribed,
2537        // or have waited a bit (and KMP_LIBRARY=throughput), then yield
2538 // pause is in the following code
2539 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2540 KMP_YIELD_SPIN( spins );
2541 }
2542 KMP_FSYNC_SPIN_ACQUIRED( obj );
2543 return r;
2544}
2545
2546} // extern "C"
2547
2548#ifdef KMP_GOMP_COMPAT
2549
2550void
2551__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2552 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2553 kmp_int32 chunk, int push_ws )
2554{
2555 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2556 push_ws );
2557}
2558
2559void
2560__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2561 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2562 kmp_int32 chunk, int push_ws )
2563{
2564 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2565 push_ws );
2566}
2567
2568void
2569__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2570 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2571 kmp_int64 chunk, int push_ws )
2572{
2573 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2574 push_ws );
2575}
2576
2577void
2578__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2579 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2580 kmp_int64 chunk, int push_ws )
2581{
2582 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2583 push_ws );
2584}
2585
2586void
2587__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2588{
2589 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2590}
2591
2592void
2593__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2594{
2595 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2596}
2597
2598void
2599__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2600{
2601 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2602}
2603
2604void
2605__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2606{
2607 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2608}
2609
2610#endif /* KMP_GOMP_COMPAT */
2611
2612/* ------------------------------------------------------------------------ */
2613/* ------------------------------------------------------------------------ */
2614