blob: 2c8bba6be4c00ac1d2adebbf0074ee8e08e3cd38 [file] [log] [blame]
Jim Cownie5e8470a2013-09-27 10:38:44 +00001/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
Jim Cownie5e8470a2013-09-27 10:38:44 +00003 */
4
5
6//===----------------------------------------------------------------------===//
7//
8// The LLVM Compiler Infrastructure
9//
10// This file is dual licensed under the MIT and the University of Illinois Open
11// Source Licenses. See LICENSE.txt for details.
12//
13//===----------------------------------------------------------------------===//
14
15
16/*
17 * Dynamic scheduling initialization and dispatch.
18 *
19 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
20 * it may change values between parallel regions. __kmp_max_nth
21 * is the largest value __kmp_nth may take, 1 is the smallest.
22 *
23 */
24
25/* ------------------------------------------------------------------------ */
26/* ------------------------------------------------------------------------ */
27
28#include "kmp.h"
29#include "kmp_i18n.h"
30#include "kmp_itt.h"
31#include "kmp_str.h"
32#include "kmp_error.h"
Jim Cownie4cc4bb42014-10-07 16:25:50 +000033#include "kmp_stats.h"
Jim Cownie5e8470a2013-09-27 10:38:44 +000034#if KMP_OS_WINDOWS && KMP_ARCH_X86
35 #include <float.h>
36#endif
37
Andrey Churbanovd7d088f2015-04-29 16:42:24 +000038#if OMPT_SUPPORT
39#include "ompt-internal.h"
40#include "ompt-specific.h"
41#endif
42
Jim Cownie5e8470a2013-09-27 10:38:44 +000043/* ------------------------------------------------------------------------ */
44/* ------------------------------------------------------------------------ */
45
Jim Cownie4cc4bb42014-10-07 16:25:50 +000046// template for type limits
47template< typename T >
48struct i_maxmin {
49 static const T mx;
50 static const T mn;
51};
52template<>
53struct i_maxmin< int > {
54 static const int mx = 0x7fffffff;
55 static const int mn = 0x80000000;
56};
57template<>
58struct i_maxmin< unsigned int > {
59 static const unsigned int mx = 0xffffffff;
60 static const unsigned int mn = 0x00000000;
61};
62template<>
63struct i_maxmin< long long > {
64 static const long long mx = 0x7fffffffffffffffLL;
65 static const long long mn = 0x8000000000000000LL;
66};
67template<>
68struct i_maxmin< unsigned long long > {
69 static const unsigned long long mx = 0xffffffffffffffffLL;
70 static const unsigned long long mn = 0x0000000000000000LL;
71};
72//-------------------------------------------------------------------------
73
Jim Cownie5e8470a2013-09-27 10:38:44 +000074#ifdef KMP_STATIC_STEAL_ENABLED
75
76 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
77 template< typename T >
78 struct dispatch_private_infoXX_template {
79 typedef typename traits_t< T >::unsigned_t UT;
80 typedef typename traits_t< T >::signed_t ST;
81 UT count; // unsigned
82 T ub;
83 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
84 T lb;
85 ST st; // signed
86 UT tc; // unsigned
87 T static_steal_counter; // for static_steal only; maybe better to put after ub
88
89 /* parm[1-4] are used in different ways by different scheduling algorithms */
90
91 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
92 // a) parm3 is properly aligned and
93 // b) all parm1-4 are in the same cache line.
94 // Because of parm1-4 are used together, performance seems to be better
95 // if they are in the same line (not measured though).
96
97 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
98 T parm1;
99 T parm2;
100 T parm3;
101 T parm4;
102 };
103
104 UT ordered_lower; // unsigned
105 UT ordered_upper; // unsigned
106 #if KMP_OS_WINDOWS
107 T last_upper;
108 #endif /* KMP_OS_WINDOWS */
109 };
110
111#else /* KMP_STATIC_STEAL_ENABLED */
112
113 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
114 template< typename T >
115 struct dispatch_private_infoXX_template {
116 typedef typename traits_t< T >::unsigned_t UT;
117 typedef typename traits_t< T >::signed_t ST;
118 T lb;
119 T ub;
120 ST st; // signed
121 UT tc; // unsigned
122
123 T parm1;
124 T parm2;
125 T parm3;
126 T parm4;
127
128 UT count; // unsigned
129
130 UT ordered_lower; // unsigned
131 UT ordered_upper; // unsigned
132 #if KMP_OS_WINDOWS
133 T last_upper;
134 #endif /* KMP_OS_WINDOWS */
135 };
136
137#endif /* KMP_STATIC_STEAL_ENABLED */
138
139// replaces dispatch_private_info structure and dispatch_private_info_t type
140template< typename T >
141struct KMP_ALIGN_CACHE dispatch_private_info_template {
142 // duplicate alignment here, otherwise size of structure is not correct in our compiler
143 union KMP_ALIGN_CACHE private_info_tmpl {
144 dispatch_private_infoXX_template< T > p;
145 dispatch_private_info64_t p64;
146 } u;
147 enum sched_type schedule; /* scheduling algorithm */
148 kmp_uint32 ordered; /* ordered clause specified */
149 kmp_uint32 ordered_bumped;
150 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making order
151 dispatch_private_info * next; /* stack of buffers for nest of serial regions */
152 kmp_uint32 nomerge; /* don't merge iters if serialized */
153 kmp_uint32 type_size;
154 enum cons_type pushed_ws;
155};
156
157
158// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
159template< typename UT >
160struct dispatch_shared_infoXX_template {
161 /* chunk index under dynamic, number of idle threads under static-steal;
162 iteration index otherwise */
163 volatile UT iteration;
164 volatile UT num_done;
165 volatile UT ordered_iteration;
166 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
167};
168
169// replaces dispatch_shared_info structure and dispatch_shared_info_t type
170template< typename UT >
171struct dispatch_shared_info_template {
172 // we need union here to keep the structure size
173 union shared_info_tmpl {
174 dispatch_shared_infoXX_template< UT > s;
175 dispatch_shared_info64_t s64;
176 } u;
177 volatile kmp_uint32 buffer_index;
178};
179
180/* ------------------------------------------------------------------------ */
181/* ------------------------------------------------------------------------ */
182
Jim Cownie5e8470a2013-09-27 10:38:44 +0000183#undef USE_TEST_LOCKS
184
185// test_then_add template (general template should NOT be used)
186template< typename T >
187static __forceinline T
188test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
189
190template<>
191__forceinline kmp_int32
192test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
193{
194 kmp_int32 r;
195 r = KMP_TEST_THEN_ADD32( p, d );
196 return r;
197}
198
199template<>
200__forceinline kmp_int64
201test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
202{
203 kmp_int64 r;
204 r = KMP_TEST_THEN_ADD64( p, d );
205 return r;
206}
207
208// test_then_inc_acq template (general template should NOT be used)
209template< typename T >
210static __forceinline T
211test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
212
213template<>
214__forceinline kmp_int32
215test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
216{
217 kmp_int32 r;
218 r = KMP_TEST_THEN_INC_ACQ32( p );
219 return r;
220}
221
222template<>
223__forceinline kmp_int64
224test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
225{
226 kmp_int64 r;
227 r = KMP_TEST_THEN_INC_ACQ64( p );
228 return r;
229}
230
231// test_then_inc template (general template should NOT be used)
232template< typename T >
233static __forceinline T
234test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
235
236template<>
237__forceinline kmp_int32
238test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
239{
240 kmp_int32 r;
241 r = KMP_TEST_THEN_INC32( p );
242 return r;
243}
244
245template<>
246__forceinline kmp_int64
247test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
248{
249 kmp_int64 r;
250 r = KMP_TEST_THEN_INC64( p );
251 return r;
252}
253
254// compare_and_swap template (general template should NOT be used)
255template< typename T >
256static __forceinline kmp_int32
257compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
258
259template<>
260__forceinline kmp_int32
261compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
262{
263 return KMP_COMPARE_AND_STORE_REL32( p, c, s );
264}
265
266template<>
267__forceinline kmp_int32
268compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
269{
270 return KMP_COMPARE_AND_STORE_REL64( p, c, s );
271}
272
273/*
274 Spin wait loop that first does pause, then yield.
275 Waits until function returns non-zero when called with *spinner and check.
276 Does NOT put threads to sleep.
277#if USE_ITT_BUILD
278 Arguments:
Alp Toker8f2d3f02014-02-24 10:40:15 +0000279 obj -- is higher-level synchronization object to report to ittnotify. It is used to report
Jim Cownie5e8470a2013-09-27 10:38:44 +0000280 locks consistently. For example, if lock is acquired immediately, its address is
281 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired
282 immediately and lock routine calls to KMP_WAIT_YIELD(), the later should report the same
283 address, not an address of low-level spinner.
284#endif // USE_ITT_BUILD
285*/
286template< typename UT >
287// ToDo: make inline function (move to header file for icl)
288static UT // unsigned 4- or 8-byte type
289__kmp_wait_yield( volatile UT * spinner,
290 UT checker,
291 kmp_uint32 (* pred)( UT, UT )
292 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
293 )
294{
295 // note: we may not belong to a team at this point
296 register volatile UT * spin = spinner;
297 register UT check = checker;
298 register kmp_uint32 spins;
299 register kmp_uint32 (*f) ( UT, UT ) = pred;
300 register UT r;
301
302 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
303 KMP_INIT_YIELD( spins );
304 // main wait spin loop
305 while(!f(r = *spin, check))
306 {
307 KMP_FSYNC_SPIN_PREPARE( obj );
308 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
309 It causes problems with infinite recursion because of exit lock */
310 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
311 __kmp_abort_thread(); */
312
Jim Cownie5e8470a2013-09-27 10:38:44 +0000313 // if we are oversubscribed,
314 // or have waited a bit (and KMP_LIBRARY=throughput, then yield
315 // pause is in the following code
316 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
317 KMP_YIELD_SPIN( spins );
318 }
319 KMP_FSYNC_SPIN_ACQUIRED( obj );
320 return r;
321}
322
323template< typename UT >
324static kmp_uint32 __kmp_eq( UT value, UT checker) {
325 return value == checker;
326}
327
328template< typename UT >
329static kmp_uint32 __kmp_neq( UT value, UT checker) {
330 return value != checker;
331}
332
333template< typename UT >
334static kmp_uint32 __kmp_lt( UT value, UT checker) {
335 return value < checker;
336}
337
338template< typename UT >
339static kmp_uint32 __kmp_ge( UT value, UT checker) {
340 return value >= checker;
341}
342
343template< typename UT >
344static kmp_uint32 __kmp_le( UT value, UT checker) {
345 return value <= checker;
346}
347
348
349/* ------------------------------------------------------------------------ */
350/* ------------------------------------------------------------------------ */
351
352static void
353__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
354{
355 kmp_info_t *th;
356
357 KMP_DEBUG_ASSERT( gtid_ref );
358
359 if ( __kmp_env_consistency_check ) {
360 th = __kmp_threads[*gtid_ref];
361 if ( th -> th.th_root -> r.r_active
362 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
Andrey Churbanov5c56fb52015-02-20 18:05:17 +0000363#if KMP_USE_DYNAMIC_LOCK
364 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
365#else
Jim Cownie5e8470a2013-09-27 10:38:44 +0000366 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
Andrey Churbanov5c56fb52015-02-20 18:05:17 +0000367#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000368 }
369 }
370}
371
372template< typename UT >
373static void
374__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
375{
376 typedef typename traits_t< UT >::signed_t ST;
377 dispatch_private_info_template< UT > * pr;
378
379 int gtid = *gtid_ref;
380// int cid = *cid_ref;
381 kmp_info_t *th = __kmp_threads[ gtid ];
382 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
383
384 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
385 if ( __kmp_env_consistency_check ) {
386 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
387 ( th -> th.th_dispatch -> th_dispatch_pr_current );
388 if ( pr -> pushed_ws != ct_none ) {
Andrey Churbanov5c56fb52015-02-20 18:05:17 +0000389#if KMP_USE_DYNAMIC_LOCK
390 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
391#else
Jim Cownie5e8470a2013-09-27 10:38:44 +0000392 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
Andrey Churbanov5c56fb52015-02-20 18:05:17 +0000393#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000394 }
395 }
396
397 if ( ! th -> th.th_team -> t.t_serialized ) {
398 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
399 ( th -> th.th_dispatch -> th_dispatch_sh_current );
400 UT lower;
401
402 if ( ! __kmp_env_consistency_check ) {
403 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
404 ( th -> th.th_dispatch -> th_dispatch_pr_current );
405 }
406 lower = pr->u.p.ordered_lower;
407
408 #if ! defined( KMP_GOMP_COMPAT )
409 if ( __kmp_env_consistency_check ) {
410 if ( pr->ordered_bumped ) {
411 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
412 __kmp_error_construct2(
413 kmp_i18n_msg_CnsMultipleNesting,
414 ct_ordered_in_pdo, loc_ref,
415 & p->stack_data[ p->w_top ]
416 );
417 }
418 }
419 #endif /* !defined(KMP_GOMP_COMPAT) */
420
421 KMP_MB();
422 #ifdef KMP_DEBUG
423 {
424 const char * buff;
425 // create format specifiers before the debug output
426 buff = __kmp_str_format(
427 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
428 traits_t< UT >::spec, traits_t< UT >::spec );
429 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
430 __kmp_str_free( &buff );
431 }
432 #endif
433
434 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
435 USE_ITT_BUILD_ARG( NULL )
436 );
437 KMP_MB(); /* is this necessary? */
438 #ifdef KMP_DEBUG
439 {
440 const char * buff;
441 // create format specifiers before the debug output
442 buff = __kmp_str_format(
443 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
444 traits_t< UT >::spec, traits_t< UT >::spec );
445 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
446 __kmp_str_free( &buff );
447 }
448 #endif
449 }
450 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
451}
452
453static void
454__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
455{
456 kmp_info_t *th;
457
458 if ( __kmp_env_consistency_check ) {
459 th = __kmp_threads[*gtid_ref];
460 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
461 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
462 }
463 }
464}
465
466template< typename UT >
467static void
468__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
469{
470 typedef typename traits_t< UT >::signed_t ST;
471 dispatch_private_info_template< UT > * pr;
472
473 int gtid = *gtid_ref;
474// int cid = *cid_ref;
475 kmp_info_t *th = __kmp_threads[ gtid ];
476 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
477
478 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
479 if ( __kmp_env_consistency_check ) {
480 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
481 ( th -> th.th_dispatch -> th_dispatch_pr_current );
482 if ( pr -> pushed_ws != ct_none ) {
483 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
484 }
485 }
486
487 if ( ! th -> th.th_team -> t.t_serialized ) {
488 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
489 ( th -> th.th_dispatch -> th_dispatch_sh_current );
490
491 if ( ! __kmp_env_consistency_check ) {
492 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
493 ( th -> th.th_dispatch -> th_dispatch_pr_current );
494 }
495
496 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
497 #if ! defined( KMP_GOMP_COMPAT )
498 if ( __kmp_env_consistency_check ) {
499 if ( pr->ordered_bumped != 0 ) {
500 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
501 /* How to test it? - OM */
502 __kmp_error_construct2(
503 kmp_i18n_msg_CnsMultipleNesting,
504 ct_ordered_in_pdo, loc_ref,
505 & p->stack_data[ p->w_top ]
506 );
507 }
508 }
509 #endif /* !defined(KMP_GOMP_COMPAT) */
510
511 KMP_MB(); /* Flush all pending memory write invalidates. */
512
513 pr->ordered_bumped += 1;
514
515 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
516 gtid, pr->ordered_bumped ) );
517
518 KMP_MB(); /* Flush all pending memory write invalidates. */
519
520 /* TODO use general release procedure? */
521 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
522
523 KMP_MB(); /* Flush all pending memory write invalidates. */
524 }
525 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
526}
527
528/* Computes and returns x to the power of y, where y must a non-negative integer */
529template< typename UT >
530static __forceinline long double
531__kmp_pow(long double x, UT y) {
532 long double s=1.0L;
533
534 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
535 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
536 while(y) {
537 if ( y & 1 )
538 s *= x;
539 x *= x;
540 y >>= 1;
541 }
542 return s;
543}
544
545/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
546 (the total number of unassigned iterations in chunks with index greater than or equal to idx).
547 __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
548 (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
549*/
550template< typename T >
551static __inline typename traits_t< T >::unsigned_t
552__kmp_dispatch_guided_remaining(
553 T tc,
554 typename traits_t< T >::floating_t base,
555 typename traits_t< T >::unsigned_t idx
556) {
557 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
558 least for ICL 8.1, long double arithmetic may not really have
559 long double precision, even with /Qlong_double. Currently, we
560 workaround that in the caller code, by manipulating the FPCW for
561 Windows* OS on IA-32 architecture. The lack of precision is not
562 expected to be a correctness issue, though.
563 */
564 typedef typename traits_t< T >::unsigned_t UT;
565
566 long double x = tc * __kmp_pow< UT >(base, idx);
567 UT r = (UT) x;
568 if ( x == r )
569 return r;
570 return r + 1;
571}
572
573// Parameters of the guided-iterative algorithm:
574// p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
575// p3 = 1 / ( n * nproc ) // remaining iterations multiplier
576// by default n = 2. For example with n = 3 the chunks distribution will be more flat.
577// With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
578static int guided_int_param = 2;
579static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
580
581// UT - unsigned flavor of T, ST - signed flavor of T,
582// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
583template< typename T >
584static void
585__kmp_dispatch_init(
586 ident_t * loc,
587 int gtid,
588 enum sched_type schedule,
589 T lb,
590 T ub,
591 typename traits_t< T >::signed_t st,
592 typename traits_t< T >::signed_t chunk,
593 int push_ws
594) {
595 typedef typename traits_t< T >::unsigned_t UT;
596 typedef typename traits_t< T >::signed_t ST;
597 typedef typename traits_t< T >::floating_t DBL;
598 static const int ___kmp_size_type = sizeof( UT );
599
600 int active;
601 T tc;
602 kmp_info_t * th;
603 kmp_team_t * team;
604 kmp_uint32 my_buffer_index;
605 dispatch_private_info_template< T > * pr;
606 dispatch_shared_info_template< UT > volatile * sh;
607
608 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
609 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
610
611 if ( ! TCR_4( __kmp_init_parallel ) )
612 __kmp_parallel_initialize();
613
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000614#if INCLUDE_SSC_MARKS
615 SSC_MARK_DISPATCH_INIT();
616#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000617 #ifdef KMP_DEBUG
618 {
619 const char * buff;
620 // create format specifiers before the debug output
621 buff = __kmp_str_format(
622 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
623 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
624 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
625 __kmp_str_free( &buff );
626 }
627 #endif
628 /* setup data */
629 th = __kmp_threads[ gtid ];
630 team = th -> th.th_team;
631 active = ! team -> t.t_serialized;
632 th->th.th_ident = loc;
633
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000634#if USE_ITT_BUILD
635 kmp_uint64 cur_chunk = chunk;
Andrey Churbanov51aecb82015-05-06 19:22:36 +0000636 int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
637 KMP_MASTER_GTID(gtid) &&
638#if OMP_40_ENABLED
639 th->th.th_teams_microtask == NULL &&
640#endif
641 team->t.t_active_level == 1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000642#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000643 if ( ! active ) {
644 pr = reinterpret_cast< dispatch_private_info_template< T >* >
645 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
646 } else {
647 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
648 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
649
650 my_buffer_index = th->th.th_dispatch->th_disp_index ++;
651
652 /* What happens when number of threads changes, need to resize buffer? */
653 pr = reinterpret_cast< dispatch_private_info_template< T > * >
654 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
655 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
656 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
657 }
658
659 /* Pick up the nomerge/ordered bits from the scheduling type */
660 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
661 pr->nomerge = TRUE;
662 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
663 } else {
664 pr->nomerge = FALSE;
665 }
666 pr->type_size = ___kmp_size_type; // remember the size of variables
667 if ( kmp_ord_lower & schedule ) {
668 pr->ordered = TRUE;
669 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
670 } else {
671 pr->ordered = FALSE;
672 }
Jonathan Peyton45be4502015-08-11 21:36:41 +0000673
Jim Cownie5e8470a2013-09-27 10:38:44 +0000674 if ( schedule == kmp_sch_static ) {
675 schedule = __kmp_static;
676 } else {
677 if ( schedule == kmp_sch_runtime ) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000678 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
679 schedule = team -> t.t_sched.r_sched_type;
680 // Detail the schedule if needed (global controls are differentiated appropriately)
681 if ( schedule == kmp_sch_guided_chunked ) {
682 schedule = __kmp_guided;
683 } else if ( schedule == kmp_sch_static ) {
684 schedule = __kmp_static;
685 }
686 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
687 chunk = team -> t.t_sched.chunk;
Jonathan Peyton00afbd02015-11-12 21:26:22 +0000688#if USE_ITT_BUILD
689 cur_chunk = chunk;
690#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000691 #ifdef KMP_DEBUG
692 {
693 const char * buff;
694 // create format specifiers before the debug output
695 buff = __kmp_str_format(
696 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
697 traits_t< ST >::spec );
698 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
699 __kmp_str_free( &buff );
700 }
701 #endif
702 } else {
703 if ( schedule == kmp_sch_guided_chunked ) {
704 schedule = __kmp_guided;
705 }
706 if ( chunk <= 0 ) {
707 chunk = KMP_DEFAULT_CHUNK;
708 }
709 }
710
Jim Cownie5e8470a2013-09-27 10:38:44 +0000711 if ( schedule == kmp_sch_auto ) {
712 // mapping and differentiation: in the __kmp_do_serial_initialize()
713 schedule = __kmp_auto;
714 #ifdef KMP_DEBUG
715 {
716 const char * buff;
717 // create format specifiers before the debug output
718 buff = __kmp_str_format(
719 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
720 traits_t< ST >::spec );
721 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
722 __kmp_str_free( &buff );
723 }
724 #endif
725 }
Jim Cownie5e8470a2013-09-27 10:38:44 +0000726
727 /* guided analytical not safe for too many threads */
728 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
729 schedule = kmp_sch_guided_iterative_chunked;
730 KMP_WARNING( DispatchManyThreads );
731 }
732 pr->u.p.parm1 = chunk;
733 }
734 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
735 "unknown scheduling type" );
736
737 pr->u.p.count = 0;
738
739 if ( __kmp_env_consistency_check ) {
740 if ( st == 0 ) {
741 __kmp_error_construct(
742 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
743 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
744 );
745 }
746 }
747
748 tc = ( ub - lb + st );
749 if ( st != 1 ) {
750 if ( st < 0 ) {
751 if ( lb < ub ) {
752 tc = 0; // zero-trip
753 } else { // lb >= ub
754 tc = (ST)tc / st; // convert to signed division
755 }
756 } else { // st > 0
757 if ( ub < lb ) {
758 tc = 0; // zero-trip
759 } else { // lb >= ub
760 tc /= st;
761 }
762 }
763 } else if ( ub < lb ) { // st == 1
764 tc = 0; // zero-trip
765 }
766
Jonathan Peyton45be4502015-08-11 21:36:41 +0000767 // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing
768 // when statistics are disabled.
769 if (schedule == __kmp_static)
770 {
771 KMP_COUNT_BLOCK(OMP_FOR_static);
772 KMP_COUNT_VALUE(FOR_static_iterations, tc);
773 }
774 else
775 {
776 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
777 KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
778 }
779
Jim Cownie5e8470a2013-09-27 10:38:44 +0000780 pr->u.p.lb = lb;
781 pr->u.p.ub = ub;
782 pr->u.p.st = st;
783 pr->u.p.tc = tc;
784
785 #if KMP_OS_WINDOWS
786 pr->u.p.last_upper = ub + st;
787 #endif /* KMP_OS_WINDOWS */
788
789 /* NOTE: only the active parallel region(s) has active ordered sections */
790
791 if ( active ) {
792 if ( pr->ordered == 0 ) {
793 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
794 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
795 } else {
796 pr->ordered_bumped = 0;
797
798 pr->u.p.ordered_lower = 1;
799 pr->u.p.ordered_upper = 0;
800
801 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
802 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
803 }
804 }
805
806 if ( __kmp_env_consistency_check ) {
807 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
808 if ( push_ws ) {
809 __kmp_push_workshare( gtid, ws, loc );
810 pr->pushed_ws = ws;
811 } else {
812 __kmp_check_workshare( gtid, ws, loc );
813 pr->pushed_ws = ct_none;
814 }
815 }
816
817 switch ( schedule ) {
818 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
819 case kmp_sch_static_steal:
820 {
821 T nproc = team->t.t_nproc;
822 T ntc, init;
823
824 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
825
826 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
827 if ( nproc > 1 && ntc >= nproc ) {
828 T id = __kmp_tid_from_gtid(gtid);
829 T small_chunk, extras;
830
831 small_chunk = ntc / nproc;
832 extras = ntc % nproc;
833
834 init = id * small_chunk + ( id < extras ? id : extras );
835 pr->u.p.count = init;
836 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
837
838 pr->u.p.parm2 = lb;
839 //pr->pfields.parm3 = 0; // it's not used in static_steal
840 pr->u.p.parm4 = id;
841 pr->u.p.st = st;
842 break;
843 } else {
844 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
845 gtid ) );
846 schedule = kmp_sch_static_balanced;
847 /* too few iterations: fall-through to kmp_sch_static_balanced */
848 } // if
849 /* FALL-THROUGH to static balanced */
850 } // case
851 #endif
852 case kmp_sch_static_balanced:
853 {
854 T nproc = team->t.t_nproc;
855 T init, limit;
856
857 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
858 gtid ) );
859
860 if ( nproc > 1 ) {
861 T id = __kmp_tid_from_gtid(gtid);
862
863 if ( tc < nproc ) {
864 if ( id < tc ) {
865 init = id;
866 limit = id;
867 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
868 } else {
869 pr->u.p.count = 1; /* means no more chunks to execute */
870 pr->u.p.parm1 = FALSE;
871 break;
872 }
873 } else {
874 T small_chunk = tc / nproc;
875 T extras = tc % nproc;
876 init = id * small_chunk + (id < extras ? id : extras);
877 limit = init + small_chunk - (id < extras ? 0 : 1);
878 pr->u.p.parm1 = (id == nproc - 1);
879 }
880 } else {
881 if ( tc > 0 ) {
882 init = 0;
883 limit = tc - 1;
884 pr->u.p.parm1 = TRUE;
885 } else {
886 // zero trip count
887 pr->u.p.count = 1; /* means no more chunks to execute */
888 pr->u.p.parm1 = FALSE;
889 break;
890 }
891 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000892#if USE_ITT_BUILD
893 // Calculate chunk for metadata report
Andrey Churbanov51aecb82015-05-06 19:22:36 +0000894 if ( itt_need_metadata_reporting )
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000895 cur_chunk = limit - init + 1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000896#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000897 if ( st == 1 ) {
898 pr->u.p.lb = lb + init;
899 pr->u.p.ub = lb + limit;
900 } else {
901 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
902 pr->u.p.lb = lb + init * st;
903 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
904 if ( st > 0 ) {
905 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
906 } else {
907 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
908 }
909 }
910 if ( pr->ordered ) {
911 pr->u.p.ordered_lower = init;
912 pr->u.p.ordered_upper = limit;
913 }
914 break;
915 } // case
916 case kmp_sch_guided_iterative_chunked :
917 {
918 T nproc = team->t.t_nproc;
919 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
920
921 if ( nproc > 1 ) {
922 if ( (2L * chunk + 1 ) * nproc >= tc ) {
923 /* chunk size too large, switch to dynamic */
924 schedule = kmp_sch_dynamic_chunked;
925 } else {
926 // when remaining iters become less than parm2 - switch to dynamic
927 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
928 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
929 }
930 } else {
931 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
932 schedule = kmp_sch_static_greedy;
933 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
934 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
935 pr->u.p.parm1 = tc;
936 } // if
937 } // case
938 break;
939 case kmp_sch_guided_analytical_chunked:
940 {
941 T nproc = team->t.t_nproc;
942 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
943
944 if ( nproc > 1 ) {
945 if ( (2L * chunk + 1 ) * nproc >= tc ) {
946 /* chunk size too large, switch to dynamic */
947 schedule = kmp_sch_dynamic_chunked;
948 } else {
949 /* commonly used term: (2 nproc - 1)/(2 nproc) */
950 DBL x;
951
952 #if KMP_OS_WINDOWS && KMP_ARCH_X86
953 /* Linux* OS already has 64-bit computation by default for
954 long double, and on Windows* OS on Intel(R) 64,
955 /Qlong_double doesn't work. On Windows* OS
956 on IA-32 architecture, we need to set precision to
957 64-bit instead of the default 53-bit. Even though long
958 double doesn't work on Windows* OS on Intel(R) 64, the
959 resulting lack of precision is not expected to impact
960 the correctness of the algorithm, but this has not been
961 mathematically proven.
962 */
963 // save original FPCW and set precision to 64-bit, as
964 // Windows* OS on IA-32 architecture defaults to 53-bit
Jim Cownie181b4bb2013-12-23 17:28:57 +0000965 unsigned int oldFpcw = _control87(0,0);
966 _control87(_PC_64,_MCW_PC); // 0,0x30000
Jim Cownie5e8470a2013-09-27 10:38:44 +0000967 #endif
968 /* value used for comparison in solver for cross-over point */
969 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
970
971 /* crossover point--chunk indexes equal to or greater than
972 this point switch to dynamic-style scheduling */
973 UT cross;
974
975 /* commonly used term: (2 nproc - 1)/(2 nproc) */
976 x = (long double)1.0 - (long double)0.5 / nproc;
977
978 #ifdef KMP_DEBUG
979 { // test natural alignment
980 struct _test_a {
981 char a;
982 union {
983 char b;
984 DBL d;
985 };
986 } t;
987 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
988 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
989 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
990 }
991 #endif // KMP_DEBUG
992
993 /* save the term in thread private dispatch structure */
994 *(DBL*)&pr->u.p.parm3 = x;
995
996 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
997 {
998 UT left, right, mid;
999 long double p;
1000
1001 /* estimate initial upper and lower bound */
1002
1003 /* doesn't matter what value right is as long as it is positive, but
1004 it affects performance of the solver
1005 */
1006 right = 229;
1007 p = __kmp_pow< UT >(x,right);
1008 if ( p > target ) {
1009 do{
1010 p *= p;
1011 right <<= 1;
1012 } while(p>target && right < (1<<27));
1013 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
1014 } else {
1015 left = 0;
1016 }
1017
1018 /* bisection root-finding method */
1019 while ( left + 1 < right ) {
1020 mid = (left + right) / 2;
1021 if ( __kmp_pow< UT >(x,mid) > target ) {
1022 left = mid;
1023 } else {
1024 right = mid;
1025 }
1026 } // while
1027 cross = right;
1028 }
1029 /* assert sanity of computed crossover point */
1030 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
1031
1032 /* save the crossover point in thread private dispatch structure */
1033 pr->u.p.parm2 = cross;
1034
1035 // C75803
1036 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1037 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1038 #else
1039 #define GUIDED_ANALYTICAL_WORKAROUND (x)
1040 #endif
1041 /* dynamic-style scheduling offset */
1042 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1043 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1044 // restore FPCW
Jim Cownie181b4bb2013-12-23 17:28:57 +00001045 _control87(oldFpcw,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001046 #endif
1047 } // if
1048 } else {
1049 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1050 gtid ) );
1051 schedule = kmp_sch_static_greedy;
1052 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1053 pr->u.p.parm1 = tc;
1054 } // if
1055 } // case
1056 break;
1057 case kmp_sch_static_greedy:
1058 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1059 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1060 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1061 tc;
1062 break;
1063 case kmp_sch_static_chunked :
1064 case kmp_sch_dynamic_chunked :
Jonathan Peyton70bda912015-11-06 20:32:44 +00001065 if ( pr->u.p.parm1 <= 0 ) {
1066 pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
1067 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00001068 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1069 break;
1070 case kmp_sch_trapezoidal :
1071 {
1072 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1073
1074 T parm1, parm2, parm3, parm4;
1075 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1076
1077 parm1 = chunk;
1078
1079 /* F : size of the first cycle */
1080 parm2 = ( tc / (2 * team->t.t_nproc) );
1081
1082 if ( parm2 < 1 ) {
1083 parm2 = 1;
1084 }
1085
1086 /* L : size of the last cycle. Make sure the last cycle
1087 * is not larger than the first cycle.
1088 */
1089 if ( parm1 < 1 ) {
1090 parm1 = 1;
1091 } else if ( parm1 > parm2 ) {
1092 parm1 = parm2;
1093 }
1094
1095 /* N : number of cycles */
1096 parm3 = ( parm2 + parm1 );
1097 parm3 = ( 2 * tc + parm3 - 1) / parm3;
1098
1099 if ( parm3 < 2 ) {
1100 parm3 = 2;
1101 }
1102
1103 /* sigma : decreasing incr of the trapezoid */
1104 parm4 = ( parm3 - 1 );
1105 parm4 = ( parm2 - parm1 ) / parm4;
1106
1107 // pointless check, because parm4 >= 0 always
1108 //if ( parm4 < 0 ) {
1109 // parm4 = 0;
1110 //}
1111
1112 pr->u.p.parm1 = parm1;
1113 pr->u.p.parm2 = parm2;
1114 pr->u.p.parm3 = parm3;
1115 pr->u.p.parm4 = parm4;
1116 } // case
1117 break;
1118
1119 default:
1120 {
1121 __kmp_msg(
1122 kmp_ms_fatal, // Severity
1123 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1124 KMP_HNT( GetNewerLibrary ), // Hint
1125 __kmp_msg_null // Variadic argument list terminator
1126 );
1127 }
1128 break;
1129 } // switch
1130 pr->schedule = schedule;
1131 if ( active ) {
1132 /* The name of this buffer should be my_buffer_index when it's free to use it */
1133
1134 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1135 gtid, my_buffer_index, sh->buffer_index) );
1136 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1137 USE_ITT_BUILD_ARG( NULL )
1138 );
1139 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1140 // *always* 32-bit integers.
1141 KMP_MB(); /* is this necessary? */
1142 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1143 gtid, my_buffer_index, sh->buffer_index) );
1144
1145 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1146 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1147#if USE_ITT_BUILD
1148 if ( pr->ordered ) {
1149 __kmp_itt_ordered_init( gtid );
1150 }; // if
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001151 // Report loop metadata
1152 if ( itt_need_metadata_reporting ) {
1153 // Only report metadata by master of active team at level 1
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001154 kmp_uint64 schedtype = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001155 switch ( schedule ) {
1156 case kmp_sch_static_chunked:
1157 case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1158 break;
1159 case kmp_sch_static_greedy:
1160 cur_chunk = pr->u.p.parm1;
1161 break;
1162 case kmp_sch_dynamic_chunked:
1163 schedtype = 1;
1164 break;
1165 case kmp_sch_guided_iterative_chunked:
1166 case kmp_sch_guided_analytical_chunked:
1167 schedtype = 2;
1168 break;
1169 default:
1170// Should we put this case under "static"?
1171// case kmp_sch_static_steal:
1172 schedtype = 3;
1173 break;
1174 }
1175 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1176 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001177#endif /* USE_ITT_BUILD */
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001178 }; // if
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001179
Jim Cownie5e8470a2013-09-27 10:38:44 +00001180 #ifdef KMP_DEBUG
1181 {
1182 const char * buff;
1183 // create format specifiers before the debug output
1184 buff = __kmp_str_format(
1185 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1186 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1187 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1188 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1189 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1190 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1191 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1192 KD_TRACE(10, ( buff,
1193 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1194 pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1195 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1196 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1197 __kmp_str_free( &buff );
1198 }
1199 #endif
1200 #if ( KMP_STATIC_STEAL_ENABLED )
1201 if ( ___kmp_size_type < 8 ) {
1202 // It cannot be guaranteed that after execution of a loop with some other schedule kind
1203 // all the parm3 variables will contain the same value.
1204 // Even if all parm3 will be the same, it still exists a bad case like using 0 and 1
1205 // rather than program life-time increment.
1206 // So the dedicated variable is required. The 'static_steal_counter' is used.
1207 if( schedule == kmp_sch_static_steal ) {
1208 // Other threads will inspect this variable when searching for a victim.
1209 // This is a flag showing that other threads may steal from this thread since then.
1210 volatile T * p = &pr->u.p.static_steal_counter;
1211 *p = *p + 1;
1212 }
1213 }
1214 #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING )
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001215
1216#if OMPT_SUPPORT && OMPT_TRACE
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001217 if (ompt_enabled &&
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001218 ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1219 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1220 ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1221 ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1222 team_info->parallel_id, task_info->task_id, team_info->microtask);
1223 }
1224#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +00001225}
1226
1227/*
1228 * For ordered loops, either __kmp_dispatch_finish() should be called after
1229 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1230 * every chunk of iterations. If the ordered section(s) were not executed
1231 * for this iteration (or every iteration in this chunk), we need to set the
1232 * ordered iteration counters so that the next thread can proceed.
1233 */
1234template< typename UT >
1235static void
1236__kmp_dispatch_finish( int gtid, ident_t *loc )
1237{
1238 typedef typename traits_t< UT >::signed_t ST;
1239 kmp_info_t *th = __kmp_threads[ gtid ];
1240
1241 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1242 if ( ! th -> th.th_team -> t.t_serialized ) {
1243
1244 dispatch_private_info_template< UT > * pr =
1245 reinterpret_cast< dispatch_private_info_template< UT >* >
1246 ( th->th.th_dispatch->th_dispatch_pr_current );
1247 dispatch_shared_info_template< UT > volatile * sh =
1248 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1249 ( th->th.th_dispatch->th_dispatch_sh_current );
1250 KMP_DEBUG_ASSERT( pr );
1251 KMP_DEBUG_ASSERT( sh );
1252 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1253 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1254
1255 if ( pr->ordered_bumped ) {
1256 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1257 gtid ) );
1258 pr->ordered_bumped = 0;
1259 } else {
1260 UT lower = pr->u.p.ordered_lower;
1261
1262 #ifdef KMP_DEBUG
1263 {
1264 const char * buff;
1265 // create format specifiers before the debug output
1266 buff = __kmp_str_format(
1267 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1268 traits_t< UT >::spec, traits_t< UT >::spec );
1269 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1270 __kmp_str_free( &buff );
1271 }
1272 #endif
1273
1274 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1275 USE_ITT_BUILD_ARG(NULL)
1276 );
1277 KMP_MB(); /* is this necessary? */
1278 #ifdef KMP_DEBUG
1279 {
1280 const char * buff;
1281 // create format specifiers before the debug output
1282 buff = __kmp_str_format(
1283 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1284 traits_t< UT >::spec, traits_t< UT >::spec );
1285 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1286 __kmp_str_free( &buff );
1287 }
1288 #endif
1289
1290 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1291 } // if
1292 } // if
1293 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1294}
1295
1296#ifdef KMP_GOMP_COMPAT
1297
1298template< typename UT >
1299static void
1300__kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1301{
1302 typedef typename traits_t< UT >::signed_t ST;
1303 kmp_info_t *th = __kmp_threads[ gtid ];
1304
1305 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1306 if ( ! th -> th.th_team -> t.t_serialized ) {
1307// int cid;
1308 dispatch_private_info_template< UT > * pr =
1309 reinterpret_cast< dispatch_private_info_template< UT >* >
1310 ( th->th.th_dispatch->th_dispatch_pr_current );
1311 dispatch_shared_info_template< UT > volatile * sh =
1312 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1313 ( th->th.th_dispatch->th_dispatch_sh_current );
1314 KMP_DEBUG_ASSERT( pr );
1315 KMP_DEBUG_ASSERT( sh );
1316 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1317 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1318
1319// for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1320 UT lower = pr->u.p.ordered_lower;
1321 UT upper = pr->u.p.ordered_upper;
1322 UT inc = upper - lower + 1;
1323
1324 if ( pr->ordered_bumped == inc ) {
1325 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1326 gtid ) );
1327 pr->ordered_bumped = 0;
1328 } else {
1329 inc -= pr->ordered_bumped;
1330
1331 #ifdef KMP_DEBUG
1332 {
1333 const char * buff;
1334 // create format specifiers before the debug output
1335 buff = __kmp_str_format(
1336 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1337 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1338 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1339 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1340 __kmp_str_free( &buff );
1341 }
1342 #endif
1343
1344 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1345 USE_ITT_BUILD_ARG(NULL)
1346 );
1347
1348 KMP_MB(); /* is this necessary? */
1349 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1350 gtid ) );
1351 pr->ordered_bumped = 0;
1352//!!!!! TODO check if the inc should be unsigned, or signed???
1353 #ifdef KMP_DEBUG
1354 {
1355 const char * buff;
1356 // create format specifiers before the debug output
1357 buff = __kmp_str_format(
1358 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1359 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1360 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1361 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1362 __kmp_str_free( &buff );
1363 }
1364 #endif
1365
1366 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1367 }
1368// }
1369 }
1370 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1371}
1372
1373#endif /* KMP_GOMP_COMPAT */
1374
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001375/* Define a macro for exiting __kmp_dispatch_next(). If status is 0
1376 * (no more work), then tell OMPT the loop is over. In some cases
1377 * kmp_dispatch_fini() is not called. */
1378#if OMPT_SUPPORT && OMPT_TRACE
1379#define OMPT_LOOP_END \
1380 if (status == 0) { \
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001381 if (ompt_enabled && \
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001382 ompt_callbacks.ompt_callback(ompt_event_loop_end)) { \
1383 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1384 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); \
1385 ompt_callbacks.ompt_callback(ompt_event_loop_end)( \
1386 team_info->parallel_id, task_info->task_id); \
1387 } \
1388 }
1389#else
1390#define OMPT_LOOP_END // no-op
1391#endif
1392
Jim Cownie5e8470a2013-09-27 10:38:44 +00001393template< typename T >
1394static int
1395__kmp_dispatch_next(
1396 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1397) {
1398
1399 typedef typename traits_t< T >::unsigned_t UT;
1400 typedef typename traits_t< T >::signed_t ST;
1401 typedef typename traits_t< T >::floating_t DBL;
Jonathan Peyton2321d572015-06-08 19:25:25 +00001402#if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001403 static const int ___kmp_size_type = sizeof( UT );
Jonathan Peyton2321d572015-06-08 19:25:25 +00001404#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +00001405
Jonathan Peyton45be4502015-08-11 21:36:41 +00001406 // This is potentially slightly misleading, schedule(runtime) will appear here even if the actual runtme schedule
1407 // is static. (Which points out a disadavantage of schedule(runtime): even when static scheduling is used it costs
1408 // more than a compile time choice to use static scheduling would.)
1409 KMP_TIME_BLOCK(FOR_dynamic_scheduling);
1410
Jim Cownie5e8470a2013-09-27 10:38:44 +00001411 int status;
1412 dispatch_private_info_template< T > * pr;
1413 kmp_info_t * th = __kmp_threads[ gtid ];
1414 kmp_team_t * team = th -> th.th_team;
1415
Andrey Churbanov9ad5c3a2015-07-13 17:52:41 +00001416 KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
Jim Cownie5e8470a2013-09-27 10:38:44 +00001417 #ifdef KMP_DEBUG
1418 {
1419 const char * buff;
1420 // create format specifiers before the debug output
1421 buff = __kmp_str_format(
1422 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1423 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1424 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1425 __kmp_str_free( &buff );
1426 }
1427 #endif
1428
1429 if ( team -> t.t_serialized ) {
1430 /* NOTE: serialize this dispatch becase we are not at the active level */
1431 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1432 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1433 KMP_DEBUG_ASSERT( pr );
1434
1435 if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1436 *p_lb = 0;
1437 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001438// if ( p_last != NULL )
1439// *p_last = 0;
1440 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001441 *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001442 if ( __kmp_env_consistency_check ) {
1443 if ( pr->pushed_ws != ct_none ) {
1444 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1445 }
1446 }
1447 } else if ( pr->nomerge ) {
1448 kmp_int32 last;
1449 T start;
1450 UT limit, trip, init;
1451 ST incr;
1452 T chunk = pr->u.p.parm1;
1453
1454 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1455
1456 init = chunk * pr->u.p.count++;
1457 trip = pr->u.p.tc - 1;
1458
1459 if ( (status = (init <= trip)) == 0 ) {
1460 *p_lb = 0;
1461 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001462// if ( p_last != NULL )
1463// *p_last = 0;
1464 if ( p_st != NULL )
1465 *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001466 if ( __kmp_env_consistency_check ) {
1467 if ( pr->pushed_ws != ct_none ) {
1468 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1469 }
1470 }
1471 } else {
1472 start = pr->u.p.lb;
1473 limit = chunk + init - 1;
1474 incr = pr->u.p.st;
1475
1476 if ( (last = (limit >= trip)) != 0 ) {
1477 limit = trip;
1478 #if KMP_OS_WINDOWS
1479 pr->u.p.last_upper = pr->u.p.ub;
1480 #endif /* KMP_OS_WINDOWS */
1481 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001482 if ( p_last != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001483 *p_last = last;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001484 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001485 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001486 if ( incr == 1 ) {
1487 *p_lb = start + init;
1488 *p_ub = start + limit;
1489 } else {
1490 *p_lb = start + init * incr;
1491 *p_ub = start + limit * incr;
1492 }
1493
1494 if ( pr->ordered ) {
1495 pr->u.p.ordered_lower = init;
1496 pr->u.p.ordered_upper = limit;
1497 #ifdef KMP_DEBUG
1498 {
1499 const char * buff;
1500 // create format specifiers before the debug output
1501 buff = __kmp_str_format(
1502 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1503 traits_t< UT >::spec, traits_t< UT >::spec );
1504 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1505 __kmp_str_free( &buff );
1506 }
1507 #endif
1508 } // if
1509 } // if
1510 } else {
1511 pr->u.p.tc = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001512 *p_lb = pr->u.p.lb;
1513 *p_ub = pr->u.p.ub;
1514 #if KMP_OS_WINDOWS
1515 pr->u.p.last_upper = *p_ub;
1516 #endif /* KMP_OS_WINDOWS */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001517 if ( p_last != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001518 *p_last = TRUE;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001519 if ( p_st != NULL )
1520 *p_st = pr->u.p.st;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001521 } // if
1522 #ifdef KMP_DEBUG
1523 {
1524 const char * buff;
1525 // create format specifiers before the debug output
1526 buff = __kmp_str_format(
1527 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001528 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
Jim Cownie5e8470a2013-09-27 10:38:44 +00001529 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001530 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
Jim Cownie5e8470a2013-09-27 10:38:44 +00001531 __kmp_str_free( &buff );
1532 }
1533 #endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001534#if INCLUDE_SSC_MARKS
1535 SSC_MARK_DISPATCH_NEXT();
1536#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001537 OMPT_LOOP_END;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001538 return status;
1539 } else {
1540 kmp_int32 last = 0;
1541 dispatch_shared_info_template< UT > *sh;
1542 T start;
1543 ST incr;
1544 UT limit, trip, init;
1545
1546 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1547 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1548
1549 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1550 ( th->th.th_dispatch->th_dispatch_pr_current );
1551 KMP_DEBUG_ASSERT( pr );
1552 sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1553 ( th->th.th_dispatch->th_dispatch_sh_current );
1554 KMP_DEBUG_ASSERT( sh );
1555
1556 if ( pr->u.p.tc == 0 ) {
1557 // zero trip count
1558 status = 0;
1559 } else {
1560 switch (pr->schedule) {
1561 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1562 case kmp_sch_static_steal:
1563 {
1564 T chunk = pr->u.p.parm1;
1565
1566 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1567
1568 trip = pr->u.p.tc - 1;
1569
1570 if ( ___kmp_size_type > 4 ) {
1571 // Other threads do not look into the data of this thread,
1572 // so it's not necessary to make volatile casting.
1573 init = ( pr->u.p.count )++;
1574 status = ( init < (UT)pr->u.p.ub );
1575 } else {
1576 typedef union {
1577 struct {
1578 UT count;
1579 T ub;
1580 } p;
1581 kmp_int64 b;
1582 } union_i4;
1583 // All operations on 'count' or 'ub' must be combined atomically together.
1584 // stealing implemented only for 4-byte indexes
1585 {
1586 union_i4 vold, vnew;
1587 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1588 vnew = vold;
1589 vnew.p.count++;
1590 while( ! KMP_COMPARE_AND_STORE_ACQ64(
1591 ( volatile kmp_int64* )&pr->u.p.count,
1592 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1593 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1594 KMP_CPU_PAUSE();
1595 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1596 vnew = vold;
1597 vnew.p.count++;
1598 }
1599 vnew = vold;
1600 init = vnew.p.count;
1601 status = ( init < (UT)vnew.p.ub ) ;
1602 }
1603
1604 if( !status ) {
1605 kmp_info_t **other_threads = team->t.t_threads;
1606 int while_limit = 10;
1607 int while_index = 0;
1608
1609 // TODO: algorithm of searching for a victim
1610 // should be cleaned up and measured
1611 while ( ( !status ) && ( while_limit != ++while_index ) ) {
1612 union_i4 vold, vnew;
1613 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1614 T victimIdx = pr->u.p.parm4;
1615 T oldVictimIdx = victimIdx;
1616 dispatch_private_info_template< T > * victim;
1617
1618 do {
1619 if( !victimIdx ) {
1620 victimIdx = team->t.t_nproc - 1;
1621 } else {
1622 --victimIdx;
1623 }
1624 victim = reinterpret_cast< dispatch_private_info_template< T >* >
1625 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1626 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1627                         // TODO: think about a proper place for this test
1628 if ( ( !victim ) ||
1629 ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1630 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1631 // TODO: delay would be nice
1632 continue;
1633 // the victim is not ready yet to participate in stealing
1634 // because the victim is still in kmp_init_dispatch
1635 }
1636 if ( oldVictimIdx == victimIdx ) {
1637 break;
1638 }
1639 pr->u.p.parm4 = victimIdx;
1640
1641 while( 1 ) {
1642 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1643 vnew = vold;
1644
1645 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1646 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1647 break;
1648 }
1649 vnew.p.ub -= (remaining >> 2);
1650 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1651 #pragma warning( push )
1652 // disable warning on pointless comparison of unsigned with 0
1653 #pragma warning( disable: 186 )
1654 KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1655 #pragma warning( pop )
1656 // TODO: Should this be acquire or release?
1657 if ( KMP_COMPARE_AND_STORE_ACQ64(
1658 ( volatile kmp_int64 * )&victim->u.p.count,
1659 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1660 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1661 status = 1;
1662 while_index = 0;
1663 // now update own count and ub
1664 #if KMP_ARCH_X86
1665 // stealing executed on non-KMP_ARCH_X86 only
1666 // Atomic 64-bit write on ia32 is
1667 // unavailable, so we do this in steps.
1668 // This code is not tested.
1669 init = vold.p.count;
1670 pr->u.p.ub = 0;
1671 pr->u.p.count = init + 1;
1672 pr->u.p.ub = vnew.p.count;
1673 #else
1674 init = vnew.p.ub;
1675 vold.p.count = init + 1;
1676                                 // TODO: is this safe and sufficient?
1677 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1678 #endif // KMP_ARCH_X86
1679 break;
1680 } // if
1681 KMP_CPU_PAUSE();
1682 } // while (1)
1683 } // while
1684 } // if
1685 } // if
1686 if ( !status ) {
1687 *p_lb = 0;
1688 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001689 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001690 } else {
1691 start = pr->u.p.parm2;
1692 init *= chunk;
1693 limit = chunk + init - 1;
1694 incr = pr->u.p.st;
1695
1696 KMP_DEBUG_ASSERT(init <= trip);
1697 if ( (last = (limit >= trip)) != 0 )
1698 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001699 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001700
1701 if ( incr == 1 ) {
1702 *p_lb = start + init;
1703 *p_ub = start + limit;
1704 } else {
1705 *p_lb = start + init * incr;
1706 *p_ub = start + limit * incr;
1707 }
1708
1709 if ( pr->ordered ) {
1710 pr->u.p.ordered_lower = init;
1711 pr->u.p.ordered_upper = limit;
1712 #ifdef KMP_DEBUG
1713 {
1714 const char * buff;
1715 // create format specifiers before the debug output
1716 buff = __kmp_str_format(
1717 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1718 traits_t< UT >::spec, traits_t< UT >::spec );
1719 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1720 __kmp_str_free( &buff );
1721 }
1722 #endif
1723 } // if
1724 } // if
1725 break;
1726 } // case
1727 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
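            // The 4-byte-index stealing path above packs the per-thread {count, ub} pair into
            // one 64-bit word so that trimming the victim's upper bound and claiming the stolen
            // range happen in a single compare-and-swap. A minimal standalone sketch of that
            // idea, using std::atomic in place of KMP_COMPARE_AND_STORE_ACQ64 and with
            // hypothetical names (Packed, try_steal); it relies on the same C-style union
            // punning as the runtime code above:
            //
            //   #include <atomic>
            //   #include <cstdint>
            //
            //   union Packed {                        // mirrors union_i4 above
            //       struct { uint32_t count; uint32_t ub; } p;
            //       uint64_t b;
            //   };
            //
            //   // Try to steal roughly a quarter of the victim's remaining chunk indexes.
            //   // On success returns true and the stolen half-open range [begin, end).
            //   static bool try_steal(std::atomic<uint64_t> &victim,
            //                         uint32_t &begin, uint32_t &end) {
            //       Packed vold, vnew;
            //       vold.b = victim.load();
            //       for (;;) {
            //           vnew = vold;
            //           int64_t remaining = (int64_t)vnew.p.ub - (int64_t)vnew.p.count;
            //           if (remaining < 4)
            //               return false;                         // too little left to steal
            //           vnew.p.ub -= (uint32_t)(remaining >> 2);  // trim victim's upper bound
            //           if (victim.compare_exchange_weak(vold.b, vnew.b)) {
            //               begin = vnew.p.ub;                    // thief takes [vnew.ub, vold.ub)
            //               end   = vold.p.ub;
            //               return true;
            //           }                                         // lost the race; vold reloaded
            //       }
            //   }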
1728 case kmp_sch_static_balanced:
1729 {
1730 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1731 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1732 pr->u.p.count = 1;
1733 *p_lb = pr->u.p.lb;
1734 *p_ub = pr->u.p.ub;
1735 last = pr->u.p.parm1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001736 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001737 *p_st = pr->u.p.st;
1738 } else { /* no iterations to do */
1739 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1740 }
1741 if ( pr->ordered ) {
1742 #ifdef KMP_DEBUG
1743 {
1744 const char * buff;
1745 // create format specifiers before the debug output
1746 buff = __kmp_str_format(
1747 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1748 traits_t< UT >::spec, traits_t< UT >::spec );
1749 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1750 __kmp_str_free( &buff );
1751 }
1752 #endif
1753 } // if
1754 } // case
1755 break;
1756 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1757 case kmp_sch_static_chunked:
1758 {
1759 T parm1;
1760
1761 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1762 gtid ) );
1763 parm1 = pr->u.p.parm1;
1764
1765 trip = pr->u.p.tc - 1;
1766 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1767
1768 if ( (status = (init <= trip)) != 0 ) {
1769 start = pr->u.p.lb;
1770 incr = pr->u.p.st;
1771 limit = parm1 + init - 1;
1772
1773 if ( (last = (limit >= trip)) != 0 )
1774 limit = trip;
1775
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001776 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001777
1778 pr->u.p.count += team->t.t_nproc;
1779
1780 if ( incr == 1 ) {
1781 *p_lb = start + init;
1782 *p_ub = start + limit;
1783 }
1784 else {
1785 *p_lb = start + init * incr;
1786 *p_ub = start + limit * incr;
1787 }
1788
1789 if ( pr->ordered ) {
1790 pr->u.p.ordered_lower = init;
1791 pr->u.p.ordered_upper = limit;
1792 #ifdef KMP_DEBUG
1793 {
1794 const char * buff;
1795 // create format specifiers before the debug output
1796 buff = __kmp_str_format(
1797 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1798 traits_t< UT >::spec, traits_t< UT >::spec );
1799 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1800 __kmp_str_free( &buff );
1801 }
1802 #endif
1803 } // if
1804 } // if
1805 } // case
1806 break;
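            // Worked example of the static,chunk bookkeeping above (illustrative values):
            // each thread owns chunk indexes tid, tid+nproc, tid+2*nproc, ..., and the first
            // iteration of chunk index k is parm1*k. With nproc = 4, chunk (parm1) = 10 and
            // tid = 2, successive calls hand out chunks starting at iterations 20, 60, 100, ...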
1807
1808 case kmp_sch_dynamic_chunked:
1809 {
1810 T chunk = pr->u.p.parm1;
1811
1812 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1813 gtid ) );
1814
1815 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1816 trip = pr->u.p.tc - 1;
1817
1818 if ( (status = (init <= trip)) == 0 ) {
1819 *p_lb = 0;
1820 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001821 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001822 } else {
1823 start = pr->u.p.lb;
1824 limit = chunk + init - 1;
1825 incr = pr->u.p.st;
1826
1827 if ( (last = (limit >= trip)) != 0 )
1828 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001829
1830 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001831
1832 if ( incr == 1 ) {
1833 *p_lb = start + init;
1834 *p_ub = start + limit;
1835 } else {
1836 *p_lb = start + init * incr;
1837 *p_ub = start + limit * incr;
1838 }
1839
1840 if ( pr->ordered ) {
1841 pr->u.p.ordered_lower = init;
1842 pr->u.p.ordered_upper = limit;
1843 #ifdef KMP_DEBUG
1844 {
1845 const char * buff;
1846 // create format specifiers before the debug output
1847 buff = __kmp_str_format(
1848 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1849 traits_t< UT >::spec, traits_t< UT >::spec );
1850 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1851 __kmp_str_free( &buff );
1852 }
1853 #endif
1854 } // if
1855 } // if
1856 } // case
1857 break;
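            // The dynamic,chunk case above boils down to one atomic fetch-and-increment of a
            // shared chunk counter per __kmp_dispatch_next call. A minimal standalone sketch
            // (not runtime code; next_dynamic_chunk and its parameters are hypothetical, and a
            // non-zero trip count tc is assumed):
            //
            //   #include <atomic>
            //   #include <cstdint>
            //
            //   // Returns false once every chunk has been handed out.
            //   static bool next_dynamic_chunk(std::atomic<int64_t> &iteration, // shared counter
            //                                  uint64_t tc, uint64_t chunk,     // trip count, chunk size
            //                                  int64_t lb, int64_t st,          // loop lower bound, stride
            //                                  int64_t &p_lb, int64_t &p_ub) {
            //       uint64_t init = chunk * (uint64_t)iteration.fetch_add(1); // first iteration index
            //       if (init > tc - 1)
            //           return false;                                         // nothing left
            //       uint64_t limit = init + chunk - 1;
            //       if (limit > tc - 1)
            //           limit = tc - 1;                                       // last chunk may be short
            //       p_lb = lb + (int64_t)init  * st;
            //       p_ub = lb + (int64_t)limit * st;
            //       return true;
            //   }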
1858
1859 case kmp_sch_guided_iterative_chunked:
1860 {
1861 T chunkspec = pr->u.p.parm1;
1862 KD_TRACE(100,
1863 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1864 trip = pr->u.p.tc;
1865 // Start atomic part of calculations
1866 while(1) {
1867 ST remaining; // signed, because can be < 0
1868 init = sh->u.s.iteration; // shared value
1869 remaining = trip - init;
1870 if ( remaining <= 0 ) { // AC: need to compare with 0 first
1871 // nothing to do, don't try atomic op
1872 status = 0;
1873 break;
1874 }
1875 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1876                                 // use dynamic-style schedule
1877                                 // atomically increment iterations, get old value
1878 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1879 remaining = trip - init;
1880 if (remaining <= 0) {
1881 status = 0; // all iterations got by other threads
1882 } else {
1883 // got some iterations to work on
1884 status = 1;
1885 if ( (T)remaining > chunkspec ) {
1886 limit = init + chunkspec - 1;
1887 } else {
1888 last = 1; // the last chunk
1889 limit = init + remaining - 1;
1890 } // if
1891 } // if
1892 break;
1893 } // if
1894 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1895 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1896 // CAS was successful, chunk obtained
1897 status = 1;
1898 --limit;
1899 break;
1900 } // if
1901 } // while
1902 if ( status != 0 ) {
1903 start = pr->u.p.lb;
1904 incr = pr->u.p.st;
1905 if ( p_st != NULL )
1906 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001907 *p_lb = start + init * incr;
1908 *p_ub = start + limit * incr;
1909 if ( pr->ordered ) {
1910 pr->u.p.ordered_lower = init;
1911 pr->u.p.ordered_upper = limit;
1912 #ifdef KMP_DEBUG
1913 {
1914 const char * buff;
1915 // create format specifiers before the debug output
1916 buff = __kmp_str_format(
1917 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1918 traits_t< UT >::spec, traits_t< UT >::spec );
1919 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1920 __kmp_str_free( &buff );
1921 }
1922 #endif
1923 } // if
1924 } else {
1925 *p_lb = 0;
1926 *p_ub = 0;
1927 if ( p_st != NULL )
1928 *p_st = 0;
1929 } // if
1930 } // case
1931 break;
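            // The guided_iterative grab above claims remaining/(K*nproc) iterations per CAS on
            // the shared counter (parm3 caches 1/(K*nproc) as a double) and falls back to plain
            // chunkspec-sized pieces near the end. A simplified standalone sketch (hypothetical
            // names; the runtime additionally uses an atomic fetch-add for the tail instead of
            // CAS):
            //
            //   #include <atomic>
            //   #include <cstdint>
            //
            //   static bool next_guided_chunk(std::atomic<int64_t> &iteration,  // shared counter
            //                                 int64_t trip, int64_t chunkspec,
            //                                 double x,                         // 1/(K*nproc)
            //                                 int64_t &init, int64_t &limit) {
            //       for (;;) {
            //           init = iteration.load();
            //           int64_t remaining = trip - init;
            //           if (remaining <= 0)
            //               return false;                    // everything already claimed
            //           int64_t claim = (int64_t)(remaining * x);
            //           if (claim < chunkspec)
            //               claim = chunkspec;               // tail: fall back to fixed chunks
            //           if (claim > remaining)
            //               claim = remaining;
            //           if (iteration.compare_exchange_weak(init, init + claim)) {
            //               limit = init + claim - 1;        // chunk is [init, limit]
            //               return true;
            //           }                                    // lost the race; retry
            //       }
            //   }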
1932
1933 case kmp_sch_guided_analytical_chunked:
1934 {
1935 T chunkspec = pr->u.p.parm1;
1936 UT chunkIdx;
1937 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1938 /* for storing original FPCW value for Windows* OS on
1939 IA-32 architecture 8-byte version */
1940 unsigned int oldFpcw;
Jim Cownie181b4bb2013-12-23 17:28:57 +00001941 unsigned int fpcwSet = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001942 #endif
1943 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1944 gtid ) );
1945
1946 trip = pr->u.p.tc;
1947
1948 KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1949 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1950
1951 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1952 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1953 if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1954 --trip;
1955 /* use dynamic-style scheduling */
1956 init = chunkIdx * chunkspec + pr->u.p.count;
1957 /* need to verify init > 0 in case of overflow in the above calculation */
1958 if ( (status = (init > 0 && init <= trip)) != 0 ) {
1959 limit = init + chunkspec -1;
1960
1961 if ( (last = (limit >= trip)) != 0 )
1962 limit = trip;
1963 }
1964 break;
1965 } else {
1966 /* use exponential-style scheduling */
1967 /* The following check is to work around the lack of long double precision on Windows* OS.
1968 This check works around the possible effect that init != 0 for chunkIdx == 0.
1969 */
1970 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1971 /* If we haven't already done so, save original
1972 FPCW and set precision to 64-bit, as Windows* OS
1973 on IA-32 architecture defaults to 53-bit */
1974 if ( !fpcwSet ) {
Jim Cownie181b4bb2013-12-23 17:28:57 +00001975 oldFpcw = _control87(0,0);
1976 _control87(_PC_64,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001977 fpcwSet = 0x30000;
1978 }
1979 #endif
1980 if ( chunkIdx ) {
1981 init = __kmp_dispatch_guided_remaining< T >(
1982 trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1983 KMP_DEBUG_ASSERT(init);
1984 init = trip - init;
1985 } else
1986 init = 0;
1987 limit = trip - __kmp_dispatch_guided_remaining< T >(
1988 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1989 KMP_ASSERT(init <= limit);
1990 if ( init < limit ) {
1991 KMP_DEBUG_ASSERT(limit <= trip);
1992 --limit;
1993 status = 1;
1994 break;
1995 } // if
1996 } // if
1997 } // while (1)
1998 #if KMP_OS_WINDOWS && KMP_ARCH_X86
Jim Cownie181b4bb2013-12-23 17:28:57 +00001999 /* restore FPCW if necessary
2000 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
2001 */
2002 if ( fpcwSet && ( oldFpcw & fpcwSet ) )
2003 _control87(oldFpcw,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002004 #endif
2005 if ( status != 0 ) {
2006 start = pr->u.p.lb;
2007 incr = pr->u.p.st;
2008 if ( p_st != NULL )
2009 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002010 *p_lb = start + init * incr;
2011 *p_ub = start + limit * incr;
2012 if ( pr->ordered ) {
2013 pr->u.p.ordered_lower = init;
2014 pr->u.p.ordered_upper = limit;
2015 #ifdef KMP_DEBUG
2016 {
2017 const char * buff;
2018 // create format specifiers before the debug output
2019 buff = __kmp_str_format(
2020 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2021 traits_t< UT >::spec, traits_t< UT >::spec );
2022 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2023 __kmp_str_free( &buff );
2024 }
2025 #endif
2026 }
2027 } else {
2028 *p_lb = 0;
2029 *p_ub = 0;
2030 if ( p_st != NULL )
2031 *p_st = 0;
2032 }
2033 } // case
2034 break;
2035
2036 case kmp_sch_trapezoidal:
2037 {
2038 UT index;
2039 T parm2 = pr->u.p.parm2;
2040 T parm3 = pr->u.p.parm3;
2041 T parm4 = pr->u.p.parm4;
2042 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2043 gtid ) );
2044
2045 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2046
2047 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2048 trip = pr->u.p.tc - 1;
2049
2050 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2051 *p_lb = 0;
2052 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002053 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002054 } else {
2055 start = pr->u.p.lb;
2056 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2057 incr = pr->u.p.st;
2058
2059 if ( (last = (limit >= trip)) != 0 )
2060 limit = trip;
2061
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002062 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002063
2064 if ( incr == 1 ) {
2065 *p_lb = start + init;
2066 *p_ub = start + limit;
2067 } else {
2068 *p_lb = start + init * incr;
2069 *p_ub = start + limit * incr;
2070 }
2071
2072 if ( pr->ordered ) {
2073 pr->u.p.ordered_lower = init;
2074 pr->u.p.ordered_upper = limit;
2075 #ifdef KMP_DEBUG
2076 {
2077 const char * buff;
2078 // create format specifiers before the debug output
2079 buff = __kmp_str_format(
2080 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2081 traits_t< UT >::spec, traits_t< UT >::spec );
2082 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2083 __kmp_str_free( &buff );
2084 }
2085 #endif
2086 } // if
2087 } // if
2088 } // case
2089 break;
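            // Worked example of the trapezoidal arithmetic above (illustrative values):
            // chunk sizes shrink linearly from parm2 by parm4 per chunk over parm3 chunks,
            // so the first iteration of chunk k is the arithmetic-series partial sum
            //     init(k) = k*parm2 - parm4*(0+1+...+(k-1)) = k*(2*parm2 - (k-1)*parm4)/2,
            // and its last iteration is init(k+1) - 1. For instance, with parm2 = 10 and
            // parm4 = 2 the chunk sizes are 10, 8, 6, ..., so chunk 2 starts at
            // 2*(20 - 1*2)/2 = 18 and ends at 3*(20 - 2*2)/2 - 1 = 23.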
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002090 default:
2091 {
2092 status = 0; // to avoid complaints on uninitialized variable use
2093 __kmp_msg(
2094 kmp_ms_fatal, // Severity
2095 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2096 KMP_HNT( GetNewerLibrary ), // Hint
2097 __kmp_msg_null // Variadic argument list terminator
2098 );
2099 }
2100 break;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002101 } // switch
2102 } // if tc == 0;
2103
2104 if ( status == 0 ) {
2105 UT num_done;
2106
2107 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2108 #ifdef KMP_DEBUG
2109 {
2110 const char * buff;
2111 // create format specifiers before the debug output
2112 buff = __kmp_str_format(
2113 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2114 traits_t< UT >::spec );
2115 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2116 __kmp_str_free( &buff );
2117 }
2118 #endif
2119
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002120 if ( (ST)num_done == team->t.t_nproc-1 ) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00002121 /* NOTE: release this buffer to be reused */
2122
2123 KMP_MB(); /* Flush all pending memory write invalidates. */
2124
2125 sh->u.s.num_done = 0;
2126 sh->u.s.iteration = 0;
2127
2128 /* TODO replace with general release procedure? */
2129 if ( pr->ordered ) {
2130 sh->u.s.ordered_iteration = 0;
2131 }
2132
2133 KMP_MB(); /* Flush all pending memory write invalidates. */
2134
2135 sh -> buffer_index += KMP_MAX_DISP_BUF;
2136 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2137 gtid, sh->buffer_index) );
2138
2139 KMP_MB(); /* Flush all pending memory write invalidates. */
2140
2141 } // if
2142 if ( __kmp_env_consistency_check ) {
2143 if ( pr->pushed_ws != ct_none ) {
2144 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2145 }
2146 }
2147
2148 th -> th.th_dispatch -> th_deo_fcn = NULL;
2149 th -> th.th_dispatch -> th_dxo_fcn = NULL;
2150 th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2151 th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2152 } // if (status == 0)
2153#if KMP_OS_WINDOWS
2154 else if ( last ) {
2155 pr->u.p.last_upper = pr->u.p.ub;
2156 }
2157#endif /* KMP_OS_WINDOWS */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002158 if ( p_last != NULL && status != 0 )
2159 *p_last = last;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002160 } // if
2161
2162 #ifdef KMP_DEBUG
2163 {
2164 const char * buff;
2165 // create format specifiers before the debug output
2166 buff = __kmp_str_format(
2167 "__kmp_dispatch_next: T#%%d normal case: " \
2168 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2169 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2170 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2171 __kmp_str_free( &buff );
2172 }
2173 #endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002174#if INCLUDE_SSC_MARKS
2175 SSC_MARK_DISPATCH_NEXT();
2176#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00002177 OMPT_LOOP_END;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002178 return status;
2179}
2180
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002181template< typename T >
2182static void
2183__kmp_dist_get_bounds(
2184 ident_t *loc,
2185 kmp_int32 gtid,
2186 kmp_int32 *plastiter,
2187 T *plower,
2188 T *pupper,
2189 typename traits_t< T >::signed_t incr
2190) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002191 typedef typename traits_t< T >::unsigned_t UT;
2192 typedef typename traits_t< T >::signed_t ST;
2193 register kmp_uint32 team_id;
2194 register kmp_uint32 nteams;
2195 register UT trip_count;
2196 register kmp_team_t *team;
2197 kmp_info_t * th;
2198
2199 KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2200 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2201 #ifdef KMP_DEBUG
2202 {
2203 const char * buff;
2204 // create format specifiers before the debug output
2205 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2206 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2207 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2208 traits_t< T >::spec );
2209 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2210 __kmp_str_free( &buff );
2211 }
2212 #endif
2213
2214 if( __kmp_env_consistency_check ) {
2215 if( incr == 0 ) {
2216 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2217 }
2218 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2219 // The loop is illegal.
2220 // Some zero-trip loops are kept by the compiler, e.g.:
2221 // for(i=10;i<0;++i) // lower >= upper - run-time check
2222 // for(i=0;i>10;--i) // lower <= upper - run-time check
2223 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2224 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2225 // Compiler does not check the following illegal loops:
2226 // for(i=0;i<10;i+=incr) // where incr<0
2227 // for(i=10;i>0;i-=incr) // where incr<0
2228 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2229 }
2230 }
2231 th = __kmp_threads[gtid];
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002232 team = th->th.th_team;
2233 #if OMP_40_ENABLED
Jonathan Peyton441f3372015-09-21 17:24:46 +00002234 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002235 nteams = th->th.th_teams_size.nteams;
2236 #endif
2237 team_id = team->t.t_master_tid;
2238 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2239
2240 // compute global trip count
2241 if( incr == 1 ) {
2242 trip_count = *pupper - *plower + 1;
2243 } else if(incr == -1) {
2244 trip_count = *plower - *pupper + 1;
2245 } else {
2246 trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2247 }
Jonathan Peyton45be4502015-08-11 21:36:41 +00002248
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002249 if( trip_count <= nteams ) {
2250 KMP_DEBUG_ASSERT(
2251 __kmp_static == kmp_sch_static_greedy || \
2252 __kmp_static == kmp_sch_static_balanced
2253 ); // Unknown static scheduling type.
2254 // only some teams get a single iteration, the others get nothing
2255 if( team_id < trip_count ) {
2256 *pupper = *plower = *plower + team_id * incr;
2257 } else {
2258 *plower = *pupper + incr; // zero-trip loop
2259 }
2260 if( plastiter != NULL )
2261 *plastiter = ( team_id == trip_count - 1 );
2262 } else {
2263 if( __kmp_static == kmp_sch_static_balanced ) {
2264 register UT chunk = trip_count / nteams;
2265 register UT extras = trip_count % nteams;
2266 *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2267 *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2268 if( plastiter != NULL )
2269 *plastiter = ( team_id == nteams - 1 );
2270 } else {
2271 register T chunk_inc_count =
2272 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2273 register T upper = *pupper;
2274 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2275 // Unknown static scheduling type.
2276 *plower += team_id * chunk_inc_count;
2277 *pupper = *plower + chunk_inc_count - incr;
2278 // Check/correct bounds if needed
2279 if( incr > 0 ) {
2280 if( *pupper < *plower )
2281 *pupper = i_maxmin< T >::mx;
2282 if( plastiter != NULL )
2283 *plastiter = *plower <= upper && *pupper > upper - incr;
2284 if( *pupper > upper )
2285 *pupper = upper; // tracker C73258
2286 } else {
2287 if( *pupper > *plower )
2288 *pupper = i_maxmin< T >::mn;
2289 if( plastiter != NULL )
2290 *plastiter = *plower >= upper && *pupper < upper - incr;
2291 if( *pupper < upper )
2292 *pupper = upper; // tracker C73258
2293 }
2294 }
2295 }
2296}
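// The trip_count > nteams branch above is the classic balanced split: every team gets
// trip_count/nteams iterations and the first trip_count%nteams teams get one extra.
// A minimal standalone sketch (not runtime code; team_bounds and its parameters are
// hypothetical):
//
//   #include <cstdint>
//
//   static void team_bounds(uint64_t trip, uint32_t nteams, uint32_t team_id,
//                           int64_t lb, int64_t incr,
//                           int64_t &team_lb, int64_t &team_ub) {
//       uint64_t chunk  = trip / nteams;
//       uint64_t extras = trip % nteams;
//       uint64_t first  = team_id * chunk + (team_id < extras ? team_id : extras);
//       uint64_t count  = chunk + (team_id < extras ? 1 : 0);
//       team_lb = lb + (int64_t)first * incr;
//       team_ub = team_lb + ((int64_t)count - 1) * incr;
//   }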
2297
Jim Cownie5e8470a2013-09-27 10:38:44 +00002298//-----------------------------------------------------------------------------------------
2299// Dispatch routines
2300// Transfer call to template< type T >
2301// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2302// T lb, T ub, ST st, ST chunk )
2303extern "C" {
2304
2305/*!
2306@ingroup WORK_SHARING
2307@{
2308@param loc Source location
2309@param gtid Global thread id
2310@param schedule Schedule type
2311@param lb Lower bound
2312@param ub Upper bound
2313@param st Step (or increment if you prefer)
2314@param chunk The chunk size to block with
2315
2316This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2317These functions are all identical apart from the types of the arguments.
2318*/
2319
2320void
2321__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2322 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2323{
2324 KMP_DEBUG_ASSERT( __kmp_init_serial );
2325 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2326}
2327/*!
2328See @ref __kmpc_dispatch_init_4
2329*/
2330void
2331__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2332 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2333{
2334 KMP_DEBUG_ASSERT( __kmp_init_serial );
2335 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2336}
2337
2338/*!
2339See @ref __kmpc_dispatch_init_4
2340*/
2341void
2342__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2343 kmp_int64 lb, kmp_int64 ub,
2344 kmp_int64 st, kmp_int64 chunk )
2345{
2346 KMP_DEBUG_ASSERT( __kmp_init_serial );
2347 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2348}
2349
2350/*!
2351See @ref __kmpc_dispatch_init_4
2352*/
2353void
2354__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2355 kmp_uint64 lb, kmp_uint64 ub,
2356 kmp_int64 st, kmp_int64 chunk )
2357{
2358 KMP_DEBUG_ASSERT( __kmp_init_serial );
2359 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2360}
2361
2362/*!
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002363See @ref __kmpc_dispatch_init_4
2364
2365These functions differ from the __kmpc_dispatch_init set of functions in that
2366they are called for the composite distribute parallel for construct. Thus, before
2367the regular iterations are dispatched, the per-team iteration space must be computed.
2368
2369These functions are all identical apart from the types of the arguments.
2370*/
2371void
2372__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2373 kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2374{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002375 KMP_DEBUG_ASSERT( __kmp_init_serial );
2376 __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2377 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2378}
2379
2380void
2381__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2382 kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2383{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002384 KMP_DEBUG_ASSERT( __kmp_init_serial );
2385 __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2386 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2387}
2388
2389void
2390__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2391 kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2392{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002393 KMP_DEBUG_ASSERT( __kmp_init_serial );
2394 __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2395 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2396}
2397
2398void
2399__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2400 kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2401{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002402 KMP_DEBUG_ASSERT( __kmp_init_serial );
2403 __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2404 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2405}
2406
2407/*!
Jim Cownie5e8470a2013-09-27 10:38:44 +00002408@param loc Source code location
2409@param gtid Global thread id
2410@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2411@param p_lb Pointer to the lower bound for the next chunk of work
2412@param p_ub Pointer to the upper bound for the next chunk of work
2413@param p_st Pointer to the stride for the next chunk of work
2414@return one if there is work to be done, zero otherwise
2415
2416Get the next dynamically allocated chunk of work for this thread.
2417If there is no more work, then the lb,ub and stride need not be modified.
2418*/
2419int
2420__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2421 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2422{
2423 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2424}
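/*
A sketch of the call sequence a compiler typically emits around this API for a loop
with schedule(dynamic, 4); illustrative only -- the bounds, ident_t handling and body()
are hypothetical, and a positive stride is assumed:

    extern void body(kmp_int32 i);

    void run_dynamic_loop(ident_t *loc, kmp_int32 gtid) {
        kmp_int32 lb = 0, ub = 999, st = 1, last = 0;
        __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, lb, ub, st, 4);
        while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
            for (kmp_int32 i = lb; i <= ub; i += st)
                body(i);
        }
    }
*/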
2425
2426/*!
2427See @ref __kmpc_dispatch_next_4
2428*/
2429int
2430__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2431 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2432{
2433 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2434}
2435
2436/*!
2437See @ref __kmpc_dispatch_next_4
2438*/
2439int
2440__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2441 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2442{
2443 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2444}
2445
2446/*!
2447See @ref __kmpc_dispatch_next_4
2448*/
2449int
2450__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2451 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2452{
2453 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2454}
2455
2456/*!
2457@param loc Source code location
2458@param gtid Global thread id
2459
2460Mark the end of a dynamic loop.
2461*/
2462void
2463__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2464{
2465 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2466}
2467
2468/*!
2469See @ref __kmpc_dispatch_fini_4
2470*/
2471void
2472__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2473{
2474 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2475}
2476
2477/*!
2478See @ref __kmpc_dispatch_fini_4
2479*/
2480void
2481__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2482{
2483 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2484}
2485
2486/*!
2487See @ref __kmpc_dispatch_fini_4
2488*/
2489void
2490__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2491{
2492 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2493}
2494/*! @} */
2495
2496//-----------------------------------------------------------------------------------------
2497 // Non-template routines from kmp_dispatch.cpp used in other sources
2498
2499kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2500 return value == checker;
2501}
2502
2503kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2504 return value != checker;
2505}
2506
2507kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2508 return value < checker;
2509}
2510
2511kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2512 return value >= checker;
2513}
2514
2515kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2516 return value <= checker;
2517}
2518kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2519 return value == checker;
2520}
2521
2522kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2523 return value != checker;
2524}
2525
2526kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2527 return value < checker;
2528}
2529
2530kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2531 return value >= checker;
2532}
2533
2534kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2535 return value <= checker;
2536}
2537
2538kmp_uint32
2539__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2540 kmp_uint32 checker,
2541 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2542 , void * obj // Higher-level synchronization object, or NULL.
2543 )
2544{
2545 // note: we may not belong to a team at this point
2546 register volatile kmp_uint32 * spin = spinner;
2547 register kmp_uint32 check = checker;
2548 register kmp_uint32 spins;
2549 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2550 register kmp_uint32 r;
2551
2552 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2553 KMP_INIT_YIELD( spins );
2554 // main wait spin loop
2555 while(!f(r = TCR_4(*spin), check)) {
2556 KMP_FSYNC_SPIN_PREPARE( obj );
2557 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2558 It causes problems with infinite recursion because of exit lock */
2559 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2560 __kmp_abort_thread(); */
2561
Jim Cownie5e8470a2013-09-27 10:38:44 +00002562 /* if we have waited a bit, or are oversubscribed, yield */
2563 /* pause is in the following code */
2564 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2565 KMP_YIELD_SPIN( spins );
2566 }
2567 KMP_FSYNC_SPIN_ACQUIRED( obj );
2568 return r;
2569}
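/*
Illustrative use of the spin-wait helper above (the flag variable is hypothetical): block
the caller, yielding when oversubscribed, until another thread publishes the expected value:

    volatile kmp_uint32 flag = 0;
    // ... some other thread eventually stores: flag = 1;
    __kmp_wait_yield_4( &flag, 1, __kmp_eq_4, NULL );
*/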
2570
2571kmp_uint64
2572__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2573 kmp_uint64 checker,
2574 kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2575 , void * obj // Higher-level synchronization object, or NULL.
2576 )
2577{
2578 // note: we may not belong to a team at this point
2579 register volatile kmp_uint64 * spin = spinner;
2580 register kmp_uint64 check = checker;
2581 register kmp_uint32 spins;
2582 register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2583 register kmp_uint64 r;
2584
2585 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2586 KMP_INIT_YIELD( spins );
2587 // main wait spin loop
2588 while(!f(r = *spin, check))
2589 {
2590 KMP_FSYNC_SPIN_PREPARE( obj );
2591 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2592 It causes problems with infinite recursion because of exit lock */
2593 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2594 __kmp_abort_thread(); */
2595
Jim Cownie5e8470a2013-09-27 10:38:44 +00002596 // if we are oversubscribed,
2597 // or have waited a bit (and KMP_LIBRARY=throughput), then yield
2598 // pause is in the following code
2599 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2600 KMP_YIELD_SPIN( spins );
2601 }
2602 KMP_FSYNC_SPIN_ACQUIRED( obj );
2603 return r;
2604}
2605
2606} // extern "C"
2607
2608#ifdef KMP_GOMP_COMPAT
2609
2610void
2611__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2612 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2613 kmp_int32 chunk, int push_ws )
2614{
2615 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2616 push_ws );
2617}
2618
2619void
2620__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2621 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2622 kmp_int32 chunk, int push_ws )
2623{
2624 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2625 push_ws );
2626}
2627
2628void
2629__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2630 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2631 kmp_int64 chunk, int push_ws )
2632{
2633 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2634 push_ws );
2635}
2636
2637void
2638__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2639 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2640 kmp_int64 chunk, int push_ws )
2641{
2642 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2643 push_ws );
2644}
2645
2646void
2647__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2648{
2649 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2650}
2651
2652void
2653__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2654{
2655 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2656}
2657
2658void
2659__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2660{
2661 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2662}
2663
2664void
2665__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2666{
2667 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2668}
2669
2670#endif /* KMP_GOMP_COMPAT */
2671
2672/* ------------------------------------------------------------------------ */
2673/* ------------------------------------------------------------------------ */
2674