Jim Cownie5e8470a2013-09-27 10:38:44 +00001/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
Jim Cownie5e8470a2013-09-27 10:38:44 +00003 */
4
5
6//===----------------------------------------------------------------------===//
7//
8// The LLVM Compiler Infrastructure
9//
10// This file is dual licensed under the MIT and the University of Illinois Open
11// Source Licenses. See LICENSE.txt for details.
12//
13//===----------------------------------------------------------------------===//
14
15
16/*
17 * Dynamic scheduling initialization and dispatch.
18 *
 19 * NOTE: __kmp_nth is constant inside any dispatch loop; however,
 20 * it may change values between parallel regions. __kmp_max_nth
 21 * is the largest value __kmp_nth may take; 1 is the smallest.
22 *
23 */
24
25/* ------------------------------------------------------------------------ */
26/* ------------------------------------------------------------------------ */
27
28#include "kmp.h"
29#include "kmp_i18n.h"
30#include "kmp_itt.h"
31#include "kmp_str.h"
32#include "kmp_error.h"
Jim Cownie4cc4bb42014-10-07 16:25:50 +000033#include "kmp_stats.h"
Jim Cownie5e8470a2013-09-27 10:38:44 +000034#if KMP_OS_WINDOWS && KMP_ARCH_X86
35 #include <float.h>
36#endif
37
Andrey Churbanovd7d088f2015-04-29 16:42:24 +000038#if OMPT_SUPPORT
39#include "ompt-internal.h"
40#include "ompt-specific.h"
41#endif
42
Jim Cownie5e8470a2013-09-27 10:38:44 +000043/* ------------------------------------------------------------------------ */
44/* ------------------------------------------------------------------------ */
45
Jim Cownie4cc4bb42014-10-07 16:25:50 +000046// template for type limits
47template< typename T >
48struct i_maxmin {
49 static const T mx;
50 static const T mn;
51};
52template<>
53struct i_maxmin< int > {
54 static const int mx = 0x7fffffff;
55 static const int mn = 0x80000000;
56};
57template<>
58struct i_maxmin< unsigned int > {
59 static const unsigned int mx = 0xffffffff;
60 static const unsigned int mn = 0x00000000;
61};
62template<>
63struct i_maxmin< long long > {
64 static const long long mx = 0x7fffffffffffffffLL;
65 static const long long mn = 0x8000000000000000LL;
66};
67template<>
68struct i_maxmin< unsigned long long > {
69 static const unsigned long long mx = 0xffffffffffffffffLL;
70 static const unsigned long long mn = 0x0000000000000000LL;
71};
72//-------------------------------------------------------------------------
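// A minimal compile-time sanity sketch (illustrative only and guarded out of
// the build; it assumes a C++11 compiler for static_assert). The
// specializations above are intended to agree with <limits>:
#if 0
#include <limits>
static_assert( i_maxmin< int >::mx == std::numeric_limits< int >::max(),
               "i_maxmin<int>::mx should equal numeric_limits<int>::max()" );
static_assert( i_maxmin< unsigned int >::mn == std::numeric_limits< unsigned int >::min(),
               "i_maxmin<unsigned int>::mn should equal numeric_limits<unsigned int>::min()" );
#endif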
73
Jim Cownie5e8470a2013-09-27 10:38:44 +000074#ifdef KMP_STATIC_STEAL_ENABLED
75
76 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
77 template< typename T >
78 struct dispatch_private_infoXX_template {
79 typedef typename traits_t< T >::unsigned_t UT;
80 typedef typename traits_t< T >::signed_t ST;
81 UT count; // unsigned
82 T ub;
83 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
84 T lb;
85 ST st; // signed
86 UT tc; // unsigned
87 T static_steal_counter; // for static_steal only; maybe better to put after ub
88
89 /* parm[1-4] are used in different ways by different scheduling algorithms */
90
91 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
92 // a) parm3 is properly aligned and
93 // b) all parm1-4 are in the same cache line.
 94 // Because parm1-4 are used together, performance seems to be better
95 // if they are in the same line (not measured though).
96
97 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
98 T parm1;
99 T parm2;
100 T parm3;
101 T parm4;
102 };
103
104 UT ordered_lower; // unsigned
105 UT ordered_upper; // unsigned
106 #if KMP_OS_WINDOWS
107 T last_upper;
108 #endif /* KMP_OS_WINDOWS */
109 };
110
111#else /* KMP_STATIC_STEAL_ENABLED */
112
113 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
114 template< typename T >
115 struct dispatch_private_infoXX_template {
116 typedef typename traits_t< T >::unsigned_t UT;
117 typedef typename traits_t< T >::signed_t ST;
118 T lb;
119 T ub;
120 ST st; // signed
121 UT tc; // unsigned
122
123 T parm1;
124 T parm2;
125 T parm3;
126 T parm4;
127
128 UT count; // unsigned
129
130 UT ordered_lower; // unsigned
131 UT ordered_upper; // unsigned
132 #if KMP_OS_WINDOWS
133 T last_upper;
134 #endif /* KMP_OS_WINDOWS */
135 };
136
137#endif /* KMP_STATIC_STEAL_ENABLED */
138
139// replaces dispatch_private_info structure and dispatch_private_info_t type
140template< typename T >
141struct KMP_ALIGN_CACHE dispatch_private_info_template {
142 // duplicate alignment here, otherwise size of structure is not correct in our compiler
143 union KMP_ALIGN_CACHE private_info_tmpl {
144 dispatch_private_infoXX_template< T > p;
145 dispatch_private_info64_t p64;
146 } u;
147 enum sched_type schedule; /* scheduling algorithm */
148 kmp_uint32 ordered; /* ordered clause specified */
149 kmp_uint32 ordered_bumped;
 150 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
151 dispatch_private_info * next; /* stack of buffers for nest of serial regions */
152 kmp_uint32 nomerge; /* don't merge iters if serialized */
153 kmp_uint32 type_size;
154 enum cons_type pushed_ws;
155};
156
157
158// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
159template< typename UT >
160struct dispatch_shared_infoXX_template {
161 /* chunk index under dynamic, number of idle threads under static-steal;
162 iteration index otherwise */
163 volatile UT iteration;
164 volatile UT num_done;
165 volatile UT ordered_iteration;
166 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
167};
168
169// replaces dispatch_shared_info structure and dispatch_shared_info_t type
170template< typename UT >
171struct dispatch_shared_info_template {
172 // we need union here to keep the structure size
173 union shared_info_tmpl {
174 dispatch_shared_infoXX_template< UT > s;
175 dispatch_shared_info64_t s64;
176 } u;
177 volatile kmp_uint32 buffer_index;
178};
179
180/* ------------------------------------------------------------------------ */
181/* ------------------------------------------------------------------------ */
182
Jim Cownie5e8470a2013-09-27 10:38:44 +0000183#undef USE_TEST_LOCKS
184
185// test_then_add template (general template should NOT be used)
186template< typename T >
187static __forceinline T
188test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
189
190template<>
191__forceinline kmp_int32
192test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
193{
194 kmp_int32 r;
195 r = KMP_TEST_THEN_ADD32( p, d );
196 return r;
197}
198
199template<>
200__forceinline kmp_int64
201test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
202{
203 kmp_int64 r;
204 r = KMP_TEST_THEN_ADD64( p, d );
205 return r;
206}
207
208// test_then_inc_acq template (general template should NOT be used)
209template< typename T >
210static __forceinline T
211test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
212
213template<>
214__forceinline kmp_int32
215test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
216{
217 kmp_int32 r;
218 r = KMP_TEST_THEN_INC_ACQ32( p );
219 return r;
220}
221
222template<>
223__forceinline kmp_int64
224test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
225{
226 kmp_int64 r;
227 r = KMP_TEST_THEN_INC_ACQ64( p );
228 return r;
229}
230
231// test_then_inc template (general template should NOT be used)
232template< typename T >
233static __forceinline T
234test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
235
236template<>
237__forceinline kmp_int32
238test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
239{
240 kmp_int32 r;
241 r = KMP_TEST_THEN_INC32( p );
242 return r;
243}
244
245template<>
246__forceinline kmp_int64
247test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
248{
249 kmp_int64 r;
250 r = KMP_TEST_THEN_INC64( p );
251 return r;
252}
253
254// compare_and_swap template (general template should NOT be used)
255template< typename T >
256static __forceinline kmp_int32
257compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
258
259template<>
260__forceinline kmp_int32
261compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
262{
263 return KMP_COMPARE_AND_STORE_REL32( p, c, s );
264}
265
266template<>
267__forceinline kmp_int32
268compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
269{
270 return KMP_COMPARE_AND_STORE_REL64( p, c, s );
271}
272
273/*
274 Spin wait loop that first does pause, then yield.
275 Waits until function returns non-zero when called with *spinner and check.
276 Does NOT put threads to sleep.
277#if USE_ITT_BUILD
278 Arguments:
Alp Toker8f2d3f02014-02-24 10:40:15 +0000279 obj -- is a higher-level synchronization object to report to ittnotify. It is used to report
Jim Cownie5e8470a2013-09-27 10:38:44 +0000280 locks consistently. For example, if the lock is acquired immediately, its address is
 281 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
 282 immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
 283 address, not the address of the low-level spinner.
284#endif // USE_ITT_BUILD
285*/
286template< typename UT >
287// ToDo: make inline function (move to header file for icl)
288static UT // unsigned 4- or 8-byte type
289__kmp_wait_yield( volatile UT * spinner,
290 UT checker,
291 kmp_uint32 (* pred)( UT, UT )
292 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
293 )
294{
295 // note: we may not belong to a team at this point
296 register volatile UT * spin = spinner;
297 register UT check = checker;
298 register kmp_uint32 spins;
299 register kmp_uint32 (*f) ( UT, UT ) = pred;
300 register UT r;
301
302 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
303 KMP_INIT_YIELD( spins );
304 // main wait spin loop
305 while(!f(r = *spin, check))
306 {
307 KMP_FSYNC_SPIN_PREPARE( obj );
308 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
309 It causes problems with infinite recursion because of exit lock */
310 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
311 __kmp_abort_thread(); */
312
Jim Cownie5e8470a2013-09-27 10:38:44 +0000313 // if we are oversubscribed,
 314 // or have waited a bit (and KMP_LIBRARY=throughput), then yield;
 315 // the pause is in the following code
316 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
317 KMP_YIELD_SPIN( spins );
318 }
319 KMP_FSYNC_SPIN_ACQUIRED( obj );
320 return r;
321}
322
323template< typename UT >
324static kmp_uint32 __kmp_eq( UT value, UT checker) {
325 return value == checker;
326}
327
328template< typename UT >
329static kmp_uint32 __kmp_neq( UT value, UT checker) {
330 return value != checker;
331}
332
333template< typename UT >
334static kmp_uint32 __kmp_lt( UT value, UT checker) {
335 return value < checker;
336}
337
338template< typename UT >
339static kmp_uint32 __kmp_ge( UT value, UT checker) {
340 return value >= checker;
341}
342
343template< typename UT >
344static kmp_uint32 __kmp_le( UT value, UT checker) {
345 return value <= checker;
346}
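// A minimal usage sketch of the spin-wait and predicate templates above
// (illustrative fragment only; it mirrors the real calls made later in this
// file, and the names UT/sh/lower stand in for the caller's locals):
#if 0
UT value = __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
                                   USE_ITT_BUILD_ARG( NULL ) );
#endif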
347
348
349/* ------------------------------------------------------------------------ */
350/* ------------------------------------------------------------------------ */
351
352static void
353__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
354{
355 kmp_info_t *th;
356
357 KMP_DEBUG_ASSERT( gtid_ref );
358
359 if ( __kmp_env_consistency_check ) {
360 th = __kmp_threads[*gtid_ref];
361 if ( th -> th.th_root -> r.r_active
362 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
Andrey Churbanov5c56fb52015-02-20 18:05:17 +0000363#if KMP_USE_DYNAMIC_LOCK
364 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
365#else
Jim Cownie5e8470a2013-09-27 10:38:44 +0000366 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
Andrey Churbanov5c56fb52015-02-20 18:05:17 +0000367#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000368 }
369 }
370}
371
372template< typename UT >
373static void
374__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
375{
376 typedef typename traits_t< UT >::signed_t ST;
377 dispatch_private_info_template< UT > * pr;
378
379 int gtid = *gtid_ref;
380// int cid = *cid_ref;
381 kmp_info_t *th = __kmp_threads[ gtid ];
382 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
383
384 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
385 if ( __kmp_env_consistency_check ) {
386 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
387 ( th -> th.th_dispatch -> th_dispatch_pr_current );
388 if ( pr -> pushed_ws != ct_none ) {
Andrey Churbanov5c56fb52015-02-20 18:05:17 +0000389#if KMP_USE_DYNAMIC_LOCK
390 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
391#else
Jim Cownie5e8470a2013-09-27 10:38:44 +0000392 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
Andrey Churbanov5c56fb52015-02-20 18:05:17 +0000393#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000394 }
395 }
396
397 if ( ! th -> th.th_team -> t.t_serialized ) {
398 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
399 ( th -> th.th_dispatch -> th_dispatch_sh_current );
400 UT lower;
401
402 if ( ! __kmp_env_consistency_check ) {
403 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
404 ( th -> th.th_dispatch -> th_dispatch_pr_current );
405 }
406 lower = pr->u.p.ordered_lower;
407
408 #if ! defined( KMP_GOMP_COMPAT )
409 if ( __kmp_env_consistency_check ) {
410 if ( pr->ordered_bumped ) {
411 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
412 __kmp_error_construct2(
413 kmp_i18n_msg_CnsMultipleNesting,
414 ct_ordered_in_pdo, loc_ref,
415 & p->stack_data[ p->w_top ]
416 );
417 }
418 }
419 #endif /* !defined(KMP_GOMP_COMPAT) */
420
421 KMP_MB();
422 #ifdef KMP_DEBUG
423 {
424 const char * buff;
425 // create format specifiers before the debug output
426 buff = __kmp_str_format(
427 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
428 traits_t< UT >::spec, traits_t< UT >::spec );
429 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
430 __kmp_str_free( &buff );
431 }
432 #endif
433
434 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
435 USE_ITT_BUILD_ARG( NULL )
436 );
437 KMP_MB(); /* is this necessary? */
438 #ifdef KMP_DEBUG
439 {
440 const char * buff;
441 // create format specifiers before the debug output
442 buff = __kmp_str_format(
443 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
444 traits_t< UT >::spec, traits_t< UT >::spec );
445 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
446 __kmp_str_free( &buff );
447 }
448 #endif
449 }
450 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
451}
452
453static void
454__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
455{
456 kmp_info_t *th;
457
458 if ( __kmp_env_consistency_check ) {
459 th = __kmp_threads[*gtid_ref];
460 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
461 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
462 }
463 }
464}
465
466template< typename UT >
467static void
468__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
469{
470 typedef typename traits_t< UT >::signed_t ST;
471 dispatch_private_info_template< UT > * pr;
472
473 int gtid = *gtid_ref;
474// int cid = *cid_ref;
475 kmp_info_t *th = __kmp_threads[ gtid ];
476 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
477
478 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
479 if ( __kmp_env_consistency_check ) {
480 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
481 ( th -> th.th_dispatch -> th_dispatch_pr_current );
482 if ( pr -> pushed_ws != ct_none ) {
483 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
484 }
485 }
486
487 if ( ! th -> th.th_team -> t.t_serialized ) {
488 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
489 ( th -> th.th_dispatch -> th_dispatch_sh_current );
490
491 if ( ! __kmp_env_consistency_check ) {
492 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
493 ( th -> th.th_dispatch -> th_dispatch_pr_current );
494 }
495
496 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
497 #if ! defined( KMP_GOMP_COMPAT )
498 if ( __kmp_env_consistency_check ) {
499 if ( pr->ordered_bumped != 0 ) {
500 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
501 /* How to test it? - OM */
502 __kmp_error_construct2(
503 kmp_i18n_msg_CnsMultipleNesting,
504 ct_ordered_in_pdo, loc_ref,
505 & p->stack_data[ p->w_top ]
506 );
507 }
508 }
509 #endif /* !defined(KMP_GOMP_COMPAT) */
510
511 KMP_MB(); /* Flush all pending memory write invalidates. */
512
513 pr->ordered_bumped += 1;
514
515 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
516 gtid, pr->ordered_bumped ) );
517
518 KMP_MB(); /* Flush all pending memory write invalidates. */
519
520 /* TODO use general release procedure? */
521 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
522
523 KMP_MB(); /* Flush all pending memory write invalidates. */
524 }
525 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
526}
527
528/* Computes and returns x to the power of y, where y must be a non-negative integer */
529template< typename UT >
530static __forceinline long double
531__kmp_pow(long double x, UT y) {
532 long double s=1.0L;
533
534 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
535 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
536 while(y) {
537 if ( y & 1 )
538 s *= x;
539 x *= x;
540 y >>= 1;
541 }
542 return s;
543}
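// Worked example (illustrative): for y = 5 (binary 101) the loop multiplies s
// by x at bit 0, squares x twice, then multiplies s by x^4 at bit 2, yielding
// x^5 in O(log y) steps, e.g.:
#if 0
long double r = __kmp_pow< kmp_uint32 >( 0.75L, 5 ); // 0.75^5 == 0.2373046875
#endif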
544
545/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
546 (the total number of unassigned iterations in chunks with index greater than or equal to idx).
547 __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
548 (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
549*/
550template< typename T >
551static __inline typename traits_t< T >::unsigned_t
552__kmp_dispatch_guided_remaining(
553 T tc,
554 typename traits_t< T >::floating_t base,
555 typename traits_t< T >::unsigned_t idx
556) {
557 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
558 least for ICL 8.1, long double arithmetic may not really have
559 long double precision, even with /Qlong_double. Currently, we
560 workaround that in the caller code, by manipulating the FPCW for
561 Windows* OS on IA-32 architecture. The lack of precision is not
562 expected to be a correctness issue, though.
563 */
564 typedef typename traits_t< T >::unsigned_t UT;
565
566 long double x = tc * __kmp_pow< UT >(base, idx);
567 UT r = (UT) x;
568 if ( x == r )
569 return r;
570 return r + 1;
571}
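// Illustrative restatement: the value returned above is ceil( tc * base^idx ),
// i.e. the analytically predicted number of iterations still unassigned once
// chunks 0 .. idx-1 have been handed out; for the analytical guided schedule
// the caller passes base = (2*nproc - 1) / (2*nproc).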
572
573// Parameters of the guided-iterative algorithm:
574// p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
575// p3 = 1 / ( n * nproc ) // remaining iterations multiplier
576// By default n = 2. For example, with n = 3 the chunk distribution will be flatter.
577// With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
578static int guided_int_param = 2;
579static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
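// A simplified sketch (illustrative only, not the runtime's dispatch code) of
// how these two parameters are intended to be used by the guided-iterative
// schedule: while more than parm2 iterations remain, the next chunk is a fixed
// fraction (parm3) of what is left; otherwise plain dynamic chunking takes
// over. All names below are hypothetical.
#if 0
static kmp_uint64
__kmp_example_guided_chunk( kmp_uint64 remaining, kmp_uint64 chunk, kmp_uint64 nproc )
{
    if ( remaining > (kmp_uint64)guided_int_param * nproc * ( chunk + 1 ) ) {   // parm2
        kmp_uint64 next = (kmp_uint64)( guided_flt_param * remaining / nproc ); // remaining * parm3
        return next < chunk ? chunk : next;
    }
    return chunk; // few iterations left: behave like dynamic,chunk
}
#endif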
580
581// UT - unsigned flavor of T, ST - signed flavor of T,
582// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
583template< typename T >
584static void
585__kmp_dispatch_init(
586 ident_t * loc,
587 int gtid,
588 enum sched_type schedule,
589 T lb,
590 T ub,
591 typename traits_t< T >::signed_t st,
592 typename traits_t< T >::signed_t chunk,
593 int push_ws
594) {
595 typedef typename traits_t< T >::unsigned_t UT;
596 typedef typename traits_t< T >::signed_t ST;
597 typedef typename traits_t< T >::floating_t DBL;
598 static const int ___kmp_size_type = sizeof( UT );
599
600 int active;
601 T tc;
602 kmp_info_t * th;
603 kmp_team_t * team;
604 kmp_uint32 my_buffer_index;
605 dispatch_private_info_template< T > * pr;
606 dispatch_shared_info_template< UT > volatile * sh;
607
608 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
609 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
610
611 if ( ! TCR_4( __kmp_init_parallel ) )
612 __kmp_parallel_initialize();
613
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000614#if INCLUDE_SSC_MARKS
615 SSC_MARK_DISPATCH_INIT();
616#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000617 #ifdef KMP_DEBUG
618 {
619 const char * buff;
620 // create format specifiers before the debug output
621 buff = __kmp_str_format(
622 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
623 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
624 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
625 __kmp_str_free( &buff );
626 }
627 #endif
628 /* setup data */
629 th = __kmp_threads[ gtid ];
630 team = th -> th.th_team;
631 active = ! team -> t.t_serialized;
632 th->th.th_ident = loc;
633
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000634#if USE_ITT_BUILD
635 kmp_uint64 cur_chunk = chunk;
Andrey Churbanov51aecb82015-05-06 19:22:36 +0000636 int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
637 KMP_MASTER_GTID(gtid) &&
638#if OMP_40_ENABLED
639 th->th.th_teams_microtask == NULL &&
640#endif
641 team->t.t_active_level == 1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000642#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000643 if ( ! active ) {
644 pr = reinterpret_cast< dispatch_private_info_template< T >* >
645 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
646 } else {
647 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
648 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
649
650 my_buffer_index = th->th.th_dispatch->th_disp_index ++;
651
 652 /* What happens when the number of threads changes? Do we need to resize the buffer? */
653 pr = reinterpret_cast< dispatch_private_info_template< T > * >
654 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
655 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
656 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
657 }
658
659 /* Pick up the nomerge/ordered bits from the scheduling type */
660 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
661 pr->nomerge = TRUE;
662 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
663 } else {
664 pr->nomerge = FALSE;
665 }
666 pr->type_size = ___kmp_size_type; // remember the size of variables
667 if ( kmp_ord_lower & schedule ) {
668 pr->ordered = TRUE;
669 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
670 } else {
671 pr->ordered = FALSE;
672 }
Jonathan Peyton45be4502015-08-11 21:36:41 +0000673
Jim Cownie5e8470a2013-09-27 10:38:44 +0000674 if ( schedule == kmp_sch_static ) {
675 schedule = __kmp_static;
676 } else {
677 if ( schedule == kmp_sch_runtime ) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000678 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
679 schedule = team -> t.t_sched.r_sched_type;
680 // Detail the schedule if needed (global controls are differentiated appropriately)
681 if ( schedule == kmp_sch_guided_chunked ) {
682 schedule = __kmp_guided;
683 } else if ( schedule == kmp_sch_static ) {
684 schedule = __kmp_static;
685 }
686 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
687 chunk = team -> t.t_sched.chunk;
Jim Cownie5e8470a2013-09-27 10:38:44 +0000688
689 #ifdef KMP_DEBUG
690 {
691 const char * buff;
692 // create format specifiers before the debug output
693 buff = __kmp_str_format(
694 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
695 traits_t< ST >::spec );
696 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
697 __kmp_str_free( &buff );
698 }
699 #endif
700 } else {
701 if ( schedule == kmp_sch_guided_chunked ) {
702 schedule = __kmp_guided;
703 }
704 if ( chunk <= 0 ) {
705 chunk = KMP_DEFAULT_CHUNK;
706 }
707 }
708
Jim Cownie5e8470a2013-09-27 10:38:44 +0000709 if ( schedule == kmp_sch_auto ) {
710 // mapping and differentiation: in the __kmp_do_serial_initialize()
711 schedule = __kmp_auto;
712 #ifdef KMP_DEBUG
713 {
714 const char * buff;
715 // create format specifiers before the debug output
716 buff = __kmp_str_format(
717 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
718 traits_t< ST >::spec );
719 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
720 __kmp_str_free( &buff );
721 }
722 #endif
723 }
Jim Cownie5e8470a2013-09-27 10:38:44 +0000724
725 /* guided analytical not safe for too many threads */
726 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
727 schedule = kmp_sch_guided_iterative_chunked;
728 KMP_WARNING( DispatchManyThreads );
729 }
730 pr->u.p.parm1 = chunk;
731 }
732 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
733 "unknown scheduling type" );
734
735 pr->u.p.count = 0;
736
737 if ( __kmp_env_consistency_check ) {
738 if ( st == 0 ) {
739 __kmp_error_construct(
740 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
741 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
742 );
743 }
744 }
745
746 tc = ( ub - lb + st );
747 if ( st != 1 ) {
748 if ( st < 0 ) {
749 if ( lb < ub ) {
750 tc = 0; // zero-trip
751 } else { // lb >= ub
752 tc = (ST)tc / st; // convert to signed division
753 }
754 } else { // st > 0
755 if ( ub < lb ) {
756 tc = 0; // zero-trip
757 } else { // lb >= ub
758 tc /= st;
759 }
760 }
761 } else if ( ub < lb ) { // st == 1
762 tc = 0; // zero-trip
763 }
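    // Worked examples (illustrative) of the trip count computed above:
    // lb=0, ub=9, st=1 -> tc = 10; lb=0, ub=9, st=3 -> tc = 12/3 = 4
    // (iterations 0,3,6,9); lb=9, ub=0, st=-2 -> tc = -11/-2 = 5
    // (iterations 9,7,5,3,1).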
764
Jonathan Peyton45be4502015-08-11 21:36:41 +0000765 // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing
766 // when statistics are disabled.
767 if (schedule == __kmp_static)
768 {
769 KMP_COUNT_BLOCK(OMP_FOR_static);
770 KMP_COUNT_VALUE(FOR_static_iterations, tc);
771 }
772 else
773 {
774 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
775 KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
776 }
777
Jim Cownie5e8470a2013-09-27 10:38:44 +0000778 pr->u.p.lb = lb;
779 pr->u.p.ub = ub;
780 pr->u.p.st = st;
781 pr->u.p.tc = tc;
782
783 #if KMP_OS_WINDOWS
784 pr->u.p.last_upper = ub + st;
785 #endif /* KMP_OS_WINDOWS */
786
 787 /* NOTE: only the active parallel region(s) have active ordered sections */
788
789 if ( active ) {
790 if ( pr->ordered == 0 ) {
791 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
792 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
793 } else {
794 pr->ordered_bumped = 0;
795
796 pr->u.p.ordered_lower = 1;
797 pr->u.p.ordered_upper = 0;
798
799 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
800 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
801 }
802 }
803
804 if ( __kmp_env_consistency_check ) {
805 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
806 if ( push_ws ) {
807 __kmp_push_workshare( gtid, ws, loc );
808 pr->pushed_ws = ws;
809 } else {
810 __kmp_check_workshare( gtid, ws, loc );
811 pr->pushed_ws = ct_none;
812 }
813 }
814
815 switch ( schedule ) {
816 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
817 case kmp_sch_static_steal:
818 {
819 T nproc = team->t.t_nproc;
820 T ntc, init;
821
822 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
823
824 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
825 if ( nproc > 1 && ntc >= nproc ) {
826 T id = __kmp_tid_from_gtid(gtid);
827 T small_chunk, extras;
828
829 small_chunk = ntc / nproc;
830 extras = ntc % nproc;
831
832 init = id * small_chunk + ( id < extras ? id : extras );
833 pr->u.p.count = init;
834 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
835
836 pr->u.p.parm2 = lb;
837 //pr->pfields.parm3 = 0; // it's not used in static_steal
838 pr->u.p.parm4 = id;
839 pr->u.p.st = st;
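                // Worked example (illustrative): tc=100, chunk=10, nproc=4 gives
                // ntc=10 chunks, small_chunk=2, extras=2, so thread id initially owns
                // chunk indices [count, ub): T0:[0,3), T1:[3,6), T2:[6,8), T3:[8,10);
                // other threads may later steal from the tail of a victim's range.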
840 break;
841 } else {
842 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
843 gtid ) );
844 schedule = kmp_sch_static_balanced;
845 /* too few iterations: fall-through to kmp_sch_static_balanced */
846 } // if
847 /* FALL-THROUGH to static balanced */
848 } // case
849 #endif
850 case kmp_sch_static_balanced:
851 {
852 T nproc = team->t.t_nproc;
853 T init, limit;
854
855 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
856 gtid ) );
857
858 if ( nproc > 1 ) {
859 T id = __kmp_tid_from_gtid(gtid);
860
861 if ( tc < nproc ) {
862 if ( id < tc ) {
863 init = id;
864 limit = id;
865 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
866 } else {
867 pr->u.p.count = 1; /* means no more chunks to execute */
868 pr->u.p.parm1 = FALSE;
869 break;
870 }
871 } else {
872 T small_chunk = tc / nproc;
873 T extras = tc % nproc;
874 init = id * small_chunk + (id < extras ? id : extras);
875 limit = init + small_chunk - (id < extras ? 0 : 1);
876 pr->u.p.parm1 = (id == nproc - 1);
877 }
878 } else {
879 if ( tc > 0 ) {
880 init = 0;
881 limit = tc - 1;
882 pr->u.p.parm1 = TRUE;
883 } else {
884 // zero trip count
885 pr->u.p.count = 1; /* means no more chunks to execute */
886 pr->u.p.parm1 = FALSE;
887 break;
888 }
889 }
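            // Worked example for the tc >= nproc branch above (illustrative):
            // tc=10, nproc=4 gives small_chunk=2, extras=2, so the per-thread
            // [init,limit] ranges are T0:[0,2], T1:[3,5], T2:[6,7], T3:[8,9];
            // the first 'extras' threads get one extra iteration each.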
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000890#if USE_ITT_BUILD
891 // Calculate chunk for metadata report
Andrey Churbanov51aecb82015-05-06 19:22:36 +0000892 if ( itt_need_metadata_reporting )
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000893 cur_chunk = limit - init + 1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000894#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000895 if ( st == 1 ) {
896 pr->u.p.lb = lb + init;
897 pr->u.p.ub = lb + limit;
898 } else {
899 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
900 pr->u.p.lb = lb + init * st;
901 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
902 if ( st > 0 ) {
903 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
904 } else {
905 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
906 }
907 }
908 if ( pr->ordered ) {
909 pr->u.p.ordered_lower = init;
910 pr->u.p.ordered_upper = limit;
911 }
912 break;
913 } // case
914 case kmp_sch_guided_iterative_chunked :
915 {
916 T nproc = team->t.t_nproc;
917 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
918
919 if ( nproc > 1 ) {
920 if ( (2L * chunk + 1 ) * nproc >= tc ) {
921 /* chunk size too large, switch to dynamic */
922 schedule = kmp_sch_dynamic_chunked;
923 } else {
924 // when remaining iters become less than parm2 - switch to dynamic
925 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
926 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
927 }
928 } else {
929 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
930 schedule = kmp_sch_static_greedy;
931 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
932 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
933 pr->u.p.parm1 = tc;
934 } // if
935 } // case
936 break;
937 case kmp_sch_guided_analytical_chunked:
938 {
939 T nproc = team->t.t_nproc;
940 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
941
942 if ( nproc > 1 ) {
943 if ( (2L * chunk + 1 ) * nproc >= tc ) {
944 /* chunk size too large, switch to dynamic */
945 schedule = kmp_sch_dynamic_chunked;
946 } else {
947 /* commonly used term: (2 nproc - 1)/(2 nproc) */
948 DBL x;
949
950 #if KMP_OS_WINDOWS && KMP_ARCH_X86
951 /* Linux* OS already has 64-bit computation by default for
952 long double, and on Windows* OS on Intel(R) 64,
953 /Qlong_double doesn't work. On Windows* OS
954 on IA-32 architecture, we need to set precision to
955 64-bit instead of the default 53-bit. Even though long
956 double doesn't work on Windows* OS on Intel(R) 64, the
957 resulting lack of precision is not expected to impact
958 the correctness of the algorithm, but this has not been
959 mathematically proven.
960 */
961 // save original FPCW and set precision to 64-bit, as
962 // Windows* OS on IA-32 architecture defaults to 53-bit
Jim Cownie181b4bb2013-12-23 17:28:57 +0000963 unsigned int oldFpcw = _control87(0,0);
964 _control87(_PC_64,_MCW_PC); // 0,0x30000
Jim Cownie5e8470a2013-09-27 10:38:44 +0000965 #endif
966 /* value used for comparison in solver for cross-over point */
967 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
968
969 /* crossover point--chunk indexes equal to or greater than
970 this point switch to dynamic-style scheduling */
971 UT cross;
972
973 /* commonly used term: (2 nproc - 1)/(2 nproc) */
974 x = (long double)1.0 - (long double)0.5 / nproc;
975
976 #ifdef KMP_DEBUG
977 { // test natural alignment
978 struct _test_a {
979 char a;
980 union {
981 char b;
982 DBL d;
983 };
984 } t;
985 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
986 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
987 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
988 }
989 #endif // KMP_DEBUG
990
991 /* save the term in thread private dispatch structure */
992 *(DBL*)&pr->u.p.parm3 = x;
993
994 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
995 {
996 UT left, right, mid;
997 long double p;
998
999 /* estimate initial upper and lower bound */
1000
1001 /* doesn't matter what value right is as long as it is positive, but
1002 it affects performance of the solver
1003 */
1004 right = 229;
1005 p = __kmp_pow< UT >(x,right);
1006 if ( p > target ) {
1007 do{
1008 p *= p;
1009 right <<= 1;
1010 } while(p>target && right < (1<<27));
1011 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
1012 } else {
1013 left = 0;
1014 }
1015
1016 /* bisection root-finding method */
1017 while ( left + 1 < right ) {
1018 mid = (left + right) / 2;
1019 if ( __kmp_pow< UT >(x,mid) > target ) {
1020 left = mid;
1021 } else {
1022 right = mid;
1023 }
1024 } // while
1025 cross = right;
1026 }
1027 /* assert sanity of computed crossover point */
1028 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
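                    /* Illustrative restatement: the bisection above finds the least
                       integer cross satisfying
                           ((2*nproc - 1)/(2*nproc))^cross <= ((2*chunk + 1)*nproc)/tc,
                       i.e. the first chunk index at which the analytically shrinking
                       guided chunk falls to the user chunk size; from that index on
                       the schedule behaves like dynamic with that chunk. */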
1029
1030 /* save the crossover point in thread private dispatch structure */
1031 pr->u.p.parm2 = cross;
1032
1033 // C75803
1034 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1035 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1036 #else
1037 #define GUIDED_ANALYTICAL_WORKAROUND (x)
1038 #endif
1039 /* dynamic-style scheduling offset */
1040 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1041 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1042 // restore FPCW
Jim Cownie181b4bb2013-12-23 17:28:57 +00001043 _control87(oldFpcw,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001044 #endif
1045 } // if
1046 } else {
1047 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1048 gtid ) );
1049 schedule = kmp_sch_static_greedy;
1050 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1051 pr->u.p.parm1 = tc;
1052 } // if
1053 } // case
1054 break;
1055 case kmp_sch_static_greedy:
1056 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1057 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1058 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1059 tc;
1060 break;
1061 case kmp_sch_static_chunked :
1062 case kmp_sch_dynamic_chunked :
1063 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1064 break;
1065 case kmp_sch_trapezoidal :
1066 {
1067 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1068
1069 T parm1, parm2, parm3, parm4;
1070 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1071
1072 parm1 = chunk;
1073
1074 /* F : size of the first cycle */
1075 parm2 = ( tc / (2 * team->t.t_nproc) );
1076
1077 if ( parm2 < 1 ) {
1078 parm2 = 1;
1079 }
1080
1081 /* L : size of the last cycle. Make sure the last cycle
1082 * is not larger than the first cycle.
1083 */
1084 if ( parm1 < 1 ) {
1085 parm1 = 1;
1086 } else if ( parm1 > parm2 ) {
1087 parm1 = parm2;
1088 }
1089
1090 /* N : number of cycles */
1091 parm3 = ( parm2 + parm1 );
1092 parm3 = ( 2 * tc + parm3 - 1) / parm3;
1093
1094 if ( parm3 < 2 ) {
1095 parm3 = 2;
1096 }
1097
1098 /* sigma : decreasing incr of the trapezoid */
1099 parm4 = ( parm3 - 1 );
1100 parm4 = ( parm2 - parm1 ) / parm4;
1101
1102 // pointless check, because parm4 >= 0 always
1103 //if ( parm4 < 0 ) {
1104 // parm4 = 0;
1105 //}
1106
1107 pr->u.p.parm1 = parm1;
1108 pr->u.p.parm2 = parm2;
1109 pr->u.p.parm3 = parm3;
1110 pr->u.p.parm4 = parm4;
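            // Worked example (illustrative): tc=1000, nproc=4, chunk=10 yields
            // parm2 = F = 1000/8 = 125 (first chunk), parm1 = L = 10 (last chunk),
            // parm3 = N = (2000 + 134)/135 = 15 cycles and parm4 = (125 - 10)/14 = 8,
            // so successive chunk sizes are 125, 117, 109, ... shrinking toward L.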
1111 } // case
1112 break;
1113
1114 default:
1115 {
1116 __kmp_msg(
1117 kmp_ms_fatal, // Severity
1118 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1119 KMP_HNT( GetNewerLibrary ), // Hint
1120 __kmp_msg_null // Variadic argument list terminator
1121 );
1122 }
1123 break;
1124 } // switch
1125 pr->schedule = schedule;
1126 if ( active ) {
 1127 /* sh->buffer_index will equal my_buffer_index once this buffer is free to use; wait for that below */
1128
1129 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1130 gtid, my_buffer_index, sh->buffer_index) );
1131 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1132 USE_ITT_BUILD_ARG( NULL )
1133 );
1134 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1135 // *always* 32-bit integers.
1136 KMP_MB(); /* is this necessary? */
1137 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1138 gtid, my_buffer_index, sh->buffer_index) );
1139
1140 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1141 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1142#if USE_ITT_BUILD
1143 if ( pr->ordered ) {
1144 __kmp_itt_ordered_init( gtid );
1145 }; // if
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001146 // Report loop metadata
1147 if ( itt_need_metadata_reporting ) {
1148 // Only report metadata by master of active team at level 1
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001149 kmp_uint64 schedtype = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001150 switch ( schedule ) {
1151 case kmp_sch_static_chunked:
1152 case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1153 break;
1154 case kmp_sch_static_greedy:
1155 cur_chunk = pr->u.p.parm1;
1156 break;
1157 case kmp_sch_dynamic_chunked:
1158 schedtype = 1;
1159 break;
1160 case kmp_sch_guided_iterative_chunked:
1161 case kmp_sch_guided_analytical_chunked:
1162 schedtype = 2;
1163 break;
1164 default:
1165// Should we put this case under "static"?
1166// case kmp_sch_static_steal:
1167 schedtype = 3;
1168 break;
1169 }
1170 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1171 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001172#endif /* USE_ITT_BUILD */
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001173 }; // if
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001174
Jim Cownie5e8470a2013-09-27 10:38:44 +00001175 #ifdef KMP_DEBUG
1176 {
1177 const char * buff;
1178 // create format specifiers before the debug output
1179 buff = __kmp_str_format(
1180 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1181 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1182 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1183 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1184 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1185 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1186 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1187 KD_TRACE(10, ( buff,
1188 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1189 pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1190 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1191 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1192 __kmp_str_free( &buff );
1193 }
1194 #endif
1195 #if ( KMP_STATIC_STEAL_ENABLED )
1196 if ( ___kmp_size_type < 8 ) {
1197 // It cannot be guaranteed that after execution of a loop with some other schedule kind
1198 // all the parm3 variables will contain the same value.
 1199 // Even if all parm3 values happened to be the same, a bad case could still exist, e.g. using 0 and 1
 1200 // rather than a program life-time increment.
 1201 // So a dedicated variable is required; 'static_steal_counter' is used.
1202 if( schedule == kmp_sch_static_steal ) {
1203 // Other threads will inspect this variable when searching for a victim.
 1204 // This is a flag showing that other threads may steal from this thread from now on.
1205 volatile T * p = &pr->u.p.static_steal_counter;
1206 *p = *p + 1;
1207 }
1208 }
1209 #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING )
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001210
1211#if OMPT_SUPPORT && OMPT_TRACE
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001212 if (ompt_enabled &&
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001213 ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1214 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1215 ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1216 ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1217 team_info->parallel_id, task_info->task_id, team_info->microtask);
1218 }
1219#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +00001220}
1221
1222/*
1223 * For ordered loops, either __kmp_dispatch_finish() should be called after
1224 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1225 * every chunk of iterations. If the ordered section(s) were not executed
1226 * for this iteration (or every iteration in this chunk), we need to set the
1227 * ordered iteration counters so that the next thread can proceed.
1228 */
1229template< typename UT >
1230static void
1231__kmp_dispatch_finish( int gtid, ident_t *loc )
1232{
1233 typedef typename traits_t< UT >::signed_t ST;
1234 kmp_info_t *th = __kmp_threads[ gtid ];
1235
1236 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1237 if ( ! th -> th.th_team -> t.t_serialized ) {
1238
1239 dispatch_private_info_template< UT > * pr =
1240 reinterpret_cast< dispatch_private_info_template< UT >* >
1241 ( th->th.th_dispatch->th_dispatch_pr_current );
1242 dispatch_shared_info_template< UT > volatile * sh =
1243 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1244 ( th->th.th_dispatch->th_dispatch_sh_current );
1245 KMP_DEBUG_ASSERT( pr );
1246 KMP_DEBUG_ASSERT( sh );
1247 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1248 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1249
1250 if ( pr->ordered_bumped ) {
1251 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1252 gtid ) );
1253 pr->ordered_bumped = 0;
1254 } else {
1255 UT lower = pr->u.p.ordered_lower;
1256
1257 #ifdef KMP_DEBUG
1258 {
1259 const char * buff;
1260 // create format specifiers before the debug output
1261 buff = __kmp_str_format(
1262 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1263 traits_t< UT >::spec, traits_t< UT >::spec );
1264 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1265 __kmp_str_free( &buff );
1266 }
1267 #endif
1268
1269 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1270 USE_ITT_BUILD_ARG(NULL)
1271 );
1272 KMP_MB(); /* is this necessary? */
1273 #ifdef KMP_DEBUG
1274 {
1275 const char * buff;
1276 // create format specifiers before the debug output
1277 buff = __kmp_str_format(
1278 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1279 traits_t< UT >::spec, traits_t< UT >::spec );
1280 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1281 __kmp_str_free( &buff );
1282 }
1283 #endif
1284
1285 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1286 } // if
1287 } // if
1288 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1289}
1290
1291#ifdef KMP_GOMP_COMPAT
1292
1293template< typename UT >
1294static void
1295__kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1296{
1297 typedef typename traits_t< UT >::signed_t ST;
1298 kmp_info_t *th = __kmp_threads[ gtid ];
1299
1300 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1301 if ( ! th -> th.th_team -> t.t_serialized ) {
1302// int cid;
1303 dispatch_private_info_template< UT > * pr =
1304 reinterpret_cast< dispatch_private_info_template< UT >* >
1305 ( th->th.th_dispatch->th_dispatch_pr_current );
1306 dispatch_shared_info_template< UT > volatile * sh =
1307 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1308 ( th->th.th_dispatch->th_dispatch_sh_current );
1309 KMP_DEBUG_ASSERT( pr );
1310 KMP_DEBUG_ASSERT( sh );
1311 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1312 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1313
1314// for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1315 UT lower = pr->u.p.ordered_lower;
1316 UT upper = pr->u.p.ordered_upper;
1317 UT inc = upper - lower + 1;
1318
1319 if ( pr->ordered_bumped == inc ) {
1320 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1321 gtid ) );
1322 pr->ordered_bumped = 0;
1323 } else {
1324 inc -= pr->ordered_bumped;
1325
1326 #ifdef KMP_DEBUG
1327 {
1328 const char * buff;
1329 // create format specifiers before the debug output
1330 buff = __kmp_str_format(
1331 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1332 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1333 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1334 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1335 __kmp_str_free( &buff );
1336 }
1337 #endif
1338
1339 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1340 USE_ITT_BUILD_ARG(NULL)
1341 );
1342
1343 KMP_MB(); /* is this necessary? */
1344 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1345 gtid ) );
1346 pr->ordered_bumped = 0;
1347//!!!!! TODO check if the inc should be unsigned, or signed???
1348 #ifdef KMP_DEBUG
1349 {
1350 const char * buff;
1351 // create format specifiers before the debug output
1352 buff = __kmp_str_format(
1353 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1354 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1355 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1356 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1357 __kmp_str_free( &buff );
1358 }
1359 #endif
1360
1361 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1362 }
1363// }
1364 }
1365 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1366}
1367
1368#endif /* KMP_GOMP_COMPAT */
1369
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001370/* Define a macro for exiting __kmp_dispatch_next(). If status is 0
1371 * (no more work), then tell OMPT the loop is over. In some cases
1372 * kmp_dispatch_fini() is not called. */
1373#if OMPT_SUPPORT && OMPT_TRACE
1374#define OMPT_LOOP_END \
1375 if (status == 0) { \
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001376 if (ompt_enabled && \
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001377 ompt_callbacks.ompt_callback(ompt_event_loop_end)) { \
1378 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1379 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); \
1380 ompt_callbacks.ompt_callback(ompt_event_loop_end)( \
1381 team_info->parallel_id, task_info->task_id); \
1382 } \
1383 }
1384#else
1385#define OMPT_LOOP_END // no-op
1386#endif
1387
Jim Cownie5e8470a2013-09-27 10:38:44 +00001388template< typename T >
1389static int
1390__kmp_dispatch_next(
1391 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1392) {
1393
1394 typedef typename traits_t< T >::unsigned_t UT;
1395 typedef typename traits_t< T >::signed_t ST;
1396 typedef typename traits_t< T >::floating_t DBL;
Jonathan Peyton2321d572015-06-08 19:25:25 +00001397#if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001398 static const int ___kmp_size_type = sizeof( UT );
Jonathan Peyton2321d572015-06-08 19:25:25 +00001399#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +00001400
Jonathan Peyton45be4502015-08-11 21:36:41 +00001401 // This is potentially slightly misleading, schedule(runtime) will appear here even if the actual runtime schedule
 1402 // is static. (Which points out a disadvantage of schedule(runtime): even when static scheduling is used it costs
 1403 // more than a compile-time choice to use static scheduling would.)
1404 KMP_TIME_BLOCK(FOR_dynamic_scheduling);
1405
Jim Cownie5e8470a2013-09-27 10:38:44 +00001406 int status;
1407 dispatch_private_info_template< T > * pr;
1408 kmp_info_t * th = __kmp_threads[ gtid ];
1409 kmp_team_t * team = th -> th.th_team;
1410
Andrey Churbanov9ad5c3a2015-07-13 17:52:41 +00001411 KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
Jim Cownie5e8470a2013-09-27 10:38:44 +00001412 #ifdef KMP_DEBUG
1413 {
1414 const char * buff;
1415 // create format specifiers before the debug output
1416 buff = __kmp_str_format(
1417 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1418 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1419 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1420 __kmp_str_free( &buff );
1421 }
1422 #endif
1423
1424 if ( team -> t.t_serialized ) {
 1425 /* NOTE: serialize this dispatch because we are not at the active level */
1426 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1427 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1428 KMP_DEBUG_ASSERT( pr );
1429
1430 if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1431 *p_lb = 0;
1432 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001433// if ( p_last != NULL )
1434// *p_last = 0;
1435 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001436 *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001437 if ( __kmp_env_consistency_check ) {
1438 if ( pr->pushed_ws != ct_none ) {
1439 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1440 }
1441 }
1442 } else if ( pr->nomerge ) {
1443 kmp_int32 last;
1444 T start;
1445 UT limit, trip, init;
1446 ST incr;
1447 T chunk = pr->u.p.parm1;
1448
1449 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1450
1451 init = chunk * pr->u.p.count++;
1452 trip = pr->u.p.tc - 1;
1453
1454 if ( (status = (init <= trip)) == 0 ) {
1455 *p_lb = 0;
1456 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001457// if ( p_last != NULL )
1458// *p_last = 0;
1459 if ( p_st != NULL )
1460 *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001461 if ( __kmp_env_consistency_check ) {
1462 if ( pr->pushed_ws != ct_none ) {
1463 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1464 }
1465 }
1466 } else {
1467 start = pr->u.p.lb;
1468 limit = chunk + init - 1;
1469 incr = pr->u.p.st;
1470
1471 if ( (last = (limit >= trip)) != 0 ) {
1472 limit = trip;
1473 #if KMP_OS_WINDOWS
1474 pr->u.p.last_upper = pr->u.p.ub;
1475 #endif /* KMP_OS_WINDOWS */
1476 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001477 if ( p_last != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001478 *p_last = last;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001479 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001480 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001481 if ( incr == 1 ) {
1482 *p_lb = start + init;
1483 *p_ub = start + limit;
1484 } else {
1485 *p_lb = start + init * incr;
1486 *p_ub = start + limit * incr;
1487 }
1488
1489 if ( pr->ordered ) {
1490 pr->u.p.ordered_lower = init;
1491 pr->u.p.ordered_upper = limit;
1492 #ifdef KMP_DEBUG
1493 {
1494 const char * buff;
1495 // create format specifiers before the debug output
1496 buff = __kmp_str_format(
1497 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1498 traits_t< UT >::spec, traits_t< UT >::spec );
1499 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1500 __kmp_str_free( &buff );
1501 }
1502 #endif
1503 } // if
1504 } // if
1505 } else {
1506 pr->u.p.tc = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001507 *p_lb = pr->u.p.lb;
1508 *p_ub = pr->u.p.ub;
1509 #if KMP_OS_WINDOWS
1510 pr->u.p.last_upper = *p_ub;
1511 #endif /* KMP_OS_WINDOWS */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001512 if ( p_last != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001513 *p_last = TRUE;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001514 if ( p_st != NULL )
1515 *p_st = pr->u.p.st;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001516 } // if
1517 #ifdef KMP_DEBUG
1518 {
1519 const char * buff;
1520 // create format specifiers before the debug output
1521 buff = __kmp_str_format(
1522 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001523 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
Jim Cownie5e8470a2013-09-27 10:38:44 +00001524 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001525 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
Jim Cownie5e8470a2013-09-27 10:38:44 +00001526 __kmp_str_free( &buff );
1527 }
1528 #endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001529#if INCLUDE_SSC_MARKS
1530 SSC_MARK_DISPATCH_NEXT();
1531#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001532 OMPT_LOOP_END;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001533 return status;
1534 } else {
1535 kmp_int32 last = 0;
1536 dispatch_shared_info_template< UT > *sh;
1537 T start;
1538 ST incr;
1539 UT limit, trip, init;
1540
1541 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1542 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1543
1544 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1545 ( th->th.th_dispatch->th_dispatch_pr_current );
1546 KMP_DEBUG_ASSERT( pr );
1547 sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1548 ( th->th.th_dispatch->th_dispatch_sh_current );
1549 KMP_DEBUG_ASSERT( sh );
1550
1551 if ( pr->u.p.tc == 0 ) {
1552 // zero trip count
1553 status = 0;
1554 } else {
1555 switch (pr->schedule) {
1556 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1557 case kmp_sch_static_steal:
1558 {
1559 T chunk = pr->u.p.parm1;
1560
1561 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1562
1563 trip = pr->u.p.tc - 1;
1564
1565 if ( ___kmp_size_type > 4 ) {
1566 // Other threads do not look into the data of this thread,
 1567 // so a volatile cast is not necessary here.
1568 init = ( pr->u.p.count )++;
1569 status = ( init < (UT)pr->u.p.ub );
1570 } else {
1571 typedef union {
1572 struct {
1573 UT count;
1574 T ub;
1575 } p;
1576 kmp_int64 b;
1577 } union_i4;
1578 // All operations on 'count' or 'ub' must be combined atomically together.
1579 // stealing implemented only for 4-byte indexes
1580 {
1581 union_i4 vold, vnew;
1582 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1583 vnew = vold;
1584 vnew.p.count++;
1585 while( ! KMP_COMPARE_AND_STORE_ACQ64(
1586 ( volatile kmp_int64* )&pr->u.p.count,
1587 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1588 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1589 KMP_CPU_PAUSE();
1590 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1591 vnew = vold;
1592 vnew.p.count++;
1593 }
1594 vnew = vold;
1595 init = vnew.p.count;
1596 status = ( init < (UT)vnew.p.ub ) ;
1597 }
1598
1599 if( !status ) {
1600 kmp_info_t **other_threads = team->t.t_threads;
1601 int while_limit = 10;
1602 int while_index = 0;
1603
1604 // TODO: algorithm of searching for a victim
1605 // should be cleaned up and measured
1606 while ( ( !status ) && ( while_limit != ++while_index ) ) {
1607 union_i4 vold, vnew;
1608 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1609 T victimIdx = pr->u.p.parm4;
1610 T oldVictimIdx = victimIdx;
1611 dispatch_private_info_template< T > * victim;
1612
1613 do {
1614 if( !victimIdx ) {
1615 victimIdx = team->t.t_nproc - 1;
1616 } else {
1617 --victimIdx;
1618 }
1619 victim = reinterpret_cast< dispatch_private_info_template< T >* >
1620 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1621 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1622 // TODO: think about a proper place of this test
1623 if ( ( !victim ) ||
1624 ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1625 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1626 // TODO: delay would be nice
1627 continue;
1628 // the victim is not ready yet to participate in stealing
1629 // because the victim is still in kmp_init_dispatch
1630 }
1631 if ( oldVictimIdx == victimIdx ) {
1632 break;
1633 }
1634 pr->u.p.parm4 = victimIdx;
1635
1636 while( 1 ) {
1637 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1638 vnew = vold;
1639
1640 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1641 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1642 break;
1643 }
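                                        // Steal roughly a quarter of the victim's remaining chunks
                                        // by lowering the victim's 'ub'. Illustrative values: with
                                        // count=10, ub=30 the victim has 20 chunks left, remaining>>2
                                        // is 5, so ub becomes 25; if the CAS below succeeds, this
                                        // thread takes chunks 25..29 and the victim keeps 10..24.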
1644 vnew.p.ub -= (remaining >> 2);
1645 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1646 #pragma warning( push )
1647 // disable warning on pointless comparison of unsigned with 0
1648 #pragma warning( disable: 186 )
1649 KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1650 #pragma warning( pop )
1651 // TODO: Should this be acquire or release?
1652 if ( KMP_COMPARE_AND_STORE_ACQ64(
1653 ( volatile kmp_int64 * )&victim->u.p.count,
1654 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1655 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1656 status = 1;
1657 while_index = 0;
1658 // now update own count and ub
1659 #if KMP_ARCH_X86
1660 // stealing executed on non-KMP_ARCH_X86 only
1661 // Atomic 64-bit write on ia32 is
1662 // unavailable, so we do this in steps.
1663 // This code is not tested.
1664 init = vold.p.count;
1665 pr->u.p.ub = 0;
1666 pr->u.p.count = init + 1;
1667 pr->u.p.ub = vnew.p.count;
1668 #else
1669 init = vnew.p.ub;
1670 vold.p.count = init + 1;
1671 // TODO: is it safe and enough?
1672 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1673 #endif // KMP_ARCH_X86
1674 break;
1675 } // if
1676 KMP_CPU_PAUSE();
1677 } // while (1)
1678 } // while
1679 } // if
1680 } // if
1681 if ( !status ) {
1682 *p_lb = 0;
1683 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001684 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001685 } else {
1686 start = pr->u.p.parm2;
1687 init *= chunk;
1688 limit = chunk + init - 1;
1689 incr = pr->u.p.st;
1690
1691 KMP_DEBUG_ASSERT(init <= trip);
1692 if ( (last = (limit >= trip)) != 0 )
1693 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001694 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001695
1696 if ( incr == 1 ) {
1697 *p_lb = start + init;
1698 *p_ub = start + limit;
1699 } else {
1700 *p_lb = start + init * incr;
1701 *p_ub = start + limit * incr;
1702 }
1703
1704 if ( pr->ordered ) {
1705 pr->u.p.ordered_lower = init;
1706 pr->u.p.ordered_upper = limit;
1707 #ifdef KMP_DEBUG
1708 {
1709 const char * buff;
1710 // create format specifiers before the debug output
1711 buff = __kmp_str_format(
1712 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1713 traits_t< UT >::spec, traits_t< UT >::spec );
1714 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1715 __kmp_str_free( &buff );
1716 }
1717 #endif
1718 } // if
1719 } // if
1720 break;
1721 } // case
1722 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1723 case kmp_sch_static_balanced:
1724 {
1725 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1726 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1727 pr->u.p.count = 1;
1728 *p_lb = pr->u.p.lb;
1729 *p_ub = pr->u.p.ub;
1730 last = pr->u.p.parm1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001731 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001732 *p_st = pr->u.p.st;
1733 } else { /* no iterations to do */
1734 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1735 }
1736 if ( pr->ordered ) {
1737 #ifdef KMP_DEBUG
1738 {
1739 const char * buff;
1740 // create format specifiers before the debug output
1741 buff = __kmp_str_format(
1742 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1743 traits_t< UT >::spec, traits_t< UT >::spec );
1744 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1745 __kmp_str_free( &buff );
1746 }
1747 #endif
1748 } // if
1749 } // case
1750 break;
1751 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1752 case kmp_sch_static_chunked:
1753 {
1754 T parm1;
1755
1756 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1757 gtid ) );
1758 parm1 = pr->u.p.parm1;
1759
1760 trip = pr->u.p.tc - 1;
1761 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1762
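                    // Round-robin static chunking: thread 'tid' takes chunk numbers
                    // tid, tid+nproc, tid+2*nproc, ... (pr->u.p.count is advanced by nproc
                    // below). Illustrative values for the chunked flavor: parm1=4, nproc=2,
                    // tid=1, count=0 give init=4 and limit=7, i.e. iterations
                    // lb+4*st .. lb+7*st.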
1763 if ( (status = (init <= trip)) != 0 ) {
1764 start = pr->u.p.lb;
1765 incr = pr->u.p.st;
1766 limit = parm1 + init - 1;
1767
1768 if ( (last = (limit >= trip)) != 0 )
1769 limit = trip;
1770
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001771 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001772
1773 pr->u.p.count += team->t.t_nproc;
1774
1775 if ( incr == 1 ) {
1776 *p_lb = start + init;
1777 *p_ub = start + limit;
1778 }
1779 else {
1780 *p_lb = start + init * incr;
1781 *p_ub = start + limit * incr;
1782 }
1783
1784 if ( pr->ordered ) {
1785 pr->u.p.ordered_lower = init;
1786 pr->u.p.ordered_upper = limit;
1787 #ifdef KMP_DEBUG
1788 {
1789 const char * buff;
1790 // create format specifiers before the debug output
1791 buff = __kmp_str_format(
1792 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1793 traits_t< UT >::spec, traits_t< UT >::spec );
1794 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1795 __kmp_str_free( &buff );
1796 }
1797 #endif
1798 } // if
1799 } // if
1800 } // case
1801 break;
1802
1803 case kmp_sch_dynamic_chunked:
1804 {
1805 T chunk = pr->u.p.parm1;
1806
1807 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1808 gtid ) );
1809
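                    // Each call atomically grabs the next chunk index from the shared
                    // iteration counter. Illustrative values: chunk=4 and a fetched counter
                    // value of 3 give init=12 and limit=15, i.e. iterations
                    // lb+12*st .. lb+15*st (clipped to 'trip' for the last chunk).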
1810 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1811 trip = pr->u.p.tc - 1;
1812
1813 if ( (status = (init <= trip)) == 0 ) {
1814 *p_lb = 0;
1815 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001816 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001817 } else {
1818 start = pr->u.p.lb;
1819 limit = chunk + init - 1;
1820 incr = pr->u.p.st;
1821
1822 if ( (last = (limit >= trip)) != 0 )
1823 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001824
1825 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001826
1827 if ( incr == 1 ) {
1828 *p_lb = start + init;
1829 *p_ub = start + limit;
1830 } else {
1831 *p_lb = start + init * incr;
1832 *p_ub = start + limit * incr;
1833 }
1834
1835 if ( pr->ordered ) {
1836 pr->u.p.ordered_lower = init;
1837 pr->u.p.ordered_upper = limit;
1838 #ifdef KMP_DEBUG
1839 {
1840 const char * buff;
1841 // create format specifiers before the debug output
1842 buff = __kmp_str_format(
1843 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1844 traits_t< UT >::spec, traits_t< UT >::spec );
1845 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1846 __kmp_str_free( &buff );
1847 }
1848 #endif
1849 } // if
1850 } // if
1851 } // case
1852 break;
1853
1854 case kmp_sch_guided_iterative_chunked:
1855 {
1856 T chunkspec = pr->u.p.parm1;
1857 KD_TRACE(100,
1858 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1859 trip = pr->u.p.tc;
1860 // Start atomic part of calculations
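                    // Scheme (as implemented below): while plenty of iterations remain, each
                    // claim tries to CAS the shared iteration counter forward by
                    // remaining * parm3, where parm3 holds roughly 1/(K*nproc) (K=2 by
                    // default), so successive chunks shrink geometrically. Once 'remaining'
                    // falls below parm2 (about K*nproc*(chunk+1)), the code falls back to
                    // plain dynamic chunks of 'chunkspec' iterations.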
1861 while(1) {
1862 ST remaining; // signed, because can be < 0
1863 init = sh->u.s.iteration; // shared value
1864 remaining = trip - init;
1865 if ( remaining <= 0 ) { // AC: need to compare with 0 first
1866 // nothing to do, don't try atomic op
1867 status = 0;
1868 break;
1869 }
1870 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1871                            // use dynamic-style schedule
1872                            // atomically increment iterations, get old value
1873 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1874 remaining = trip - init;
1875 if (remaining <= 0) {
1876 status = 0; // all iterations got by other threads
1877 } else {
1878 // got some iterations to work on
1879 status = 1;
1880 if ( (T)remaining > chunkspec ) {
1881 limit = init + chunkspec - 1;
1882 } else {
1883 last = 1; // the last chunk
1884 limit = init + remaining - 1;
1885 } // if
1886 } // if
1887 break;
1888 } // if
1889 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1890 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1891 // CAS was successful, chunk obtained
1892 status = 1;
1893 --limit;
1894 break;
1895 } // if
1896 } // while
1897 if ( status != 0 ) {
1898 start = pr->u.p.lb;
1899 incr = pr->u.p.st;
1900 if ( p_st != NULL )
1901 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001902 *p_lb = start + init * incr;
1903 *p_ub = start + limit * incr;
1904 if ( pr->ordered ) {
1905 pr->u.p.ordered_lower = init;
1906 pr->u.p.ordered_upper = limit;
1907 #ifdef KMP_DEBUG
1908 {
1909 const char * buff;
1910 // create format specifiers before the debug output
1911 buff = __kmp_str_format(
1912 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1913 traits_t< UT >::spec, traits_t< UT >::spec );
1914 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1915 __kmp_str_free( &buff );
1916 }
1917 #endif
1918 } // if
1919 } else {
1920 *p_lb = 0;
1921 *p_ub = 0;
1922 if ( p_st != NULL )
1923 *p_st = 0;
1924 } // if
1925 } // case
1926 break;
1927
1928 case kmp_sch_guided_analytical_chunked:
1929 {
1930 T chunkspec = pr->u.p.parm1;
1931 UT chunkIdx;
1932 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1933 /* for storing original FPCW value for Windows* OS on
1934 IA-32 architecture 8-byte version */
1935 unsigned int oldFpcw;
Jim Cownie181b4bb2013-12-23 17:28:57 +00001936 unsigned int fpcwSet = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001937 #endif
1938 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1939 gtid ) );
1940
1941 trip = pr->u.p.tc;
1942
1943 KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1944 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1945
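                    // Scheme (as implemented below): chunk indices come from an atomic
                    // counter; __kmp_dispatch_guided_remaining() evaluates how many
                    // iterations are left after a given number of exponentially shrinking
                    // chunks, and once chunkIdx reaches parm2 the code switches to fixed
                    // chunks of 'chunkspec' iterations.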
1946 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1947 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1948 if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1949 --trip;
1950 /* use dynamic-style scheduling */
1951 init = chunkIdx * chunkspec + pr->u.p.count;
1952 /* need to verify init > 0 in case of overflow in the above calculation */
1953 if ( (status = (init > 0 && init <= trip)) != 0 ) {
1954 limit = init + chunkspec -1;
1955
1956 if ( (last = (limit >= trip)) != 0 )
1957 limit = trip;
1958 }
1959 break;
1960 } else {
1961 /* use exponential-style scheduling */
1962 /* The following check is to workaround the lack of long double precision on Windows* OS.
1963 This check works around the possible effect that init != 0 for chunkIdx == 0.
1964 */
1965 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1966 /* If we haven't already done so, save original
1967 FPCW and set precision to 64-bit, as Windows* OS
1968 on IA-32 architecture defaults to 53-bit */
1969 if ( !fpcwSet ) {
Jim Cownie181b4bb2013-12-23 17:28:57 +00001970 oldFpcw = _control87(0,0);
1971 _control87(_PC_64,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001972 fpcwSet = 0x30000;
1973 }
1974 #endif
1975 if ( chunkIdx ) {
1976 init = __kmp_dispatch_guided_remaining< T >(
1977 trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1978 KMP_DEBUG_ASSERT(init);
1979 init = trip - init;
1980 } else
1981 init = 0;
1982 limit = trip - __kmp_dispatch_guided_remaining< T >(
1983 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1984 KMP_ASSERT(init <= limit);
1985 if ( init < limit ) {
1986 KMP_DEBUG_ASSERT(limit <= trip);
1987 --limit;
1988 status = 1;
1989 break;
1990 } // if
1991 } // if
1992 } // while (1)
1993 #if KMP_OS_WINDOWS && KMP_ARCH_X86
Jim Cownie181b4bb2013-12-23 17:28:57 +00001994 /* restore FPCW if necessary
1995 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1996 */
1997 if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1998 _control87(oldFpcw,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001999 #endif
2000 if ( status != 0 ) {
2001 start = pr->u.p.lb;
2002 incr = pr->u.p.st;
2003 if ( p_st != NULL )
2004 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002005 *p_lb = start + init * incr;
2006 *p_ub = start + limit * incr;
2007 if ( pr->ordered ) {
2008 pr->u.p.ordered_lower = init;
2009 pr->u.p.ordered_upper = limit;
2010 #ifdef KMP_DEBUG
2011 {
2012 const char * buff;
2013 // create format specifiers before the debug output
2014 buff = __kmp_str_format(
2015 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2016 traits_t< UT >::spec, traits_t< UT >::spec );
2017 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2018 __kmp_str_free( &buff );
2019 }
2020 #endif
2021 }
2022 } else {
2023 *p_lb = 0;
2024 *p_ub = 0;
2025 if ( p_st != NULL )
2026 *p_st = 0;
2027 }
2028 } // case
2029 break;
2030
2031 case kmp_sch_trapezoidal:
2032 {
2033 UT index;
2034 T parm2 = pr->u.p.parm2;
2035 T parm3 = pr->u.p.parm3;
2036 T parm4 = pr->u.p.parm4;
2037 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2038 gtid ) );
2039
2040 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2041
2042 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2043 trip = pr->u.p.tc - 1;
2044
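                    // Trapezoid scheduling (per the formulas above and below): parm2 is the
                    // size of the first chunk, parm4 the amount each subsequent chunk shrinks
                    // by, and parm3 the total number of chunks; 'init' is the arithmetic-series
                    // sum of the first 'index' chunk sizes. Illustrative values: parm2=10,
                    // parm4=2, index=3 give init = (3*(20-4))/2 = 24 and
                    // limit = (4*(20-6))/2 - 1 = 27, i.e. a 4-iteration chunk starting at
                    // iteration 24.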
2045 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2046 *p_lb = 0;
2047 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002048 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002049 } else {
2050 start = pr->u.p.lb;
2051 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2052 incr = pr->u.p.st;
2053
2054 if ( (last = (limit >= trip)) != 0 )
2055 limit = trip;
2056
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002057 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002058
2059 if ( incr == 1 ) {
2060 *p_lb = start + init;
2061 *p_ub = start + limit;
2062 } else {
2063 *p_lb = start + init * incr;
2064 *p_ub = start + limit * incr;
2065 }
2066
2067 if ( pr->ordered ) {
2068 pr->u.p.ordered_lower = init;
2069 pr->u.p.ordered_upper = limit;
2070 #ifdef KMP_DEBUG
2071 {
2072 const char * buff;
2073 // create format specifiers before the debug output
2074 buff = __kmp_str_format(
2075 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2076 traits_t< UT >::spec, traits_t< UT >::spec );
2077 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2078 __kmp_str_free( &buff );
2079 }
2080 #endif
2081 } // if
2082 } // if
2083 } // case
2084 break;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002085 default:
2086 {
2087 status = 0; // to avoid complaints on uninitialized variable use
2088 __kmp_msg(
2089 kmp_ms_fatal, // Severity
2090 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2091 KMP_HNT( GetNewerLibrary ), // Hint
2092 __kmp_msg_null // Variadic argument list terminator
2093 );
2094 }
2095 break;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002096 } // switch
2097 } // if tc == 0;
2098
2099 if ( status == 0 ) {
2100 UT num_done;
2101
2102 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2103 #ifdef KMP_DEBUG
2104 {
2105 const char * buff;
2106 // create format specifiers before the debug output
2107 buff = __kmp_str_format(
2108 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2109 traits_t< UT >::spec );
2110 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2111 __kmp_str_free( &buff );
2112 }
2113 #endif
2114
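            // If this thread is the last one to run out of work (it observed
            // num_done == nproc-1), it resets the shared dispatch buffer and
            // advances buffer_index by KMP_MAX_DISP_BUF, releasing the buffer for
            // reuse by a later dynamically scheduled loop in this team.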
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002115 if ( (ST)num_done == team->t.t_nproc-1 ) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00002116 /* NOTE: release this buffer to be reused */
2117
2118 KMP_MB(); /* Flush all pending memory write invalidates. */
2119
2120 sh->u.s.num_done = 0;
2121 sh->u.s.iteration = 0;
2122
2123 /* TODO replace with general release procedure? */
2124 if ( pr->ordered ) {
2125 sh->u.s.ordered_iteration = 0;
2126 }
2127
2128 KMP_MB(); /* Flush all pending memory write invalidates. */
2129
2130 sh -> buffer_index += KMP_MAX_DISP_BUF;
2131 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2132 gtid, sh->buffer_index) );
2133
2134 KMP_MB(); /* Flush all pending memory write invalidates. */
2135
2136 } // if
2137 if ( __kmp_env_consistency_check ) {
2138 if ( pr->pushed_ws != ct_none ) {
2139 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2140 }
2141 }
2142
2143 th -> th.th_dispatch -> th_deo_fcn = NULL;
2144 th -> th.th_dispatch -> th_dxo_fcn = NULL;
2145 th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2146 th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2147 } // if (status == 0)
2148#if KMP_OS_WINDOWS
2149 else if ( last ) {
2150 pr->u.p.last_upper = pr->u.p.ub;
2151 }
2152#endif /* KMP_OS_WINDOWS */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002153 if ( p_last != NULL && status != 0 )
2154 *p_last = last;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002155 } // if
2156
2157 #ifdef KMP_DEBUG
2158 {
2159 const char * buff;
2160 // create format specifiers before the debug output
2161 buff = __kmp_str_format(
2162 "__kmp_dispatch_next: T#%%d normal case: " \
2163 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2164 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2165 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2166 __kmp_str_free( &buff );
2167 }
2168 #endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002169#if INCLUDE_SSC_MARKS
2170 SSC_MARK_DISPATCH_NEXT();
2171#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00002172 OMPT_LOOP_END;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002173 return status;
2174}
2175
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002176template< typename T >
2177static void
2178__kmp_dist_get_bounds(
2179 ident_t *loc,
2180 kmp_int32 gtid,
2181 kmp_int32 *plastiter,
2182 T *plower,
2183 T *pupper,
2184 typename traits_t< T >::signed_t incr
2185) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002186 typedef typename traits_t< T >::unsigned_t UT;
2187 typedef typename traits_t< T >::signed_t ST;
2188 register kmp_uint32 team_id;
2189 register kmp_uint32 nteams;
2190 register UT trip_count;
2191 register kmp_team_t *team;
2192 kmp_info_t * th;
2193
2194 KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2195 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2196 #ifdef KMP_DEBUG
2197 {
2198 const char * buff;
2199 // create format specifiers before the debug output
2200 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2201 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2202 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2203 traits_t< T >::spec );
2204 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2205 __kmp_str_free( &buff );
2206 }
2207 #endif
2208
2209 if( __kmp_env_consistency_check ) {
2210 if( incr == 0 ) {
2211 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2212 }
2213 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2214 // The loop is illegal.
2215            // Some zero-trip loops are maintained by the compiler, e.g.:
2216 // for(i=10;i<0;++i) // lower >= upper - run-time check
2217 // for(i=0;i>10;--i) // lower <= upper - run-time check
2218 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2219 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2220 // Compiler does not check the following illegal loops:
2221 // for(i=0;i<10;i+=incr) // where incr<0
2222 // for(i=10;i>0;i-=incr) // where incr<0
2223 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2224 }
2225 }
2226 th = __kmp_threads[gtid];
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002227 team = th->th.th_team;
2228 #if OMP_40_ENABLED
Jonathan Peyton441f3372015-09-21 17:24:46 +00002229 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002230 nteams = th->th.th_teams_size.nteams;
2231 #endif
2232 team_id = team->t.t_master_tid;
2233 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2234
2235 // compute global trip count
2236 if( incr == 1 ) {
2237 trip_count = *pupper - *plower + 1;
2238 } else if(incr == -1) {
2239 trip_count = *plower - *pupper + 1;
2240 } else {
2241 trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2242 }
Jonathan Peyton45be4502015-08-11 21:36:41 +00002243
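    // Split of the global iteration space among teams (as computed below):
    // static_balanced gives each team trip_count/nteams iterations, with the first
    // trip_count%nteams teams taking one extra; static_greedy gives every team
    // ceil(trip_count/nteams) and clips the last team's upper bound. Illustrative
    // values: trip_count=10, nteams=4, incr=1 yield per-team sizes 3,3,2,2
    // (balanced) or 3,3,3,1 (greedy).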
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002244 if( trip_count <= nteams ) {
2245 KMP_DEBUG_ASSERT(
2246 __kmp_static == kmp_sch_static_greedy || \
2247 __kmp_static == kmp_sch_static_balanced
2248 ); // Unknown static scheduling type.
2249 // only some teams get single iteration, others get nothing
2250 if( team_id < trip_count ) {
2251 *pupper = *plower = *plower + team_id * incr;
2252 } else {
2253 *plower = *pupper + incr; // zero-trip loop
2254 }
2255 if( plastiter != NULL )
2256 *plastiter = ( team_id == trip_count - 1 );
2257 } else {
2258 if( __kmp_static == kmp_sch_static_balanced ) {
2259 register UT chunk = trip_count / nteams;
2260 register UT extras = trip_count % nteams;
2261 *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2262 *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2263 if( plastiter != NULL )
2264 *plastiter = ( team_id == nteams - 1 );
2265 } else {
2266 register T chunk_inc_count =
2267 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2268 register T upper = *pupper;
2269 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2270 // Unknown static scheduling type.
2271 *plower += team_id * chunk_inc_count;
2272 *pupper = *plower + chunk_inc_count - incr;
2273 // Check/correct bounds if needed
2274 if( incr > 0 ) {
2275 if( *pupper < *plower )
2276 *pupper = i_maxmin< T >::mx;
2277 if( plastiter != NULL )
2278 *plastiter = *plower <= upper && *pupper > upper - incr;
2279 if( *pupper > upper )
2280 *pupper = upper; // tracker C73258
2281 } else {
2282 if( *pupper > *plower )
2283 *pupper = i_maxmin< T >::mn;
2284 if( plastiter != NULL )
2285 *plastiter = *plower >= upper && *pupper < upper - incr;
2286 if( *pupper < upper )
2287 *pupper = upper; // tracker C73258
2288 }
2289 }
2290 }
2291}
2292
Jim Cownie5e8470a2013-09-27 10:38:44 +00002293//-----------------------------------------------------------------------------------------
2294// Dispatch routines
2295// Transfer call to template< type T >
2296// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2297// T lb, T ub, ST st, ST chunk )
2298extern "C" {
2299
2300/*!
2301@ingroup WORK_SHARING
2302@{
2303@param loc Source location
2304@param gtid Global thread id
2305@param schedule Schedule type
2306@param lb Lower bound
2307@param ub Upper bound
2308@param st Step (or increment if you prefer)
2309@param chunk The chunk size to block with
2310
2311This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2312These functions are all identical apart from the types of the arguments.
2313*/
2314
2315void
2316__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2317 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2318{
2319 KMP_DEBUG_ASSERT( __kmp_init_serial );
2320 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2321}
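/*
 * Sketch of how a compiler typically drives this API for
 *     #pragma omp for schedule(dynamic, 4)
 * (illustrative only; actual generated code differs):
 *
 *     kmp_int32 lb, ub, st, last;
 *     __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked, 0, N - 1, 1, 4 );
 *     while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
 *         for ( kmp_int32 i = lb; i <= ub; i += st )
 *             body( i );
 *     }
 */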
2322/*!
2323See @ref __kmpc_dispatch_init_4
2324*/
2325void
2326__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2327 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2328{
2329 KMP_DEBUG_ASSERT( __kmp_init_serial );
2330 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2331}
2332
2333/*!
2334See @ref __kmpc_dispatch_init_4
2335*/
2336void
2337__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2338 kmp_int64 lb, kmp_int64 ub,
2339 kmp_int64 st, kmp_int64 chunk )
2340{
2341 KMP_DEBUG_ASSERT( __kmp_init_serial );
2342 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2343}
2344
2345/*!
2346See @ref __kmpc_dispatch_init_4
2347*/
2348void
2349__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2350 kmp_uint64 lb, kmp_uint64 ub,
2351 kmp_int64 st, kmp_int64 chunk )
2352{
2353 KMP_DEBUG_ASSERT( __kmp_init_serial );
2354 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2355}
2356
2357/*!
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002358See @ref __kmpc_dispatch_init_4
2359
2360These functions differ from the __kmpc_dispatch_init set in that they are
2361called for the composite distribute parallel for construct. Thus, before
2362dispatching the regular iterations, we need to compute the per-team iteration space.
2363
2364These functions are all identical apart from the types of the arguments.
2365*/
2366void
2367__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2368 kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2369{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002370 KMP_DEBUG_ASSERT( __kmp_init_serial );
2371 __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2372 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2373}
2374
2375void
2376__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2377 kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2378{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002379 KMP_DEBUG_ASSERT( __kmp_init_serial );
2380 __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2381 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2382}
2383
2384void
2385__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2386 kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2387{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002388 KMP_DEBUG_ASSERT( __kmp_init_serial );
2389 __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2390 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2391}
2392
2393void
2394__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2395 kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2396{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002397 KMP_DEBUG_ASSERT( __kmp_init_serial );
2398 __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2399 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2400}
2401
2402/*!
Jim Cownie5e8470a2013-09-27 10:38:44 +00002403@param loc Source code location
2404@param gtid Global thread id
2405@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2406@param p_lb Pointer to the lower bound for the next chunk of work
2407@param p_ub Pointer to the upper bound for the next chunk of work
2408@param p_st Pointer to the stride for the next chunk of work
2409@return one if there is work to be done, zero otherwise
2410
2411Get the next dynamically allocated chunk of work for this thread.
2412If there is no more work, then the lb, ub and stride need not be modified.
2413*/
2414int
2415__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2416 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2417{
2418 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2419}
2420
2421/*!
2422See @ref __kmpc_dispatch_next_4
2423*/
2424int
2425__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2426 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2427{
2428 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2429}
2430
2431/*!
2432See @ref __kmpc_dispatch_next_4
2433*/
2434int
2435__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2436 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2437{
2438 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2439}
2440
2441/*!
2442See @ref __kmpc_dispatch_next_4
2443*/
2444int
2445__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2446 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2447{
2448 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2449}
2450
2451/*!
2452@param loc Source code location
2453@param gtid Global thread id
2454
2455Mark the end of a dynamic loop.
2456*/
2457void
2458__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2459{
2460 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2461}
2462
2463/*!
2464See @ref __kmpc_dispatch_fini_4
2465*/
2466void
2467__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2468{
2469 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2470}
2471
2472/*!
2473See @ref __kmpc_dispatch_fini_4
2474*/
2475void
2476__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2477{
2478 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2479}
2480
2481/*!
2482See @ref __kmpc_dispatch_fini_4
2483*/
2484void
2485__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2486{
2487 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2488}
2489/*! @} */
2490
2491//-----------------------------------------------------------------------------------------
2492// Non-template routines from kmp_dispatch.c used in other sources
2493
2494kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2495 return value == checker;
2496}
2497
2498kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2499 return value != checker;
2500}
2501
2502kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2503 return value < checker;
2504}
2505
2506kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2507 return value >= checker;
2508}
2509
2510kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2511 return value <= checker;
2512}
2513kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2514 return value == checker;
2515}
2516
2517kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2518 return value != checker;
2519}
2520
2521kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2522 return value < checker;
2523}
2524
2525kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2526 return value >= checker;
2527}
2528
2529kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2530 return value <= checker;
2531}
2532
2533kmp_uint32
2534__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2535 kmp_uint32 checker,
2536 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2537 , void * obj // Higher-level synchronization object, or NULL.
2538 )
2539{
2540 // note: we may not belong to a team at this point
2541 register volatile kmp_uint32 * spin = spinner;
2542 register kmp_uint32 check = checker;
2543 register kmp_uint32 spins;
2544 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2545 register kmp_uint32 r;
2546
2547 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2548 KMP_INIT_YIELD( spins );
2549 // main wait spin loop
2550 while(!f(r = TCR_4(*spin), check)) {
2551 KMP_FSYNC_SPIN_PREPARE( obj );
2552 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2553 It causes problems with infinite recursion because of exit lock */
2554 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2555 __kmp_abort_thread(); */
2556
Jim Cownie5e8470a2013-09-27 10:38:44 +00002557 /* if we have waited a bit, or are oversubscribed, yield */
2558 /* pause is in the following code */
2559 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2560 KMP_YIELD_SPIN( spins );
2561 }
2562 KMP_FSYNC_SPIN_ACQUIRED( obj );
2563 return r;
2564}
2565
2566kmp_uint64
2567__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2568 kmp_uint64 checker,
2569 kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2570 , void * obj // Higher-level synchronization object, or NULL.
2571 )
2572{
2573 // note: we may not belong to a team at this point
2574 register volatile kmp_uint64 * spin = spinner;
2575 register kmp_uint64 check = checker;
2576 register kmp_uint32 spins;
2577 register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2578 register kmp_uint64 r;
2579
2580 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2581 KMP_INIT_YIELD( spins );
2582 // main wait spin loop
2583 while(!f(r = *spin, check))
2584 {
2585 KMP_FSYNC_SPIN_PREPARE( obj );
2586 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2587 It causes problems with infinite recursion because of exit lock */
2588 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2589 __kmp_abort_thread(); */
2590
Jim Cownie5e8470a2013-09-27 10:38:44 +00002591 // if we are oversubscribed,
2592        // or have waited a bit (and KMP_LIBRARY=throughput), then yield
2593 // pause is in the following code
2594 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2595 KMP_YIELD_SPIN( spins );
2596 }
2597 KMP_FSYNC_SPIN_ACQUIRED( obj );
2598 return r;
2599}
2600
2601} // extern "C"
2602
2603#ifdef KMP_GOMP_COMPAT
2604
2605void
2606__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2607 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2608 kmp_int32 chunk, int push_ws )
2609{
2610 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2611 push_ws );
2612}
2613
2614void
2615__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2616 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2617 kmp_int32 chunk, int push_ws )
2618{
2619 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2620 push_ws );
2621}
2622
2623void
2624__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2625 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2626 kmp_int64 chunk, int push_ws )
2627{
2628 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2629 push_ws );
2630}
2631
2632void
2633__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2634 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2635 kmp_int64 chunk, int push_ws )
2636{
2637 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2638 push_ws );
2639}
2640
2641void
2642__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2643{
2644 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2645}
2646
2647void
2648__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2649{
2650 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2651}
2652
2653void
2654__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2655{
2656 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2657}
2658
2659void
2660__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2661{
2662 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2663}
2664
2665#endif /* KMP_GOMP_COMPAT */
2666
2667/* ------------------------------------------------------------------------ */
2668/* ------------------------------------------------------------------------ */
2669