1/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3 */
4
5
6//===----------------------------------------------------------------------===//
7//
8// The LLVM Compiler Infrastructure
9//
10// This file is dual licensed under the MIT and the University of Illinois Open
11// Source Licenses. See LICENSE.txt for details.
12//
13//===----------------------------------------------------------------------===//
14
15
16/*
17 * Dynamic scheduling initialization and dispatch.
18 *
19 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
20 * it may change values between parallel regions. __kmp_max_nth
21 * is the largest value __kmp_nth may take, 1 is the smallest.
22 *
23 */
24
25/* ------------------------------------------------------------------------ */
26/* ------------------------------------------------------------------------ */
27
28#include "kmp.h"
29#include "kmp_i18n.h"
30#include "kmp_itt.h"
31#include "kmp_str.h"
32#include "kmp_error.h"
33#include "kmp_stats.h"
34#if KMP_OS_WINDOWS && KMP_ARCH_X86
35 #include <float.h>
36#endif
37
38#if OMPT_SUPPORT
39#include "ompt-internal.h"
40#include "ompt-specific.h"
41#endif
42
43/* ------------------------------------------------------------------------ */
44/* ------------------------------------------------------------------------ */
45
46// template for type limits
47template< typename T >
48struct i_maxmin {
49 static const T mx;
50 static const T mn;
51};
52template<>
53struct i_maxmin< int > {
54 static const int mx = 0x7fffffff;
55 static const int mn = 0x80000000;
56};
57template<>
58struct i_maxmin< unsigned int > {
59 static const unsigned int mx = 0xffffffff;
60 static const unsigned int mn = 0x00000000;
61};
62template<>
63struct i_maxmin< long long > {
64 static const long long mx = 0x7fffffffffffffffLL;
65 static const long long mn = 0x8000000000000000LL;
66};
67template<>
68struct i_maxmin< unsigned long long > {
69 static const unsigned long long mx = 0xffffffffffffffffLL;
70 static const unsigned long long mn = 0x0000000000000000LL;
71};
72//-------------------------------------------------------------------------
73
74#ifdef KMP_STATIC_STEAL_ENABLED
75
76 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
77 template< typename T >
78 struct dispatch_private_infoXX_template {
79 typedef typename traits_t< T >::unsigned_t UT;
80 typedef typename traits_t< T >::signed_t ST;
81 UT count; // unsigned
82 T ub;
83 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
84 T lb;
85 ST st; // signed
86 UT tc; // unsigned
87 T static_steal_counter; // for static_steal only; maybe better to put after ub
88
89 /* parm[1-4] are used in different ways by different scheduling algorithms */
90
91 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
92 // a) parm3 is properly aligned and
93 // b) all parm1-4 are in the same cache line.
94 // Because parm1-4 are used together, performance seems to be better
95 // if they are in the same line (not measured though).
96
97 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
98 T parm1;
99 T parm2;
100 T parm3;
101 T parm4;
102 };
103
104 UT ordered_lower; // unsigned
105 UT ordered_upper; // unsigned
106 #if KMP_OS_WINDOWS
107 T last_upper;
108 #endif /* KMP_OS_WINDOWS */
109 };
110
111#else /* KMP_STATIC_STEAL_ENABLED */
112
113 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
114 template< typename T >
115 struct dispatch_private_infoXX_template {
116 typedef typename traits_t< T >::unsigned_t UT;
117 typedef typename traits_t< T >::signed_t ST;
118 T lb;
119 T ub;
120 ST st; // signed
121 UT tc; // unsigned
122
123 T parm1;
124 T parm2;
125 T parm3;
126 T parm4;
127
128 UT count; // unsigned
129
130 UT ordered_lower; // unsigned
131 UT ordered_upper; // unsigned
132 #if KMP_OS_WINDOWS
133 T last_upper;
134 #endif /* KMP_OS_WINDOWS */
135 };
136
137#endif /* KMP_STATIC_STEAL_ENABLED */
138
139// replaces dispatch_private_info structure and dispatch_private_info_t type
140template< typename T >
141struct KMP_ALIGN_CACHE dispatch_private_info_template {
142 // duplicate alignment here, otherwise size of structure is not correct in our compiler
143 union KMP_ALIGN_CACHE private_info_tmpl {
144 dispatch_private_infoXX_template< T > p;
145 dispatch_private_info64_t p64;
146 } u;
147 enum sched_type schedule; /* scheduling algorithm */
148 kmp_uint32 ordered; /* ordered clause specified */
149 kmp_uint32 ordered_bumped;
150 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
151 dispatch_private_info * next; /* stack of buffers for nest of serial regions */
152 kmp_uint32 nomerge; /* don't merge iters if serialized */
153 kmp_uint32 type_size;
154 enum cons_type pushed_ws;
155};
156
157
158// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
159template< typename UT >
160struct dispatch_shared_infoXX_template {
161 /* chunk index under dynamic, number of idle threads under static-steal;
162 iteration index otherwise */
163 volatile UT iteration;
164 volatile UT num_done;
165 volatile UT ordered_iteration;
166 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
167};
168
169// replaces dispatch_shared_info structure and dispatch_shared_info_t type
170template< typename UT >
171struct dispatch_shared_info_template {
172 // we need union here to keep the structure size
173 union shared_info_tmpl {
174 dispatch_shared_infoXX_template< UT > s;
175 dispatch_shared_info64_t s64;
176 } u;
177 volatile kmp_uint32 buffer_index;
178};
179
180/* ------------------------------------------------------------------------ */
181/* ------------------------------------------------------------------------ */
182
183#undef USE_TEST_LOCKS
184
185// test_then_add template (general template should NOT be used)
186template< typename T >
187static __forceinline T
188test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
189
190template<>
191__forceinline kmp_int32
192test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
193{
194 kmp_int32 r;
195 r = KMP_TEST_THEN_ADD32( p, d );
196 return r;
197}
198
199template<>
200__forceinline kmp_int64
201test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
202{
203 kmp_int64 r;
204 r = KMP_TEST_THEN_ADD64( p, d );
205 return r;
206}
207
208// test_then_inc_acq template (general template should NOT be used)
209template< typename T >
210static __forceinline T
211test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
212
213template<>
214__forceinline kmp_int32
215test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
216{
217 kmp_int32 r;
218 r = KMP_TEST_THEN_INC_ACQ32( p );
219 return r;
220}
221
222template<>
223__forceinline kmp_int64
224test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
225{
226 kmp_int64 r;
227 r = KMP_TEST_THEN_INC_ACQ64( p );
228 return r;
229}
230
231// test_then_inc template (general template should NOT be used)
232template< typename T >
233static __forceinline T
234test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
235
236template<>
237__forceinline kmp_int32
238test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
239{
240 kmp_int32 r;
241 r = KMP_TEST_THEN_INC32( p );
242 return r;
243}
244
245template<>
246__forceinline kmp_int64
247test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
248{
249 kmp_int64 r;
250 r = KMP_TEST_THEN_INC64( p );
251 return r;
252}
253
254// compare_and_swap template (general template should NOT be used)
255template< typename T >
256static __forceinline kmp_int32
257compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
258
259template<>
260__forceinline kmp_int32
261compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
262{
263 return KMP_COMPARE_AND_STORE_REL32( p, c, s );
264}
265
266template<>
267__forceinline kmp_int32
268compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
269{
270 return KMP_COMPARE_AND_STORE_REL64( p, c, s );
271}
272
273/*
274 Spin wait loop that first does pause, then yield.
275 Waits until function returns non-zero when called with *spinner and check.
276 Does NOT put threads to sleep.
277#if USE_ITT_BUILD
278 Arguments:
279 obj -- is the higher-level synchronization object to report to ittnotify. It is used to report
280 locks consistently. For example, if the lock is acquired immediately, its address is
281 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
282 immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
283 address, not the address of the low-level spinner.
284#endif // USE_ITT_BUILD
285*/
286template< typename UT >
287// ToDo: make inline function (move to header file for icl)
288static UT // unsigned 4- or 8-byte type
289__kmp_wait_yield( volatile UT * spinner,
290 UT checker,
291 kmp_uint32 (* pred)( UT, UT )
292 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
293 )
294{
295 // note: we may not belong to a team at this point
296 register volatile UT * spin = spinner;
297 register UT check = checker;
298 register kmp_uint32 spins;
299 register kmp_uint32 (*f) ( UT, UT ) = pred;
300 register UT r;
301
302 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
303 KMP_INIT_YIELD( spins );
304 // main wait spin loop
305 while(!f(r = *spin, check))
306 {
307 KMP_FSYNC_SPIN_PREPARE( obj );
308 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
309 It causes problems with infinite recursion because of exit lock */
310 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
311 __kmp_abort_thread(); */
312
313 // if we are oversubscribed,
314 // or have waited a bit (and KMP_LIBRARY=throughput), then yield
315 // pause is in the following code
316 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
317 KMP_YIELD_SPIN( spins );
318 }
319 KMP_FSYNC_SPIN_ACQUIRED( obj );
320 return r;
321}
322
323template< typename UT >
324static kmp_uint32 __kmp_eq( UT value, UT checker) {
325 return value == checker;
326}
327
328template< typename UT >
329static kmp_uint32 __kmp_neq( UT value, UT checker) {
330 return value != checker;
331}
332
333template< typename UT >
334static kmp_uint32 __kmp_lt( UT value, UT checker) {
335 return value < checker;
336}
337
338template< typename UT >
339static kmp_uint32 __kmp_ge( UT value, UT checker) {
340 return value >= checker;
341}
342
343template< typename UT >
344static kmp_uint32 __kmp_le( UT value, UT checker) {
345 return value <= checker;
346}
347
348
349/* ------------------------------------------------------------------------ */
350/* ------------------------------------------------------------------------ */
351
352static void
353__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
354{
355 kmp_info_t *th;
356
357 KMP_DEBUG_ASSERT( gtid_ref );
358
359 if ( __kmp_env_consistency_check ) {
360 th = __kmp_threads[*gtid_ref];
361 if ( th -> th.th_root -> r.r_active
362 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
363#if KMP_USE_DYNAMIC_LOCK
364 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
365#else
366 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
367#endif
368 }
369 }
370}
371
372template< typename UT >
373static void
374__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
375{
376 typedef typename traits_t< UT >::signed_t ST;
377 dispatch_private_info_template< UT > * pr;
378
379 int gtid = *gtid_ref;
380// int cid = *cid_ref;
381 kmp_info_t *th = __kmp_threads[ gtid ];
382 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
383
384 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
385 if ( __kmp_env_consistency_check ) {
386 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
387 ( th -> th.th_dispatch -> th_dispatch_pr_current );
388 if ( pr -> pushed_ws != ct_none ) {
389#if KMP_USE_DYNAMIC_LOCK
390 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
391#else
392 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
393#endif
394 }
395 }
396
397 if ( ! th -> th.th_team -> t.t_serialized ) {
398 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
399 ( th -> th.th_dispatch -> th_dispatch_sh_current );
400 UT lower;
401
402 if ( ! __kmp_env_consistency_check ) {
403 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
404 ( th -> th.th_dispatch -> th_dispatch_pr_current );
405 }
406 lower = pr->u.p.ordered_lower;
407
408 #if ! defined( KMP_GOMP_COMPAT )
409 if ( __kmp_env_consistency_check ) {
410 if ( pr->ordered_bumped ) {
411 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
412 __kmp_error_construct2(
413 kmp_i18n_msg_CnsMultipleNesting,
414 ct_ordered_in_pdo, loc_ref,
415 & p->stack_data[ p->w_top ]
416 );
417 }
418 }
419 #endif /* !defined(KMP_GOMP_COMPAT) */
420
421 KMP_MB();
422 #ifdef KMP_DEBUG
423 {
424 const char * buff;
425 // create format specifiers before the debug output
426 buff = __kmp_str_format(
427 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
428 traits_t< UT >::spec, traits_t< UT >::spec );
429 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
430 __kmp_str_free( &buff );
431 }
432 #endif
433
434 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
435 USE_ITT_BUILD_ARG( NULL )
436 );
437 KMP_MB(); /* is this necessary? */
438 #ifdef KMP_DEBUG
439 {
440 const char * buff;
441 // create format specifiers before the debug output
442 buff = __kmp_str_format(
443 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
444 traits_t< UT >::spec, traits_t< UT >::spec );
445 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
446 __kmp_str_free( &buff );
447 }
448 #endif
449 }
450 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
451}
452
453static void
454__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
455{
456 kmp_info_t *th;
457
458 if ( __kmp_env_consistency_check ) {
459 th = __kmp_threads[*gtid_ref];
460 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
461 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
462 }
463 }
464}
465
466template< typename UT >
467static void
468__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
469{
470 typedef typename traits_t< UT >::signed_t ST;
471 dispatch_private_info_template< UT > * pr;
472
473 int gtid = *gtid_ref;
474// int cid = *cid_ref;
475 kmp_info_t *th = __kmp_threads[ gtid ];
476 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
477
478 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
479 if ( __kmp_env_consistency_check ) {
480 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
481 ( th -> th.th_dispatch -> th_dispatch_pr_current );
482 if ( pr -> pushed_ws != ct_none ) {
483 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
484 }
485 }
486
487 if ( ! th -> th.th_team -> t.t_serialized ) {
488 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
489 ( th -> th.th_dispatch -> th_dispatch_sh_current );
490
491 if ( ! __kmp_env_consistency_check ) {
492 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
493 ( th -> th.th_dispatch -> th_dispatch_pr_current );
494 }
495
496 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
497 #if ! defined( KMP_GOMP_COMPAT )
498 if ( __kmp_env_consistency_check ) {
499 if ( pr->ordered_bumped != 0 ) {
500 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
501 /* How to test it? - OM */
502 __kmp_error_construct2(
503 kmp_i18n_msg_CnsMultipleNesting,
504 ct_ordered_in_pdo, loc_ref,
505 & p->stack_data[ p->w_top ]
506 );
507 }
508 }
509 #endif /* !defined(KMP_GOMP_COMPAT) */
510
511 KMP_MB(); /* Flush all pending memory write invalidates. */
512
513 pr->ordered_bumped += 1;
514
515 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
516 gtid, pr->ordered_bumped ) );
517
518 KMP_MB(); /* Flush all pending memory write invalidates. */
519
520 /* TODO use general release procedure? */
521 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
522
523 KMP_MB(); /* Flush all pending memory write invalidates. */
524 }
525 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
526}
527
528/* Computes and returns x to the power of y, where y must be a non-negative integer */
529template< typename UT >
530static __forceinline long double
531__kmp_pow(long double x, UT y) {
532 long double s=1.0L;
533
534 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
535 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
536 while(y) {
537 if ( y & 1 )
538 s *= x;
539 x *= x;
540 y >>= 1;
541 }
542 return s;
543}
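// Illustration (hypothetical values, not taken from the original source): __kmp_pow(0.75, 5)
// runs the square-and-multiply loop above as 0.75^5 = 0.75 * (0.75^2)^2 = 0.2373046875,
// using O(log y) multiplications instead of y - 1.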
544
545/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
546 (the total number of unassigned iterations in chunks with index greater than or equal to idx).
547 __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
548 (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
549*/
550template< typename T >
551static __inline typename traits_t< T >::unsigned_t
552__kmp_dispatch_guided_remaining(
553 T tc,
554 typename traits_t< T >::floating_t base,
555 typename traits_t< T >::unsigned_t idx
556) {
557 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
558 least for ICL 8.1, long double arithmetic may not really have
559 long double precision, even with /Qlong_double. Currently, we
560 workaround that in the caller code, by manipulating the FPCW for
561 Windows* OS on IA-32 architecture. The lack of precision is not
562 expected to be a correctness issue, though.
563 */
564 typedef typename traits_t< T >::unsigned_t UT;
565
566 long double x = tc * __kmp_pow< UT >(base, idx);
567 UT r = (UT) x;
568 if ( x == r )
569 return r;
570 return r + 1;
571}
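// Worked example (hypothetical values): with tc = 1000 and a base of 1 - 0.5/nproc = 0.875
// (nproc = 4), the estimate of iterations still unassigned after idx = 3 chunks is
// ceil(1000 * 0.875^3) = ceil(669.921875) = 670.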
572
573// Parameters of the guided-iterative algorithm:
574// p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
575// p3 = 1 / ( n * nproc ) // remaining iterations multiplier
576// by default n = 2. For example with n = 3 the chunks distribution will be more flat.
577// With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
578static int guided_int_param = 2;
579static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
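// Worked example (hypothetical values): with nproc = 4 and chunk = 7, the guided-iterative
// setup below stores parm2 = 2 * 4 * (7 + 1) = 64 (the switch-over point to dynamic) and
// parm3 = 0.5 / 4 = 0.125 (the remaining-iterations multiplier).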
580
581// UT - unsigned flavor of T, ST - signed flavor of T,
582// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
583template< typename T >
584static void
585__kmp_dispatch_init(
586 ident_t * loc,
587 int gtid,
588 enum sched_type schedule,
589 T lb,
590 T ub,
591 typename traits_t< T >::signed_t st,
592 typename traits_t< T >::signed_t chunk,
593 int push_ws
594) {
595 typedef typename traits_t< T >::unsigned_t UT;
596 typedef typename traits_t< T >::signed_t ST;
597 typedef typename traits_t< T >::floating_t DBL;
598 static const int ___kmp_size_type = sizeof( UT );
599
600 int active;
601 T tc;
602 kmp_info_t * th;
603 kmp_team_t * team;
604 kmp_uint32 my_buffer_index;
605 dispatch_private_info_template< T > * pr;
606 dispatch_shared_info_template< UT > volatile * sh;
607
608 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
609 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
610
611 if ( ! TCR_4( __kmp_init_parallel ) )
612 __kmp_parallel_initialize();
613
614#if INCLUDE_SSC_MARKS
615 SSC_MARK_DISPATCH_INIT();
616#endif
617 #ifdef KMP_DEBUG
618 {
619 const char * buff;
620 // create format specifiers before the debug output
621 buff = __kmp_str_format(
622 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
623 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
624 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
625 __kmp_str_free( &buff );
626 }
627 #endif
628 /* setup data */
629 th = __kmp_threads[ gtid ];
630 team = th -> th.th_team;
631 active = ! team -> t.t_serialized;
632 th->th.th_ident = loc;
633
634#if USE_ITT_BUILD
635 kmp_uint64 cur_chunk = chunk;
636#endif
637 if ( ! active ) {
638 pr = reinterpret_cast< dispatch_private_info_template< T >* >
639 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
640 } else {
641 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
642 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
643
644 my_buffer_index = th->th.th_dispatch->th_disp_index ++;
645
646 /* What happens when number of threads changes, need to resize buffer? */
647 pr = reinterpret_cast< dispatch_private_info_template< T > * >
648 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
649 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
650 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
651 }
652
653 /* Pick up the nomerge/ordered bits from the scheduling type */
654 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
655 pr->nomerge = TRUE;
656 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
657 } else {
658 pr->nomerge = FALSE;
659 }
660 pr->type_size = ___kmp_size_type; // remember the size of variables
661 if ( kmp_ord_lower & schedule ) {
662 pr->ordered = TRUE;
663 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
664 } else {
665 pr->ordered = FALSE;
666 }
667 if ( schedule == kmp_sch_static ) {
668 schedule = __kmp_static;
669 } else {
670 if ( schedule == kmp_sch_runtime ) {
671 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
672 schedule = team -> t.t_sched.r_sched_type;
673 // Detail the schedule if needed (global controls are differentiated appropriately)
674 if ( schedule == kmp_sch_guided_chunked ) {
675 schedule = __kmp_guided;
676 } else if ( schedule == kmp_sch_static ) {
677 schedule = __kmp_static;
678 }
679 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
680 chunk = team -> t.t_sched.chunk;
681
682 #ifdef KMP_DEBUG
683 {
684 const char * buff;
685 // create format specifiers before the debug output
686 buff = __kmp_str_format(
687 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
688 traits_t< ST >::spec );
689 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
690 __kmp_str_free( &buff );
691 }
692 #endif
693 } else {
694 if ( schedule == kmp_sch_guided_chunked ) {
695 schedule = __kmp_guided;
696 }
697 if ( chunk <= 0 ) {
698 chunk = KMP_DEFAULT_CHUNK;
699 }
700 }
701
702 if ( schedule == kmp_sch_auto ) {
703 // mapping and differentiation: in the __kmp_do_serial_initialize()
704 schedule = __kmp_auto;
705 #ifdef KMP_DEBUG
706 {
707 const char * buff;
708 // create format specifiers before the debug output
709 buff = __kmp_str_format(
710 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
711 traits_t< ST >::spec );
712 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
713 __kmp_str_free( &buff );
714 }
715 #endif
716 }
717
718 /* guided analytical not safe for too many threads */
719 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
720 schedule = kmp_sch_guided_iterative_chunked;
721 KMP_WARNING( DispatchManyThreads );
722 }
723 pr->u.p.parm1 = chunk;
724 }
725 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
726 "unknown scheduling type" );
727
728 pr->u.p.count = 0;
729
730 if ( __kmp_env_consistency_check ) {
731 if ( st == 0 ) {
732 __kmp_error_construct(
733 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
734 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
735 );
736 }
737 }
738
739 tc = ( ub - lb + st );
740 if ( st != 1 ) {
741 if ( st < 0 ) {
742 if ( lb < ub ) {
743 tc = 0; // zero-trip
744 } else { // lb >= ub
745 tc = (ST)tc / st; // convert to signed division
746 }
747 } else { // st > 0
748 if ( ub < lb ) {
749 tc = 0; // zero-trip
750 } else { // lb >= ub
751 tc /= st;
752 }
753 }
754 } else if ( ub < lb ) { // st == 1
755 tc = 0; // zero-trip
756 }
757
758 pr->u.p.lb = lb;
759 pr->u.p.ub = ub;
760 pr->u.p.st = st;
761 pr->u.p.tc = tc;
762
763 #if KMP_OS_WINDOWS
764 pr->u.p.last_upper = ub + st;
765 #endif /* KMP_OS_WINDOWS */
766
767 /* NOTE: only the active parallel region(s) have active ordered sections */
768
769 if ( active ) {
770 if ( pr->ordered == 0 ) {
771 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
772 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
773 } else {
774 pr->ordered_bumped = 0;
775
776 pr->u.p.ordered_lower = 1;
777 pr->u.p.ordered_upper = 0;
778
779 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
780 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
781 }
782 }
783
784 if ( __kmp_env_consistency_check ) {
785 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
786 if ( push_ws ) {
787 __kmp_push_workshare( gtid, ws, loc );
788 pr->pushed_ws = ws;
789 } else {
790 __kmp_check_workshare( gtid, ws, loc );
791 pr->pushed_ws = ct_none;
792 }
793 }
794
795 switch ( schedule ) {
796 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
797 case kmp_sch_static_steal:
798 {
799 T nproc = team->t.t_nproc;
800 T ntc, init;
801
802 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
803
804 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
805 if ( nproc > 1 && ntc >= nproc ) {
806 T id = __kmp_tid_from_gtid(gtid);
807 T small_chunk, extras;
808
809 small_chunk = ntc / nproc;
810 extras = ntc % nproc;
811
812 init = id * small_chunk + ( id < extras ? id : extras );
813 pr->u.p.count = init;
814 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
815
816 pr->u.p.parm2 = lb;
817 //pr->pfields.parm3 = 0; // it's not used in static_steal
818 pr->u.p.parm4 = id;
819 pr->u.p.st = st;
820 break;
821 } else {
822 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
823 gtid ) );
824 schedule = kmp_sch_static_balanced;
825 /* too few iterations: fall-through to kmp_sch_static_balanced */
826 } // if
827 /* FALL-THROUGH to static balanced */
828 } // case
829 #endif
830 case kmp_sch_static_balanced:
831 {
832 T nproc = team->t.t_nproc;
833 T init, limit;
834
835 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
836 gtid ) );
837
838 if ( nproc > 1 ) {
839 T id = __kmp_tid_from_gtid(gtid);
840
841 if ( tc < nproc ) {
842 if ( id < tc ) {
843 init = id;
844 limit = id;
845 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
846 } else {
847 pr->u.p.count = 1; /* means no more chunks to execute */
848 pr->u.p.parm1 = FALSE;
849 break;
850 }
851 } else {
852 T small_chunk = tc / nproc;
853 T extras = tc % nproc;
854 init = id * small_chunk + (id < extras ? id : extras);
855 limit = init + small_chunk - (id < extras ? 0 : 1);
856 pr->u.p.parm1 = (id == nproc - 1);
857 }
858 } else {
859 if ( tc > 0 ) {
860 init = 0;
861 limit = tc - 1;
862 pr->u.p.parm1 = TRUE;
863 } else {
864 // zero trip count
865 pr->u.p.count = 1; /* means no more chunks to execute */
866 pr->u.p.parm1 = FALSE;
867 break;
868 }
869 }
870#if USE_ITT_BUILD
871 // Calculate chunk for metadata report
872 if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) {
873 cur_chunk = limit - init + 1;
874 }
875#endif
876 if ( st == 1 ) {
877 pr->u.p.lb = lb + init;
878 pr->u.p.ub = lb + limit;
879 } else {
880 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
881 pr->u.p.lb = lb + init * st;
882 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
883 if ( st > 0 ) {
884 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
885 } else {
886 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
887 }
888 }
889 if ( pr->ordered ) {
890 pr->u.p.ordered_lower = init;
891 pr->u.p.ordered_upper = limit;
892 }
893 break;
894 } // case
895 case kmp_sch_guided_iterative_chunked :
896 {
897 T nproc = team->t.t_nproc;
898 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
899
900 if ( nproc > 1 ) {
901 if ( (2L * chunk + 1 ) * nproc >= tc ) {
902 /* chunk size too large, switch to dynamic */
903 schedule = kmp_sch_dynamic_chunked;
904 } else {
905 // when remaining iters become less than parm2 - switch to dynamic
906 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
907 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
908 }
909 } else {
910 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
911 schedule = kmp_sch_static_greedy;
912 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
913 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
914 pr->u.p.parm1 = tc;
915 } // if
916 } // case
917 break;
918 case kmp_sch_guided_analytical_chunked:
919 {
920 T nproc = team->t.t_nproc;
921 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
922
923 if ( nproc > 1 ) {
924 if ( (2L * chunk + 1 ) * nproc >= tc ) {
925 /* chunk size too large, switch to dynamic */
926 schedule = kmp_sch_dynamic_chunked;
927 } else {
928 /* commonly used term: (2 nproc - 1)/(2 nproc) */
929 DBL x;
930
931 #if KMP_OS_WINDOWS && KMP_ARCH_X86
932 /* Linux* OS already has 64-bit computation by default for
933 long double, and on Windows* OS on Intel(R) 64,
934 /Qlong_double doesn't work. On Windows* OS
935 on IA-32 architecture, we need to set precision to
936 64-bit instead of the default 53-bit. Even though long
937 double doesn't work on Windows* OS on Intel(R) 64, the
938 resulting lack of precision is not expected to impact
939 the correctness of the algorithm, but this has not been
940 mathematically proven.
941 */
942 // save original FPCW and set precision to 64-bit, as
943 // Windows* OS on IA-32 architecture defaults to 53-bit
944 unsigned int oldFpcw = _control87(0,0);
945 _control87(_PC_64,_MCW_PC); // 0,0x30000
946 #endif
947 /* value used for comparison in solver for cross-over point */
948 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
949
950 /* crossover point--chunk indexes equal to or greater than
951 this point switch to dynamic-style scheduling */
952 UT cross;
953
954 /* commonly used term: (2 nproc - 1)/(2 nproc) */
955 x = (long double)1.0 - (long double)0.5 / nproc;
956
957 #ifdef KMP_DEBUG
958 { // test natural alignment
959 struct _test_a {
960 char a;
961 union {
962 char b;
963 DBL d;
964 };
965 } t;
966 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
967 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
968 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
969 }
970 #endif // KMP_DEBUG
971
972 /* save the term in thread private dispatch structure */
973 *(DBL*)&pr->u.p.parm3 = x;
974
975 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
976 {
977 UT left, right, mid;
978 long double p;
979
980 /* estimate initial upper and lower bound */
981
982 /* doesn't matter what value right is as long as it is positive, but
983 it affects performance of the solver
984 */
985 right = 229;
986 p = __kmp_pow< UT >(x,right);
987 if ( p > target ) {
988 do{
989 p *= p;
990 right <<= 1;
991 } while(p>target && right < (1<<27));
992 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
993 } else {
994 left = 0;
995 }
996
997 /* bisection root-finding method */
998 while ( left + 1 < right ) {
999 mid = (left + right) / 2;
1000 if ( __kmp_pow< UT >(x,mid) > target ) {
1001 left = mid;
1002 } else {
1003 right = mid;
1004 }
1005 } // while
1006 cross = right;
1007 }
1008 /* assert sanity of computed crossover point */
1009 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
1010
1011 /* save the crossover point in thread private dispatch structure */
1012 pr->u.p.parm2 = cross;
1013
1014 // C75803
1015 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1016 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1017 #else
1018 #define GUIDED_ANALYTICAL_WORKAROUND (x)
1019 #endif
1020 /* dynamic-style scheduling offset */
1021 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1022 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1023 // restore FPCW
1024 _control87(oldFpcw,_MCW_PC);
1025 #endif
1026 } // if
1027 } else {
1028 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1029 gtid ) );
1030 schedule = kmp_sch_static_greedy;
1031 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1032 pr->u.p.parm1 = tc;
1033 } // if
1034 } // case
1035 break;
1036 case kmp_sch_static_greedy:
1037 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1038 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1039 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1040 tc;
1041 break;
1042 case kmp_sch_static_chunked :
1043 case kmp_sch_dynamic_chunked :
1044 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1045 break;
1046 case kmp_sch_trapezoidal :
1047 {
1048 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1049
1050 T parm1, parm2, parm3, parm4;
1051 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1052
1053 parm1 = chunk;
1054
1055 /* F : size of the first cycle */
1056 parm2 = ( tc / (2 * team->t.t_nproc) );
1057
1058 if ( parm2 < 1 ) {
1059 parm2 = 1;
1060 }
1061
1062 /* L : size of the last cycle. Make sure the last cycle
1063 * is not larger than the first cycle.
1064 */
1065 if ( parm1 < 1 ) {
1066 parm1 = 1;
1067 } else if ( parm1 > parm2 ) {
1068 parm1 = parm2;
1069 }
1070
1071 /* N : number of cycles */
1072 parm3 = ( parm2 + parm1 );
1073 parm3 = ( 2 * tc + parm3 - 1) / parm3;
1074
1075 if ( parm3 < 2 ) {
1076 parm3 = 2;
1077 }
1078
1079 /* sigma : decreasing incr of the trapezoid */
1080 parm4 = ( parm3 - 1 );
1081 parm4 = ( parm2 - parm1 ) / parm4;
1082
1083 // pointless check, because parm4 >= 0 always
1084 //if ( parm4 < 0 ) {
1085 // parm4 = 0;
1086 //}
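            // Worked example (hypothetical values): tc = 1000, nproc = 4, chunk = 10 give
            // parm1 = 10, parm2 = 1000 / 8 = 125, parm3 = (2000 + 134) / 135 = 15 cycles and
            // parm4 = (125 - 10) / 14 = 8, i.e. chunk sizes 125, 117, 109, ... decreasing by
            // 8 each cycle, down to 13 on the last cycle (never below the minimum of 10).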
1087
1088 pr->u.p.parm1 = parm1;
1089 pr->u.p.parm2 = parm2;
1090 pr->u.p.parm3 = parm3;
1091 pr->u.p.parm4 = parm4;
1092 } // case
1093 break;
1094
1095 default:
1096 {
1097 __kmp_msg(
1098 kmp_ms_fatal, // Severity
1099 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1100 KMP_HNT( GetNewerLibrary ), // Hint
1101 __kmp_msg_null // Variadic argument list terminator
1102 );
1103 }
1104 break;
1105 } // switch
1106 pr->schedule = schedule;
1107 if ( active ) {
1108 /* The name of this buffer should be my_buffer_index when it's free to use it */
1109
1110 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1111 gtid, my_buffer_index, sh->buffer_index) );
1112 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1113 USE_ITT_BUILD_ARG( NULL )
1114 );
1115 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1116 // *always* 32-bit integers.
1117 KMP_MB(); /* is this necessary? */
1118 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1119 gtid, my_buffer_index, sh->buffer_index) );
1120
1121 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1122 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1123#if USE_ITT_BUILD
1124 if ( pr->ordered ) {
1125 __kmp_itt_ordered_init( gtid );
1126 }; // if
1127#endif /* USE_ITT_BUILD */
1128 }; // if
1129
1130#if USE_ITT_BUILD
1131 // Report loop metadata
1132 if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) {
1133 kmp_uint32 tid = __kmp_tid_from_gtid( gtid );
1134 if (KMP_MASTER_TID(tid)) {
1135 kmp_uint64 schedtype = 0;
1136
1137 switch ( schedule ) {
1138 case kmp_sch_static_chunked:
1139 case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1140 break;
1141 case kmp_sch_static_greedy:
1142 cur_chunk = pr->u.p.parm1;
1143 break;
1144 case kmp_sch_dynamic_chunked:
1145 schedtype = 1;
1146 break;
1147 case kmp_sch_guided_iterative_chunked:
1148 case kmp_sch_guided_analytical_chunked:
1149 schedtype = 2;
1150 break;
1151 default:
1152// Should we put this case under "static"?
1153// case kmp_sch_static_steal:
1154 schedtype = 3;
1155 break;
1156 }
1157 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1158 }
1159 }
1160#endif /* USE_ITT_BUILD */
1161
1162 #ifdef KMP_DEBUG
1163 {
1164 const char * buff;
1165 // create format specifiers before the debug output
1166 buff = __kmp_str_format(
1167 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1168 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1169 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1170 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1171 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1172 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1173 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1174 KD_TRACE(10, ( buff,
1175 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1176 pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1177 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1178 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1179 __kmp_str_free( &buff );
1180 }
1181 #endif
1182 #if ( KMP_STATIC_STEAL_ENABLED )
1183 if ( ___kmp_size_type < 8 ) {
1184 // It cannot be guaranteed that after execution of a loop with some other schedule kind
1185 // all the parm3 variables will contain the same value.
1186 // Even if all parm3 values were the same, a bad case would still exist, such as using 0 and 1
1187 // rather than a program life-time increment.
1188 // So the dedicated variable is required. The 'static_steal_counter' is used.
1189 if( schedule == kmp_sch_static_steal ) {
1190 // Other threads will inspect this variable when searching for a victim.
1191 // This is a flag showing that other threads may steal from this thread since then.
1192 volatile T * p = &pr->u.p.static_steal_counter;
1193 *p = *p + 1;
1194 }
1195 }
1196 #endif // ( KMP_STATIC_STEAL_ENABLED )
1197
1198#if OMPT_SUPPORT && OMPT_TRACE
1199 if ((ompt_status == ompt_status_track_callback) &&
1200 ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1201 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1202 ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1203 ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1204 team_info->parallel_id, task_info->task_id, team_info->microtask);
1205 }
1206#endif
1207}
1208
1209/*
1210 * For ordered loops, either __kmp_dispatch_finish() should be called after
1211 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1212 * every chunk of iterations. If the ordered section(s) were not executed
1213 * for this iteration (or every iteration in this chunk), we need to set the
1214 * ordered iteration counters so that the next thread can proceed.
1215 */
1216template< typename UT >
1217static void
1218__kmp_dispatch_finish( int gtid, ident_t *loc )
1219{
1220 typedef typename traits_t< UT >::signed_t ST;
1221 kmp_info_t *th = __kmp_threads[ gtid ];
1222
1223 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1224 if ( ! th -> th.th_team -> t.t_serialized ) {
1225
1226 dispatch_private_info_template< UT > * pr =
1227 reinterpret_cast< dispatch_private_info_template< UT >* >
1228 ( th->th.th_dispatch->th_dispatch_pr_current );
1229 dispatch_shared_info_template< UT > volatile * sh =
1230 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1231 ( th->th.th_dispatch->th_dispatch_sh_current );
1232 KMP_DEBUG_ASSERT( pr );
1233 KMP_DEBUG_ASSERT( sh );
1234 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1235 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1236
1237 if ( pr->ordered_bumped ) {
1238 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1239 gtid ) );
1240 pr->ordered_bumped = 0;
1241 } else {
1242 UT lower = pr->u.p.ordered_lower;
1243
1244 #ifdef KMP_DEBUG
1245 {
1246 const char * buff;
1247 // create format specifiers before the debug output
1248 buff = __kmp_str_format(
1249 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1250 traits_t< UT >::spec, traits_t< UT >::spec );
1251 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1252 __kmp_str_free( &buff );
1253 }
1254 #endif
1255
1256 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1257 USE_ITT_BUILD_ARG(NULL)
1258 );
1259 KMP_MB(); /* is this necessary? */
1260 #ifdef KMP_DEBUG
1261 {
1262 const char * buff;
1263 // create format specifiers before the debug output
1264 buff = __kmp_str_format(
1265 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1266 traits_t< UT >::spec, traits_t< UT >::spec );
1267 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1268 __kmp_str_free( &buff );
1269 }
1270 #endif
1271
1272 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1273 } // if
1274 } // if
1275 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1276}
1277
1278#ifdef KMP_GOMP_COMPAT
1279
1280template< typename UT >
1281static void
1282__kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1283{
1284 typedef typename traits_t< UT >::signed_t ST;
1285 kmp_info_t *th = __kmp_threads[ gtid ];
1286
1287 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1288 if ( ! th -> th.th_team -> t.t_serialized ) {
1289// int cid;
1290 dispatch_private_info_template< UT > * pr =
1291 reinterpret_cast< dispatch_private_info_template< UT >* >
1292 ( th->th.th_dispatch->th_dispatch_pr_current );
1293 dispatch_shared_info_template< UT > volatile * sh =
1294 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1295 ( th->th.th_dispatch->th_dispatch_sh_current );
1296 KMP_DEBUG_ASSERT( pr );
1297 KMP_DEBUG_ASSERT( sh );
1298 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1299 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1300
1301// for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1302 UT lower = pr->u.p.ordered_lower;
1303 UT upper = pr->u.p.ordered_upper;
1304 UT inc = upper - lower + 1;
1305
1306 if ( pr->ordered_bumped == inc ) {
1307 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1308 gtid ) );
1309 pr->ordered_bumped = 0;
1310 } else {
1311 inc -= pr->ordered_bumped;
1312
1313 #ifdef KMP_DEBUG
1314 {
1315 const char * buff;
1316 // create format specifiers before the debug output
1317 buff = __kmp_str_format(
1318 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1319 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1320 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1321 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1322 __kmp_str_free( &buff );
1323 }
1324 #endif
1325
1326 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1327 USE_ITT_BUILD_ARG(NULL)
1328 );
1329
1330 KMP_MB(); /* is this necessary? */
1331 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1332 gtid ) );
1333 pr->ordered_bumped = 0;
1334//!!!!! TODO check if the inc should be unsigned, or signed???
1335 #ifdef KMP_DEBUG
1336 {
1337 const char * buff;
1338 // create format specifiers before the debug output
1339 buff = __kmp_str_format(
1340 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1341 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1342 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1343 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1344 __kmp_str_free( &buff );
1345 }
1346 #endif
1347
1348 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1349 }
1350// }
1351 }
1352 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1353}
1354
1355#endif /* KMP_GOMP_COMPAT */
1356
1357/* Define a macro for exiting __kmp_dispatch_next(). If status is 0
1358 * (no more work), then tell OMPT the loop is over. In some cases
1359 * kmp_dispatch_fini() is not called. */
1360#if OMPT_SUPPORT && OMPT_TRACE
1361#define OMPT_LOOP_END \
1362 if (status == 0) { \
1363 if ((ompt_status == ompt_status_track_callback) && \
1364 ompt_callbacks.ompt_callback(ompt_event_loop_end)) { \
1365 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1366 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); \
1367 ompt_callbacks.ompt_callback(ompt_event_loop_end)( \
1368 team_info->parallel_id, task_info->task_id); \
1369 } \
1370 }
1371#else
1372#define OMPT_LOOP_END // no-op
1373#endif
1374
1375template< typename T >
1376static int
1377__kmp_dispatch_next(
1378 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1379) {
1380
1381 typedef typename traits_t< T >::unsigned_t UT;
1382 typedef typename traits_t< T >::signed_t ST;
1383 typedef typename traits_t< T >::floating_t DBL;
1384 static const int ___kmp_size_type = sizeof( UT );
1385
1386 int status;
1387 dispatch_private_info_template< T > * pr;
1388 kmp_info_t * th = __kmp_threads[ gtid ];
1389 kmp_team_t * team = th -> th.th_team;
1390
1391 KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st ); // AC: these cannot be NULL
1392 #ifdef KMP_DEBUG
1393 {
1394 const char * buff;
1395 // create format specifiers before the debug output
1396 buff = __kmp_str_format(
1397 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1398 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1399 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1400 __kmp_str_free( &buff );
1401 }
1402 #endif
1403
1404 if ( team -> t.t_serialized ) {
1405 /* NOTE: serialize this dispatch because we are not at the active level */
1406 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1407 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1408 KMP_DEBUG_ASSERT( pr );
1409
1410 if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1411 *p_lb = 0;
1412 *p_ub = 0;
1413// if ( p_last != NULL )
1414// *p_last = 0;
1415 if ( p_st != NULL )
1416 *p_st = 0;
1417 if ( __kmp_env_consistency_check ) {
1418 if ( pr->pushed_ws != ct_none ) {
1419 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1420 }
1421 }
1422 } else if ( pr->nomerge ) {
1423 kmp_int32 last;
1424 T start;
1425 UT limit, trip, init;
1426 ST incr;
1427 T chunk = pr->u.p.parm1;
1428
1429 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1430
1431 init = chunk * pr->u.p.count++;
1432 trip = pr->u.p.tc - 1;
1433
1434 if ( (status = (init <= trip)) == 0 ) {
1435 *p_lb = 0;
1436 *p_ub = 0;
1437// if ( p_last != NULL )
1438// *p_last = 0;
1439 if ( p_st != NULL )
1440 *p_st = 0;
1441 if ( __kmp_env_consistency_check ) {
1442 if ( pr->pushed_ws != ct_none ) {
1443 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1444 }
1445 }
1446 } else {
1447 start = pr->u.p.lb;
1448 limit = chunk + init - 1;
1449 incr = pr->u.p.st;
1450
1451 if ( (last = (limit >= trip)) != 0 ) {
1452 limit = trip;
1453 #if KMP_OS_WINDOWS
1454 pr->u.p.last_upper = pr->u.p.ub;
1455 #endif /* KMP_OS_WINDOWS */
1456 }
1457 if ( p_last != NULL )
1458 *p_last = last;
1459 if ( p_st != NULL )
1460 *p_st = incr;
1461 if ( incr == 1 ) {
1462 *p_lb = start + init;
1463 *p_ub = start + limit;
1464 } else {
1465 *p_lb = start + init * incr;
1466 *p_ub = start + limit * incr;
1467 }
1468
1469 if ( pr->ordered ) {
1470 pr->u.p.ordered_lower = init;
1471 pr->u.p.ordered_upper = limit;
1472 #ifdef KMP_DEBUG
1473 {
1474 const char * buff;
1475 // create format specifiers before the debug output
1476 buff = __kmp_str_format(
1477 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1478 traits_t< UT >::spec, traits_t< UT >::spec );
1479 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1480 __kmp_str_free( &buff );
1481 }
1482 #endif
1483 } // if
1484 } // if
1485 } else {
1486 pr->u.p.tc = 0;
1487 *p_lb = pr->u.p.lb;
1488 *p_ub = pr->u.p.ub;
1489 #if KMP_OS_WINDOWS
1490 pr->u.p.last_upper = *p_ub;
1491 #endif /* KMP_OS_WINDOWS */
1492 if ( p_last != NULL )
1493 *p_last = TRUE;
1494 if ( p_st != NULL )
1495 *p_st = pr->u.p.st;
1496 } // if
1497 #ifdef KMP_DEBUG
1498 {
1499 const char * buff;
1500 // create format specifiers before the debug output
1501 buff = __kmp_str_format(
1502 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1503 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1504 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1505 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1506 __kmp_str_free( &buff );
1507 }
1508 #endif
1509#if INCLUDE_SSC_MARKS
1510 SSC_MARK_DISPATCH_NEXT();
1511#endif
1512 OMPT_LOOP_END;
1513 return status;
1514 } else {
1515 kmp_int32 last = 0;
1516 dispatch_shared_info_template< UT > *sh;
1517 T start;
1518 ST incr;
1519 UT limit, trip, init;
1520
1521 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1522 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1523
1524 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1525 ( th->th.th_dispatch->th_dispatch_pr_current );
1526 KMP_DEBUG_ASSERT( pr );
1527 sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1528 ( th->th.th_dispatch->th_dispatch_sh_current );
1529 KMP_DEBUG_ASSERT( sh );
1530
1531 if ( pr->u.p.tc == 0 ) {
1532 // zero trip count
1533 status = 0;
1534 } else {
1535 switch (pr->schedule) {
1536 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1537 case kmp_sch_static_steal:
1538 {
1539 T chunk = pr->u.p.parm1;
1540
1541 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1542
1543 trip = pr->u.p.tc - 1;
1544
1545 if ( ___kmp_size_type > 4 ) {
1546 // Other threads do not look into the data of this thread,
1547 // so it's not necessary to make volatile casting.
1548 // so a volatile cast is not necessary.
1549 status = ( init < (UT)pr->u.p.ub );
1550 } else {
1551 typedef union {
1552 struct {
1553 UT count;
1554 T ub;
1555 } p;
1556 kmp_int64 b;
1557 } union_i4;
1558 // All operations on 'count' or 'ub' must be combined atomically together.
1559 // stealing implemented only for 4-byte indexes
1560 {
1561 union_i4 vold, vnew;
1562 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1563 vnew = vold;
1564 vnew.p.count++;
1565 while( ! KMP_COMPARE_AND_STORE_ACQ64(
1566 ( volatile kmp_int64* )&pr->u.p.count,
1567 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1568 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1569 KMP_CPU_PAUSE();
1570 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1571 vnew = vold;
1572 vnew.p.count++;
1573 }
1574 vnew = vold;
1575 init = vnew.p.count;
1576 status = ( init < (UT)vnew.p.ub ) ;
1577 }
1578
1579 if( !status ) {
1580 kmp_info_t **other_threads = team->t.t_threads;
1581 int while_limit = 10;
1582 int while_index = 0;
1583
1584 // TODO: algorithm of searching for a victim
1585 // should be cleaned up and measured
1586 while ( ( !status ) && ( while_limit != ++while_index ) ) {
1587 union_i4 vold, vnew;
1588 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1589 T victimIdx = pr->u.p.parm4;
1590 T oldVictimIdx = victimIdx;
1591 dispatch_private_info_template< T > * victim;
1592
1593 do {
1594 if( !victimIdx ) {
1595 victimIdx = team->t.t_nproc - 1;
1596 } else {
1597 --victimIdx;
1598 }
1599 victim = reinterpret_cast< dispatch_private_info_template< T >* >
1600 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1601 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1602 // TODO: think about a proper place of this test
1603 if ( ( !victim ) ||
1604 ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1605 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1606 // TODO: delay would be nice
1607 continue;
1608 // the victim is not ready yet to participate in stealing
1609 // because the victim is still in kmp_init_dispatch
1610 }
1611 if ( oldVictimIdx == victimIdx ) {
1612 break;
1613 }
1614 pr->u.p.parm4 = victimIdx;
1615
1616 while( 1 ) {
1617 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1618 vnew = vold;
1619
1620 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1621 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1622 break;
1623 }
1624 vnew.p.ub -= (remaining >> 2);
1625 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1626 #pragma warning( push )
1627 // disable warning on pointless comparison of unsigned with 0
1628 #pragma warning( disable: 186 )
1629 KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1630 #pragma warning( pop )
1631 // TODO: Should this be acquire or release?
1632 if ( KMP_COMPARE_AND_STORE_ACQ64(
1633 ( volatile kmp_int64 * )&victim->u.p.count,
1634 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1635 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1636 status = 1;
1637 while_index = 0;
1638 // now update own count and ub
1639 #if KMP_ARCH_X86
1640 // stealing executed on non-KMP_ARCH_X86 only
1641 // Atomic 64-bit write on ia32 is
1642 // unavailable, so we do this in steps.
1643 // This code is not tested.
1644 init = vold.p.count;
1645 pr->u.p.ub = 0;
1646 pr->u.p.count = init + 1;
1647 pr->u.p.ub = vnew.p.count;
1648 #else
1649 init = vnew.p.ub;
1650 vold.p.count = init + 1;
1651 // TODO: is it safe and enough?
1652 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1653 #endif // KMP_ARCH_X86
1654 break;
1655 } // if
1656 KMP_CPU_PAUSE();
1657 } // while (1)
1658 } // while
1659 } // if
1660 } // if
1661 if ( !status ) {
1662 *p_lb = 0;
1663 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001664 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001665 } else {
1666 start = pr->u.p.parm2;
1667 init *= chunk;
1668 limit = chunk + init - 1;
1669 incr = pr->u.p.st;
1670
1671 KMP_DEBUG_ASSERT(init <= trip);
1672 if ( (last = (limit >= trip)) != 0 )
1673 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001674 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001675
1676 if ( incr == 1 ) {
1677 *p_lb = start + init;
1678 *p_ub = start + limit;
1679 } else {
1680 *p_lb = start + init * incr;
1681 *p_ub = start + limit * incr;
1682 }
1683
1684 if ( pr->ordered ) {
1685 pr->u.p.ordered_lower = init;
1686 pr->u.p.ordered_upper = limit;
1687 #ifdef KMP_DEBUG
1688 {
1689 const char * buff;
1690 // create format specifiers before the debug output
1691 buff = __kmp_str_format(
1692 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1693 traits_t< UT >::spec, traits_t< UT >::spec );
1694 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1695 __kmp_str_free( &buff );
1696 }
1697 #endif
1698 } // if
1699 } // if
1700 break;
1701 } // case
1702 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
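        // A standalone sketch, in plain C++ types, of the 64-bit packing the
        // static_steal case relies on: 'count' and 'ub' share one word, so claiming
        // an iteration locally and stealing part of a victim's range are both single
        // compare-and-swap operations on that word.
        #if 0
        union packed_range {
            struct { unsigned count; unsigned ub; } p;  // same layout idea as union_i4
            long long b;                                // the whole 64-bit word for the CAS
        };
        // claim one iteration locally:  vnew.p.count = vold.p.count + 1;
        // steal from a victim:          vnew.p.ub   -= (vold.p.ub - vold.p.count) / 4;
        // Either update is published only if CAS( &range->b, vold.b, vnew.b ) succeeds,
        // so an owner's claim and a thief's steal can never both be applied to the
        // same snapshot of the range.
        #endif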
1703 case kmp_sch_static_balanced:
1704 {
1705 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
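            // The per-thread [lb, ub] range and the "this thread executes the last
            // iteration" flag (parm1) were precomputed at dispatch initialization;
            // u.p.count acts as a one-shot latch so the range is returned exactly once.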
1706 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1707 pr->u.p.count = 1;
1708 *p_lb = pr->u.p.lb;
1709 *p_ub = pr->u.p.ub;
1710 last = pr->u.p.parm1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001711 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001712 *p_st = pr->u.p.st;
1713 } else { /* no iterations to do */
1714 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1715 }
1716 if ( pr->ordered ) {
1717 #ifdef KMP_DEBUG
1718 {
1719 const char * buff;
1720 // create format specifiers before the debug output
1721 buff = __kmp_str_format(
1722 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1723 traits_t< UT >::spec, traits_t< UT >::spec );
1724 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1725 __kmp_str_free( &buff );
1726 }
1727 #endif
1728 } // if
1729 } // case
1730 break;
1731 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1732 case kmp_sch_static_chunked:
1733 {
1734 T parm1;
1735
1736 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1737 gtid ) );
1738 parm1 = pr->u.p.parm1;
1739
1740 trip = pr->u.p.tc - 1;
1741 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1742
1743 if ( (status = (init <= trip)) != 0 ) {
1744 start = pr->u.p.lb;
1745 incr = pr->u.p.st;
1746 limit = parm1 + init - 1;
1747
1748 if ( (last = (limit >= trip)) != 0 )
1749 limit = trip;
1750
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001751 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001752
1753 pr->u.p.count += team->t.t_nproc;
1754
1755 if ( incr == 1 ) {
1756 *p_lb = start + init;
1757 *p_ub = start + limit;
1758 }
1759 else {
1760 *p_lb = start + init * incr;
1761 *p_ub = start + limit * incr;
1762 }
1763
1764 if ( pr->ordered ) {
1765 pr->u.p.ordered_lower = init;
1766 pr->u.p.ordered_upper = limit;
1767 #ifdef KMP_DEBUG
1768 {
1769 const char * buff;
1770 // create format specifiers before the debug output
1771 buff = __kmp_str_format(
1772 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1773 traits_t< UT >::spec, traits_t< UT >::spec );
1774 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1775 __kmp_str_free( &buff );
1776 }
1777 #endif
1778 } // if
1779 } // if
1780 } // case
1781 break;
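        // A small worked example of the blocked-cyclic hand-out above, using
        // hypothetical numbers and plain unsigned arithmetic (parm1 is the chunk
        // size; u.p.count advances by nproc after every successful call):
        #if 0
        unsigned parm1 = 5, tid = 2, count = 0, nproc = 4, tc = 37;
        unsigned trip  = tc - 1;                 // 36, index of the last iteration
        unsigned init  = parm1 * (count + tid);  // 10, first index of this thread's chunk
        unsigned limit = parm1 + init - 1;       // 14, last index of this thread's chunk
        // next call: count == nproc, so init becomes 5 * (4 + 2) = 30, skipping the
        // chunks handed out to the other three threads.
        #endif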
1782
1783 case kmp_sch_dynamic_chunked:
1784 {
1785 T chunk = pr->u.p.parm1;
1786
1787 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1788 gtid ) );
1789
1790 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1791 trip = pr->u.p.tc - 1;
1792
1793 if ( (status = (init <= trip)) == 0 ) {
1794 *p_lb = 0;
1795 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001796 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001797 } else {
1798 start = pr->u.p.lb;
1799 limit = chunk + init - 1;
1800 incr = pr->u.p.st;
1801
1802 if ( (last = (limit >= trip)) != 0 )
1803 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001804
1805 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001806
1807 if ( incr == 1 ) {
1808 *p_lb = start + init;
1809 *p_ub = start + limit;
1810 } else {
1811 *p_lb = start + init * incr;
1812 *p_ub = start + limit * incr;
1813 }
1814
1815 if ( pr->ordered ) {
1816 pr->u.p.ordered_lower = init;
1817 pr->u.p.ordered_upper = limit;
1818 #ifdef KMP_DEBUG
1819 {
1820 const char * buff;
1821 // create format specifiers before the debug output
1822 buff = __kmp_str_format(
1823 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1824 traits_t< UT >::spec, traits_t< UT >::spec );
1825 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1826 __kmp_str_free( &buff );
1827 }
1828 #endif
1829 } // if
1830 } // if
1831 } // case
1832 break;
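        // The schedule above is essentially a shared ticket counter: each call
        // atomically bumps sh->u.s.iteration, and the old value times the chunk size
        // is the first iteration of the claimed chunk. Sketch with hypothetical numbers:
        #if 0
        unsigned chunk = 8, ticket = 3 /* old value from the atomic increment */, tc = 100;
        unsigned init  = chunk * ticket;      // 24, first iteration of this chunk
        unsigned limit = chunk + init - 1;    // 31, last iteration of this chunk
        int      last  = (limit >= tc - 1);   // 0, more chunks remain
        #endif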
1833
1834 case kmp_sch_guided_iterative_chunked:
1835 {
1836 T chunkspec = pr->u.p.parm1;
1837 KD_TRACE(100,
1838 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1839 trip = pr->u.p.tc;
1840 // Start atomic part of calculations
1841 while(1) {
1842                 ST remaining; // signed, because it can be < 0
1843 init = sh->u.s.iteration; // shared value
1844 remaining = trip - init;
1845 if ( remaining <= 0 ) { // AC: need to compare with 0 first
1846 // nothing to do, don't try atomic op
1847 status = 0;
1848 break;
1849 }
1850 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1851                     // use dynamic-style schedule
1852                     // atomically increment iterations, get old value
1853 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1854 remaining = trip - init;
1855 if (remaining <= 0) {
1856 status = 0; // all iterations got by other threads
1857 } else {
1858 // got some iterations to work on
1859 status = 1;
1860 if ( (T)remaining > chunkspec ) {
1861 limit = init + chunkspec - 1;
1862 } else {
1863 last = 1; // the last chunk
1864 limit = init + remaining - 1;
1865 } // if
1866 } // if
1867 break;
1868 } // if
1869 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1870 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1871 // CAS was successful, chunk obtained
1872 status = 1;
1873 --limit;
1874 break;
1875 } // if
1876 } // while
1877 if ( status != 0 ) {
1878 start = pr->u.p.lb;
1879 incr = pr->u.p.st;
1880 if ( p_st != NULL )
1881 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001882 *p_lb = start + init * incr;
1883 *p_ub = start + limit * incr;
1884 if ( pr->ordered ) {
1885 pr->u.p.ordered_lower = init;
1886 pr->u.p.ordered_upper = limit;
1887 #ifdef KMP_DEBUG
1888 {
1889 const char * buff;
1890 // create format specifiers before the debug output
1891 buff = __kmp_str_format(
1892 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1893 traits_t< UT >::spec, traits_t< UT >::spec );
1894 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1895 __kmp_str_free( &buff );
1896 }
1897 #endif
1898 } // if
1899 } else {
1900 *p_lb = 0;
1901 *p_ub = 0;
1902 if ( p_st != NULL )
1903 *p_st = 0;
1904 } // if
1905 } // case
1906 break;
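        // A sketch of the guided step above, with hypothetical numbers: parm3 holds a
        // precomputed reciprocal (roughly 1/(K*nproc), K = 2 by default per the comments
        // above), so each successful CAS claims about remaining/(K*nproc) iterations,
        // and the schedule falls back to plain dynamic chunks of size chunkspec once
        // few iterations remain.
        #if 0
        unsigned trip = 1000, init = 400;              // 600 iterations still unclaimed
        double   parm3 = 1.0 / (2 * 4);                // assuming K = 2 and nproc = 4
        unsigned limit = init + (unsigned)((trip - init) * parm3);  // init + 75 = 475
        // CAS( &sh->u.s.iteration, init, limit ); on success this thread owns
        // iterations [init, limit - 1], i.e. 400..474.
        #endif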
1907
1908 case kmp_sch_guided_analytical_chunked:
1909 {
1910 T chunkspec = pr->u.p.parm1;
1911 UT chunkIdx;
1912 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1913 /* for storing original FPCW value for Windows* OS on
1914 IA-32 architecture 8-byte version */
1915 unsigned int oldFpcw;
Jim Cownie181b4bb2013-12-23 17:28:57 +00001916 unsigned int fpcwSet = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001917 #endif
1918 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1919 gtid ) );
1920
1921 trip = pr->u.p.tc;
1922
1923 KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1924 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1925
1926 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1927 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1928 if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1929 --trip;
1930 /* use dynamic-style scheduling */
1931 init = chunkIdx * chunkspec + pr->u.p.count;
1932 /* need to verify init > 0 in case of overflow in the above calculation */
1933 if ( (status = (init > 0 && init <= trip)) != 0 ) {
1934 limit = init + chunkspec -1;
1935
1936 if ( (last = (limit >= trip)) != 0 )
1937 limit = trip;
1938 }
1939 break;
1940 } else {
1941 /* use exponential-style scheduling */
1942             /* The following check works around the lack of long double precision on Windows* OS,
1943                which may otherwise cause init != 0 for chunkIdx == 0.
1944             */
1945 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1946 /* If we haven't already done so, save original
1947 FPCW and set precision to 64-bit, as Windows* OS
1948 on IA-32 architecture defaults to 53-bit */
1949 if ( !fpcwSet ) {
Jim Cownie181b4bb2013-12-23 17:28:57 +00001950 oldFpcw = _control87(0,0);
1951 _control87(_PC_64,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001952 fpcwSet = 0x30000;
1953 }
1954 #endif
1955 if ( chunkIdx ) {
1956 init = __kmp_dispatch_guided_remaining< T >(
1957 trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1958 KMP_DEBUG_ASSERT(init);
1959 init = trip - init;
1960 } else
1961 init = 0;
1962 limit = trip - __kmp_dispatch_guided_remaining< T >(
1963 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1964 KMP_ASSERT(init <= limit);
1965 if ( init < limit ) {
1966 KMP_DEBUG_ASSERT(limit <= trip);
1967 --limit;
1968 status = 1;
1969 break;
1970 } // if
1971 } // if
1972 } // while (1)
1973 #if KMP_OS_WINDOWS && KMP_ARCH_X86
Jim Cownie181b4bb2013-12-23 17:28:57 +00001974 /* restore FPCW if necessary
1975 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1976 */
1977 if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1978 _control87(oldFpcw,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001979 #endif
1980 if ( status != 0 ) {
1981 start = pr->u.p.lb;
1982 incr = pr->u.p.st;
1983 if ( p_st != NULL )
1984 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001985 *p_lb = start + init * incr;
1986 *p_ub = start + limit * incr;
1987 if ( pr->ordered ) {
1988 pr->u.p.ordered_lower = init;
1989 pr->u.p.ordered_upper = limit;
1990 #ifdef KMP_DEBUG
1991 {
1992 const char * buff;
1993 // create format specifiers before the debug output
1994 buff = __kmp_str_format(
1995 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1996 traits_t< UT >::spec, traits_t< UT >::spec );
1997 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1998 __kmp_str_free( &buff );
1999 }
2000 #endif
2001 }
2002 } else {
2003 *p_lb = 0;
2004 *p_ub = 0;
2005 if ( p_st != NULL )
2006 *p_st = 0;
2007 }
2008 } // case
2009 break;
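        // The Windows/IA-32 idiom used above, as a standalone sketch: save the x87
        // control word, force 64-bit significand precision for the long double math,
        // then restore the caller's setting (_control87 is declared in <float.h>):
        #if 0
        unsigned int oldFpcw = _control87(0, 0);   // read the current control word
        _control87(_PC_64, _MCW_PC);               // 64-bit precision (default is 53-bit)
        /* ... long double computations ... */
        _control87(oldFpcw, _MCW_PC);              // restore the original precision bits
        #endif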
2010
2011 case kmp_sch_trapezoidal:
2012 {
2013 UT index;
2014 T parm2 = pr->u.p.parm2;
2015 T parm3 = pr->u.p.parm3;
2016 T parm4 = pr->u.p.parm4;
2017 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2018 gtid ) );
2019
2020 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2021
2022 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2023 trip = pr->u.p.tc - 1;
2024
2025 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2026 *p_lb = 0;
2027 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002028 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002029 } else {
2030 start = pr->u.p.lb;
2031 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2032 incr = pr->u.p.st;
2033
2034 if ( (last = (limit >= trip)) != 0 )
2035 limit = trip;
2036
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002037 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002038
2039 if ( incr == 1 ) {
2040 *p_lb = start + init;
2041 *p_ub = start + limit;
2042 } else {
2043 *p_lb = start + init * incr;
2044 *p_ub = start + limit * incr;
2045 }
2046
2047 if ( pr->ordered ) {
2048 pr->u.p.ordered_lower = init;
2049 pr->u.p.ordered_upper = limit;
2050 #ifdef KMP_DEBUG
2051 {
2052 const char * buff;
2053 // create format specifiers before the debug output
2054 buff = __kmp_str_format(
2055 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2056 traits_t< UT >::spec, traits_t< UT >::spec );
2057 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2058 __kmp_str_free( &buff );
2059 }
2060 #endif
2061 } // if
2062 } // if
2063 } // case
2064 break;
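        // With the formulas above, chunk number 'index' contains parm2 - index*parm4
        // iterations, i.e. chunk sizes shrink linearly from the initial size parm2.
        // A worked example with hypothetical values parm2 = 10 and parm4 = 2:
        #if 0
        unsigned parm2 = 10, parm4 = 2, index = 1;
        unsigned init  = (  index    * (2*parm2 - (index-1)*parm4) ) / 2;      // 10
        unsigned limit = ( (index+1) * (2*parm2 -  index   *parm4) ) / 2 - 1;  // 17
        // chunk 0 covered iterations 0..9 (10 of them); chunk 1 covers 10..17 (8 of them).
        #endif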
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002065 default:
2066 {
2067 status = 0; // to avoid complaints on uninitialized variable use
2068 __kmp_msg(
2069 kmp_ms_fatal, // Severity
2070 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2071 KMP_HNT( GetNewerLibrary ), // Hint
2072 __kmp_msg_null // Variadic argument list terminator
2073 );
2074 }
2075 break;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002076 } // switch
2077 } // if tc == 0;
2078
2079 if ( status == 0 ) {
2080 UT num_done;
2081
2082 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2083 #ifdef KMP_DEBUG
2084 {
2085 const char * buff;
2086 // create format specifiers before the debug output
2087 buff = __kmp_str_format(
2088 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2089 traits_t< UT >::spec );
2090 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2091 __kmp_str_free( &buff );
2092 }
2093 #endif
2094
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002095 if ( (ST)num_done == team->t.t_nproc-1 ) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00002096 /* NOTE: release this buffer to be reused */
2097
2098 KMP_MB(); /* Flush all pending memory write invalidates. */
2099
2100 sh->u.s.num_done = 0;
2101 sh->u.s.iteration = 0;
2102
2103 /* TODO replace with general release procedure? */
2104 if ( pr->ordered ) {
2105 sh->u.s.ordered_iteration = 0;
2106 }
2107
2108 KMP_MB(); /* Flush all pending memory write invalidates. */
2109
2110 sh -> buffer_index += KMP_MAX_DISP_BUF;
2111 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2112 gtid, sh->buffer_index) );
2113
2114 KMP_MB(); /* Flush all pending memory write invalidates. */
2115
2116 } // if
2117 if ( __kmp_env_consistency_check ) {
2118 if ( pr->pushed_ws != ct_none ) {
2119 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2120 }
2121 }
2122
2123 th -> th.th_dispatch -> th_deo_fcn = NULL;
2124 th -> th.th_dispatch -> th_dxo_fcn = NULL;
2125 th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2126 th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2127 } // if (status == 0)
2128#if KMP_OS_WINDOWS
2129 else if ( last ) {
2130 pr->u.p.last_upper = pr->u.p.ub;
2131 }
2132#endif /* KMP_OS_WINDOWS */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002133 if ( p_last != NULL && status != 0 )
2134 *p_last = last;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002135 } // if
2136
2137 #ifdef KMP_DEBUG
2138 {
2139 const char * buff;
2140 // create format specifiers before the debug output
2141 buff = __kmp_str_format(
2142 "__kmp_dispatch_next: T#%%d normal case: " \
2143 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2144 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2145 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2146 __kmp_str_free( &buff );
2147 }
2148 #endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002149#if INCLUDE_SSC_MARKS
2150 SSC_MARK_DISPATCH_NEXT();
2151#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00002152 OMPT_LOOP_END;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002153 return status;
2154}
2155
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002156template< typename T >
2157static void
2158__kmp_dist_get_bounds(
2159 ident_t *loc,
2160 kmp_int32 gtid,
2161 kmp_int32 *plastiter,
2162 T *plower,
2163 T *pupper,
2164 typename traits_t< T >::signed_t incr
2165) {
2166 KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
2167 typedef typename traits_t< T >::unsigned_t UT;
2168 typedef typename traits_t< T >::signed_t ST;
2169 register kmp_uint32 team_id;
2170 register kmp_uint32 nteams;
2171 register UT trip_count;
2172 register kmp_team_t *team;
2173 kmp_info_t * th;
2174
2175 KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2176 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2177 #ifdef KMP_DEBUG
2178 {
2179 const char * buff;
2180 // create format specifiers before the debug output
2181 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2182 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2183 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2184 traits_t< T >::spec );
2185 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2186 __kmp_str_free( &buff );
2187 }
2188 #endif
2189
2190 if( __kmp_env_consistency_check ) {
2191 if( incr == 0 ) {
2192 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2193 }
2194 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2195 // The loop is illegal.
2196            // Some zero-trip loops are maintained by the compiler, e.g.:
2197 // for(i=10;i<0;++i) // lower >= upper - run-time check
2198 // for(i=0;i>10;--i) // lower <= upper - run-time check
2199 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2200 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2201 // Compiler does not check the following illegal loops:
2202 // for(i=0;i<10;i+=incr) // where incr<0
2203 // for(i=10;i>0;i-=incr) // where incr<0
2204 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2205 }
2206 }
2207 th = __kmp_threads[gtid];
2208 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2209 team = th->th.th_team;
2210 #if OMP_40_ENABLED
2211 nteams = th->th.th_teams_size.nteams;
2212 #endif
2213 team_id = team->t.t_master_tid;
2214 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2215
2216 // compute global trip count
2217 if( incr == 1 ) {
2218 trip_count = *pupper - *plower + 1;
2219 } else if(incr == -1) {
2220 trip_count = *plower - *pupper + 1;
2221 } else {
2222 trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2223 }
2224 if( trip_count <= nteams ) {
2225 KMP_DEBUG_ASSERT(
2226 __kmp_static == kmp_sch_static_greedy || \
2227 __kmp_static == kmp_sch_static_balanced
2228 ); // Unknown static scheduling type.
2229        // only some teams get a single iteration; the others get nothing
2230 if( team_id < trip_count ) {
2231 *pupper = *plower = *plower + team_id * incr;
2232 } else {
2233 *plower = *pupper + incr; // zero-trip loop
2234 }
2235 if( plastiter != NULL )
2236 *plastiter = ( team_id == trip_count - 1 );
2237 } else {
2238 if( __kmp_static == kmp_sch_static_balanced ) {
2239 register UT chunk = trip_count / nteams;
2240 register UT extras = trip_count % nteams;
2241 *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2242 *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2243 if( plastiter != NULL )
2244 *plastiter = ( team_id == nteams - 1 );
2245 } else {
2246 register T chunk_inc_count =
2247 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2248 register T upper = *pupper;
2249 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2250 // Unknown static scheduling type.
2251 *plower += team_id * chunk_inc_count;
2252 *pupper = *plower + chunk_inc_count - incr;
2253 // Check/correct bounds if needed
2254 if( incr > 0 ) {
2255 if( *pupper < *plower )
2256 *pupper = i_maxmin< T >::mx;
2257 if( plastiter != NULL )
2258 *plastiter = *plower <= upper && *pupper > upper - incr;
2259 if( *pupper > upper )
2260 *pupper = upper; // tracker C73258
2261 } else {
2262 if( *pupper > *plower )
2263 *pupper = i_maxmin< T >::mn;
2264 if( plastiter != NULL )
2265 *plastiter = *plower >= upper && *pupper < upper - incr;
2266 if( *pupper < upper )
2267 *pupper = upper; // tracker C73258
2268 }
2269 }
2270 }
2271}
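// A worked example of the static_balanced split above, with hypothetical numbers:
// trip_count = 10 iterations over nteams = 4 teams gives chunk = 2 and extras = 2,
// so teams 0 and 1 receive 3 iterations each (indices 0-2 and 3-5) and teams 2 and 3
// receive 2 each (6-7 and 8-9); only team 3, which owns the last iteration, sets *plastiter.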
2272
Jim Cownie5e8470a2013-09-27 10:38:44 +00002273//-----------------------------------------------------------------------------------------
2274// Dispatch routines
2275// Transfer call to template< type T >
2276// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2277// T lb, T ub, ST st, ST chunk )
2278extern "C" {
2279
2280/*!
2281@ingroup WORK_SHARING
2282@{
2283@param loc Source location
2284@param gtid Global thread id
2285@param schedule Schedule type
2286@param lb Lower bound
2287@param ub Upper bound
2288@param st Step (or increment if you prefer)
2289@param chunk The chunk size to block with
2290
2291This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2292These functions are all identical apart from the types of the arguments.
2293*/
2294
2295void
2296__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2297 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2298{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002299 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002300 KMP_DEBUG_ASSERT( __kmp_init_serial );
2301 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2302}
2303/*!
2304See @ref __kmpc_dispatch_init_4
2305*/
2306void
2307__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2308 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2309{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002310 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002311 KMP_DEBUG_ASSERT( __kmp_init_serial );
2312 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2313}
2314
2315/*!
2316See @ref __kmpc_dispatch_init_4
2317*/
2318void
2319__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2320 kmp_int64 lb, kmp_int64 ub,
2321 kmp_int64 st, kmp_int64 chunk )
2322{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002323 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002324 KMP_DEBUG_ASSERT( __kmp_init_serial );
2325 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2326}
2327
2328/*!
2329See @ref __kmpc_dispatch_init_4
2330*/
2331void
2332__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2333 kmp_uint64 lb, kmp_uint64 ub,
2334 kmp_int64 st, kmp_int64 chunk )
2335{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002336 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002337 KMP_DEBUG_ASSERT( __kmp_init_serial );
2338 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2339}
2340
2341/*!
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002342See @ref __kmpc_dispatch_init_4
2343
2344These functions differ from the __kmpc_dispatch_init set in that they are
2345called for the composite 'distribute parallel for' construct. Therefore, before
2346dispatching the regular iterations, the per-team iteration space must be computed.
2347
2348These functions are all identical apart from the types of the arguments.
2349*/
2350void
2351__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2352 kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2353{
2354 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2355 KMP_DEBUG_ASSERT( __kmp_init_serial );
2356 __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2357 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2358}
2359
2360void
2361__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2362 kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2363{
2364 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2365 KMP_DEBUG_ASSERT( __kmp_init_serial );
2366 __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2367 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2368}
2369
2370void
2371__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2372 kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2373{
2374 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2375 KMP_DEBUG_ASSERT( __kmp_init_serial );
2376 __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2377 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2378}
2379
2380void
2381__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2382 kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2383{
2384 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2385 KMP_DEBUG_ASSERT( __kmp_init_serial );
2386 __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2387 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2388}
2389
2390/*!
Jim Cownie5e8470a2013-09-27 10:38:44 +00002391@param loc Source code location
2392@param gtid Global thread id
2393@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2394@param p_lb Pointer to the lower bound for the next chunk of work
2395@param p_ub Pointer to the upper bound for the next chunk of work
2396@param p_st Pointer to the stride for the next chunk of work
2397@return one if there is work to be done, zero otherwise
2398
2399Get the next dynamically allocated chunk of work for this thread.
2400If there is no more work, then lb, ub and stride need not be modified.
2401*/
2402int
2403__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2404 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2405{
2406 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2407}
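/*
 A sketch of how a compiler might drive these entry points for a loop such as
 "#pragma omp for schedule(dynamic, 4)" over i = 0 .. N-1 (illustrative only;
 'loc', 'gtid', 'N' and 'body' stand for whatever the generated code already has):

     kmp_int32 lb, ub, st, last;
     __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked, 0, N - 1, 1, 4 );
     while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
         for ( kmp_int32 i = lb; i <= ub; i += st )
             body( i );
     }
*/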
2408
2409/*!
2410See @ref __kmpc_dispatch_next_4
2411*/
2412int
2413__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2414 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2415{
2416 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2417}
2418
2419/*!
2420See @ref __kmpc_dispatch_next_4
2421*/
2422int
2423__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2424 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2425{
2426 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2427}
2428
2429/*!
2430See @ref __kmpc_dispatch_next_4
2431*/
2432int
2433__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2434 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2435{
2436 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2437}
2438
2439/*!
2440@param loc Source code location
2441@param gtid Global thread id
2442
2443Mark the end of a dynamic loop.
2444*/
2445void
2446__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2447{
2448 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2449}
2450
2451/*!
2452See @ref __kmpc_dispatch_fini_4
2453*/
2454void
2455__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2456{
2457 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2458}
2459
2460/*!
2461See @ref __kmpc_dispatch_fini_4
2462*/
2463void
2464__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2465{
2466 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2467}
2468
2469/*!
2470See @ref __kmpc_dispatch_fini_4
2471*/
2472void
2473__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2474{
2475 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2476}
2477/*! @} */
2478
2479//-----------------------------------------------------------------------------------------
2480// Non-template routines from kmp_dispatch.cpp used in other sources
2481
2482kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2483 return value == checker;
2484}
2485
2486kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2487 return value != checker;
2488}
2489
2490kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2491 return value < checker;
2492}
2493
2494kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2495 return value >= checker;
2496}
2497
2498kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2499 return value <= checker;
2500}
2501kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2502 return value == checker;
2503}
2504
2505kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2506 return value != checker;
2507}
2508
2509kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2510 return value < checker;
2511}
2512
2513kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2514 return value >= checker;
2515}
2516
2517kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2518 return value <= checker;
2519}
2520
2521kmp_uint32
2522__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2523 kmp_uint32 checker,
2524 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2525 , void * obj // Higher-level synchronization object, or NULL.
2526 )
2527{
2528 // note: we may not belong to a team at this point
2529 register volatile kmp_uint32 * spin = spinner;
2530 register kmp_uint32 check = checker;
2531 register kmp_uint32 spins;
2532 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2533 register kmp_uint32 r;
2534
2535 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2536 KMP_INIT_YIELD( spins );
2537 // main wait spin loop
2538 while(!f(r = TCR_4(*spin), check)) {
2539 KMP_FSYNC_SPIN_PREPARE( obj );
2540 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2541 It causes problems with infinite recursion because of exit lock */
2542 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2543 __kmp_abort_thread(); */
2544
Jim Cownie5e8470a2013-09-27 10:38:44 +00002545 /* if we have waited a bit, or are oversubscribed, yield */
2546 /* pause is in the following code */
2547 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2548 KMP_YIELD_SPIN( spins );
2549 }
2550 KMP_FSYNC_SPIN_ACQUIRED( obj );
2551 return r;
2552}
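/*
 Typical use (sketch): spin, with yields, until a shared 32-bit location reaches
 an expected value, for example waiting for a dispatch buffer index to come
 around before reusing the buffer:

     __kmp_wait_yield_4( &sh->buffer_index, my_buffer_index, __kmp_eq_4, NULL );

 ('sh' and 'my_buffer_index' are illustrative names only.)
*/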
2553
2554kmp_uint64
2555__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2556 kmp_uint64 checker,
2557 kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2558 , void * obj // Higher-level synchronization object, or NULL.
2559 )
2560{
2561 // note: we may not belong to a team at this point
2562 register volatile kmp_uint64 * spin = spinner;
2563 register kmp_uint64 check = checker;
2564 register kmp_uint32 spins;
2565 register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2566 register kmp_uint64 r;
2567
2568 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2569 KMP_INIT_YIELD( spins );
2570 // main wait spin loop
2571 while(!f(r = *spin, check))
2572 {
2573 KMP_FSYNC_SPIN_PREPARE( obj );
2574 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2575 It causes problems with infinite recursion because of exit lock */
2576 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2577 __kmp_abort_thread(); */
2578
Jim Cownie5e8470a2013-09-27 10:38:44 +00002579 // if we are oversubscribed,
2580        // or have waited a bit (and KMP_LIBRARY=throughput), then yield;
2581        // the pause is in the following code
2582 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2583 KMP_YIELD_SPIN( spins );
2584 }
2585 KMP_FSYNC_SPIN_ACQUIRED( obj );
2586 return r;
2587}
2588
2589} // extern "C"
2590
2591#ifdef KMP_GOMP_COMPAT
2592
2593void
2594__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2595 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2596 kmp_int32 chunk, int push_ws )
2597{
2598 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2599 push_ws );
2600}
2601
2602void
2603__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2604 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2605 kmp_int32 chunk, int push_ws )
2606{
2607 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2608 push_ws );
2609}
2610
2611void
2612__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2613 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2614 kmp_int64 chunk, int push_ws )
2615{
2616 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2617 push_ws );
2618}
2619
2620void
2621__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2622 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2623 kmp_int64 chunk, int push_ws )
2624{
2625 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2626 push_ws );
2627}
2628
2629void
2630__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2631{
2632 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2633}
2634
2635void
2636__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2637{
2638 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2639}
2640
2641void
2642__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2643{
2644 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2645}
2646
2647void
2648__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2649{
2650 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2651}
2652
2653#endif /* KMP_GOMP_COMPAT */
2654
2655/* ------------------------------------------------------------------------ */
2656/* ------------------------------------------------------------------------ */
2657