Jim Cownie5e8470a2013-09-27 10:38:44 +00001/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
Jim Cownie5e8470a2013-09-27 10:38:44 +00003 */
4
5
6//===----------------------------------------------------------------------===//
7//
8// The LLVM Compiler Infrastructure
9//
10// This file is dual licensed under the MIT and the University of Illinois Open
11// Source Licenses. See LICENSE.txt for details.
12//
13//===----------------------------------------------------------------------===//
14
15
16/*
17 * Dynamic scheduling initialization and dispatch.
18 *
 19 * NOTE: __kmp_nth is constant inside any dispatch loop; however, it may
 20 * change between parallel regions. __kmp_max_nth is the largest value
 21 * __kmp_nth may take, and 1 is the smallest.
22 *
23 */
24
25/* ------------------------------------------------------------------------ */
26/* ------------------------------------------------------------------------ */
27
28#include "kmp.h"
29#include "kmp_i18n.h"
30#include "kmp_itt.h"
31#include "kmp_str.h"
32#include "kmp_error.h"
Jim Cownie4cc4bb42014-10-07 16:25:50 +000033#include "kmp_stats.h"
Jim Cownie5e8470a2013-09-27 10:38:44 +000034#if KMP_OS_WINDOWS && KMP_ARCH_X86
35 #include <float.h>
36#endif
37
Andrey Churbanovd7d088f2015-04-29 16:42:24 +000038#if OMPT_SUPPORT
39#include "ompt-internal.h"
40#include "ompt-specific.h"
41#endif
42
Jim Cownie5e8470a2013-09-27 10:38:44 +000043/* ------------------------------------------------------------------------ */
44/* ------------------------------------------------------------------------ */
45
Jim Cownie4cc4bb42014-10-07 16:25:50 +000046// template for type limits
47template< typename T >
48struct i_maxmin {
49 static const T mx;
50 static const T mn;
51};
52template<>
53struct i_maxmin< int > {
54 static const int mx = 0x7fffffff;
55 static const int mn = 0x80000000;
56};
57template<>
58struct i_maxmin< unsigned int > {
59 static const unsigned int mx = 0xffffffff;
60 static const unsigned int mn = 0x00000000;
61};
62template<>
63struct i_maxmin< long long > {
64 static const long long mx = 0x7fffffffffffffffLL;
65 static const long long mn = 0x8000000000000000LL;
66};
67template<>
68struct i_maxmin< unsigned long long > {
69 static const unsigned long long mx = 0xffffffffffffffffLL;
70 static const unsigned long long mn = 0x0000000000000000LL;
71};
72//-------------------------------------------------------------------------
73
Jim Cownie5e8470a2013-09-27 10:38:44 +000074#ifdef KMP_STATIC_STEAL_ENABLED
75
76 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
77 template< typename T >
78 struct dispatch_private_infoXX_template {
79 typedef typename traits_t< T >::unsigned_t UT;
80 typedef typename traits_t< T >::signed_t ST;
81 UT count; // unsigned
82 T ub;
83 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
84 T lb;
85 ST st; // signed
86 UT tc; // unsigned
87 T static_steal_counter; // for static_steal only; maybe better to put after ub
88
89 /* parm[1-4] are used in different ways by different scheduling algorithms */
90
 91 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) that
 92 // a) parm3 is properly aligned and
 93 // b) all parm1-4 are in the same cache line.
 94 // Because parm1-4 are used together, performance seems to be better
 95 // if they are in the same cache line (not measured, though).
96
97 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
98 T parm1;
99 T parm2;
100 T parm3;
101 T parm4;
102 };
103
104 UT ordered_lower; // unsigned
105 UT ordered_upper; // unsigned
106 #if KMP_OS_WINDOWS
107 T last_upper;
108 #endif /* KMP_OS_WINDOWS */
109 };
110
111#else /* KMP_STATIC_STEAL_ENABLED */
112
113 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
114 template< typename T >
115 struct dispatch_private_infoXX_template {
116 typedef typename traits_t< T >::unsigned_t UT;
117 typedef typename traits_t< T >::signed_t ST;
118 T lb;
119 T ub;
120 ST st; // signed
121 UT tc; // unsigned
122
123 T parm1;
124 T parm2;
125 T parm3;
126 T parm4;
127
128 UT count; // unsigned
129
130 UT ordered_lower; // unsigned
131 UT ordered_upper; // unsigned
132 #if KMP_OS_WINDOWS
133 T last_upper;
134 #endif /* KMP_OS_WINDOWS */
135 };
136
137#endif /* KMP_STATIC_STEAL_ENABLED */
138
139// replaces dispatch_private_info structure and dispatch_private_info_t type
140template< typename T >
141struct KMP_ALIGN_CACHE dispatch_private_info_template {
 142 // duplicate the alignment here, otherwise the size of the structure is not correct with our compiler
143 union KMP_ALIGN_CACHE private_info_tmpl {
144 dispatch_private_infoXX_template< T > p;
145 dispatch_private_info64_t p64;
146 } u;
147 enum sched_type schedule; /* scheduling algorithm */
148 kmp_uint32 ordered; /* ordered clause specified */
149 kmp_uint32 ordered_bumped;
 150 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
151 dispatch_private_info * next; /* stack of buffers for nest of serial regions */
152 kmp_uint32 nomerge; /* don't merge iters if serialized */
153 kmp_uint32 type_size;
154 enum cons_type pushed_ws;
155};
156
157
158// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
159template< typename UT >
160struct dispatch_shared_infoXX_template {
161 /* chunk index under dynamic, number of idle threads under static-steal;
162 iteration index otherwise */
163 volatile UT iteration;
164 volatile UT num_done;
165 volatile UT ordered_iteration;
 166 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size after making ordered_iteration scalar
167};
168
169// replaces dispatch_shared_info structure and dispatch_shared_info_t type
170template< typename UT >
171struct dispatch_shared_info_template {
172 // we need union here to keep the structure size
173 union shared_info_tmpl {
174 dispatch_shared_infoXX_template< UT > s;
175 dispatch_shared_info64_t s64;
176 } u;
177 volatile kmp_uint32 buffer_index;
178};
179
180/* ------------------------------------------------------------------------ */
181/* ------------------------------------------------------------------------ */
182
Jim Cownie5e8470a2013-09-27 10:38:44 +0000183#undef USE_TEST_LOCKS
184
185// test_then_add template (general template should NOT be used)
186template< typename T >
187static __forceinline T
188test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
189
190template<>
191__forceinline kmp_int32
192test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
193{
194 kmp_int32 r;
195 r = KMP_TEST_THEN_ADD32( p, d );
196 return r;
197}
198
199template<>
200__forceinline kmp_int64
201test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
202{
203 kmp_int64 r;
204 r = KMP_TEST_THEN_ADD64( p, d );
205 return r;
206}
207
208// test_then_inc_acq template (general template should NOT be used)
209template< typename T >
210static __forceinline T
211test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
212
213template<>
214__forceinline kmp_int32
215test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
216{
217 kmp_int32 r;
218 r = KMP_TEST_THEN_INC_ACQ32( p );
219 return r;
220}
221
222template<>
223__forceinline kmp_int64
224test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
225{
226 kmp_int64 r;
227 r = KMP_TEST_THEN_INC_ACQ64( p );
228 return r;
229}
230
231// test_then_inc template (general template should NOT be used)
232template< typename T >
233static __forceinline T
234test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
235
236template<>
237__forceinline kmp_int32
238test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
239{
240 kmp_int32 r;
241 r = KMP_TEST_THEN_INC32( p );
242 return r;
243}
244
245template<>
246__forceinline kmp_int64
247test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
248{
249 kmp_int64 r;
250 r = KMP_TEST_THEN_INC64( p );
251 return r;
252}
253
254// compare_and_swap template (general template should NOT be used)
255template< typename T >
256static __forceinline kmp_int32
257compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
258
259template<>
260__forceinline kmp_int32
261compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
262{
263 return KMP_COMPARE_AND_STORE_REL32( p, c, s );
264}
265
266template<>
267__forceinline kmp_int32
268compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
269{
270 return KMP_COMPARE_AND_STORE_REL64( p, c, s );
271}
272
273/*
 274 Spin wait loop that first pauses, then yields.
 275 Waits until the predicate returns non-zero when called with *spinner and check.
 276 Does NOT put threads to sleep.
277#if USE_ITT_BUILD
278 Arguments:
Alp Toker8f2d3f02014-02-24 10:40:15 +0000279 obj -- the higher-level synchronization object to report to ittnotify. It is used to report
Jim Cownie5e8470a2013-09-27 10:38:44 +0000280 locks consistently. For example, if a lock is acquired immediately, its address is
281 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
282 immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
283 address, not the address of the low-level spinner.
284#endif // USE_ITT_BUILD
285*/
286template< typename UT >
287// ToDo: make inline function (move to header file for icl)
288static UT // unsigned 4- or 8-byte type
289__kmp_wait_yield( volatile UT * spinner,
290 UT checker,
291 kmp_uint32 (* pred)( UT, UT )
292 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
293 )
294{
295 // note: we may not belong to a team at this point
296 register volatile UT * spin = spinner;
297 register UT check = checker;
298 register kmp_uint32 spins;
299 register kmp_uint32 (*f) ( UT, UT ) = pred;
300 register UT r;
301
302 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
303 KMP_INIT_YIELD( spins );
304 // main wait spin loop
305 while(!f(r = *spin, check))
306 {
307 KMP_FSYNC_SPIN_PREPARE( obj );
308 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
309 It causes problems with infinite recursion because of exit lock */
310 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
311 __kmp_abort_thread(); */
312
Jim Cownie5e8470a2013-09-27 10:38:44 +0000313 // if we are oversubscribed,
314 // or have waited a bit (and KMP_LIBRARY=throughput), then yield;
315 // the pause is in the following code
316 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
317 KMP_YIELD_SPIN( spins );
318 }
319 KMP_FSYNC_SPIN_ACQUIRED( obj );
320 return r;
321}
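/*
    Illustrative usage sketch (not part of the runtime logic): this mirrors the
    call made later in this file by __kmp_dispatch_init, which spins until the
    shared buffer index reaches this thread's buffer index:

        __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index,
                                        __kmp_eq< kmp_uint32 >
                                        USE_ITT_BUILD_ARG( NULL ) );

    The predicate argument (__kmp_eq, __kmp_ge, ...) decides when the spin is
    satisfied; the helpers defined just below supply the common comparisons.
*/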
322
323template< typename UT >
324static kmp_uint32 __kmp_eq( UT value, UT checker) {
325 return value == checker;
326}
327
328template< typename UT >
329static kmp_uint32 __kmp_neq( UT value, UT checker) {
330 return value != checker;
331}
332
333template< typename UT >
334static kmp_uint32 __kmp_lt( UT value, UT checker) {
335 return value < checker;
336}
337
338template< typename UT >
339static kmp_uint32 __kmp_ge( UT value, UT checker) {
340 return value >= checker;
341}
342
343template< typename UT >
344static kmp_uint32 __kmp_le( UT value, UT checker) {
345 return value <= checker;
346}
347
348
349/* ------------------------------------------------------------------------ */
350/* ------------------------------------------------------------------------ */
351
352static void
353__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
354{
355 kmp_info_t *th;
356
357 KMP_DEBUG_ASSERT( gtid_ref );
358
359 if ( __kmp_env_consistency_check ) {
360 th = __kmp_threads[*gtid_ref];
361 if ( th -> th.th_root -> r.r_active
362 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
Andrey Churbanov5c56fb52015-02-20 18:05:17 +0000363#if KMP_USE_DYNAMIC_LOCK
364 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
365#else
Jim Cownie5e8470a2013-09-27 10:38:44 +0000366 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
Andrey Churbanov5c56fb52015-02-20 18:05:17 +0000367#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000368 }
369 }
370}
371
372template< typename UT >
373static void
374__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
375{
376 typedef typename traits_t< UT >::signed_t ST;
377 dispatch_private_info_template< UT > * pr;
378
379 int gtid = *gtid_ref;
380// int cid = *cid_ref;
381 kmp_info_t *th = __kmp_threads[ gtid ];
382 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
383
384 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
385 if ( __kmp_env_consistency_check ) {
386 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
387 ( th -> th.th_dispatch -> th_dispatch_pr_current );
388 if ( pr -> pushed_ws != ct_none ) {
Andrey Churbanov5c56fb52015-02-20 18:05:17 +0000389#if KMP_USE_DYNAMIC_LOCK
390 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
391#else
Jim Cownie5e8470a2013-09-27 10:38:44 +0000392 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
Andrey Churbanov5c56fb52015-02-20 18:05:17 +0000393#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000394 }
395 }
396
397 if ( ! th -> th.th_team -> t.t_serialized ) {
398 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
399 ( th -> th.th_dispatch -> th_dispatch_sh_current );
400 UT lower;
401
402 if ( ! __kmp_env_consistency_check ) {
403 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
404 ( th -> th.th_dispatch -> th_dispatch_pr_current );
405 }
406 lower = pr->u.p.ordered_lower;
407
408 #if ! defined( KMP_GOMP_COMPAT )
409 if ( __kmp_env_consistency_check ) {
410 if ( pr->ordered_bumped ) {
411 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
412 __kmp_error_construct2(
413 kmp_i18n_msg_CnsMultipleNesting,
414 ct_ordered_in_pdo, loc_ref,
415 & p->stack_data[ p->w_top ]
416 );
417 }
418 }
419 #endif /* !defined(KMP_GOMP_COMPAT) */
420
421 KMP_MB();
422 #ifdef KMP_DEBUG
423 {
424 const char * buff;
425 // create format specifiers before the debug output
426 buff = __kmp_str_format(
427 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
428 traits_t< UT >::spec, traits_t< UT >::spec );
429 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
430 __kmp_str_free( &buff );
431 }
432 #endif
433
434 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
435 USE_ITT_BUILD_ARG( NULL )
436 );
437 KMP_MB(); /* is this necessary? */
438 #ifdef KMP_DEBUG
439 {
440 const char * buff;
441 // create format specifiers before the debug output
442 buff = __kmp_str_format(
443 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
444 traits_t< UT >::spec, traits_t< UT >::spec );
445 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
446 __kmp_str_free( &buff );
447 }
448 #endif
449 }
450 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
451}
452
453static void
454__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
455{
456 kmp_info_t *th;
457
458 if ( __kmp_env_consistency_check ) {
459 th = __kmp_threads[*gtid_ref];
460 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
461 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
462 }
463 }
464}
465
466template< typename UT >
467static void
468__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
469{
470 typedef typename traits_t< UT >::signed_t ST;
471 dispatch_private_info_template< UT > * pr;
472
473 int gtid = *gtid_ref;
474// int cid = *cid_ref;
475 kmp_info_t *th = __kmp_threads[ gtid ];
476 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
477
478 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
479 if ( __kmp_env_consistency_check ) {
480 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
481 ( th -> th.th_dispatch -> th_dispatch_pr_current );
482 if ( pr -> pushed_ws != ct_none ) {
483 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
484 }
485 }
486
487 if ( ! th -> th.th_team -> t.t_serialized ) {
488 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
489 ( th -> th.th_dispatch -> th_dispatch_sh_current );
490
491 if ( ! __kmp_env_consistency_check ) {
492 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
493 ( th -> th.th_dispatch -> th_dispatch_pr_current );
494 }
495
496 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
497 #if ! defined( KMP_GOMP_COMPAT )
498 if ( __kmp_env_consistency_check ) {
499 if ( pr->ordered_bumped != 0 ) {
500 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
501 /* How to test it? - OM */
502 __kmp_error_construct2(
503 kmp_i18n_msg_CnsMultipleNesting,
504 ct_ordered_in_pdo, loc_ref,
505 & p->stack_data[ p->w_top ]
506 );
507 }
508 }
509 #endif /* !defined(KMP_GOMP_COMPAT) */
510
511 KMP_MB(); /* Flush all pending memory write invalidates. */
512
513 pr->ordered_bumped += 1;
514
515 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
516 gtid, pr->ordered_bumped ) );
517
518 KMP_MB(); /* Flush all pending memory write invalidates. */
519
520 /* TODO use general release procedure? */
521 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
522
523 KMP_MB(); /* Flush all pending memory write invalidates. */
524 }
525 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
526}
527
528/* Computes and returns x to the power of y, where y must be a non-negative integer */
529template< typename UT >
530static __forceinline long double
531__kmp_pow(long double x, UT y) {
532 long double s=1.0L;
533
534 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
535 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
536 while(y) {
537 if ( y & 1 )
538 s *= x;
539 x *= x;
540 y >>= 1;
541 }
542 return s;
543}
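#if 0
// Illustrative sketch only, excluded from compilation: a plain, non-template
// equivalent of __kmp_pow above (without the debug assert), handy for checking
// values by hand. The name pow_by_squaring_demo is hypothetical and is not
// part of the runtime.
static long double pow_by_squaring_demo( long double x, unsigned long long y )
{
    long double s = 1.0L;
    while ( y ) {
        if ( y & 1 )      // low bit set: fold the current power of x into s
            s *= x;
        x *= x;           // square x for the next bit of y
        y >>= 1;
    }
    return s;             // e.g. pow_by_squaring_demo( 0.5L, 5 ) == 0.03125L
}
#endif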
544
545/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
546 (the total number of unassigned iterations in chunks with index greater than or equal to idx).
 547 __forceinline seems to be broken here: if we __forceinline this function, the behavior is wrong
548 (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
549*/
550template< typename T >
551static __inline typename traits_t< T >::unsigned_t
552__kmp_dispatch_guided_remaining(
553 T tc,
554 typename traits_t< T >::floating_t base,
555 typename traits_t< T >::unsigned_t idx
556) {
557 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
558 least for ICL 8.1, long double arithmetic may not really have
559 long double precision, even with /Qlong_double. Currently, we
560 workaround that in the caller code, by manipulating the FPCW for
561 Windows* OS on IA-32 architecture. The lack of precision is not
562 expected to be a correctness issue, though.
563 */
564 typedef typename traits_t< T >::unsigned_t UT;
565
566 long double x = tc * __kmp_pow< UT >(base, idx);
567 UT r = (UT) x;
568 if ( x == r )
569 return r;
570 return r + 1;
571}
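/* Worked example (hypothetical values, for illustration only): the function
   evaluates ceil( tc * base^idx ). With tc = 1000, base = 0.875 (i.e.
   1 - 0.5/nproc for nproc = 4, as set up by the guided_analytical case below)
   and idx = 3:
       x = 1000 * 0.875^3 = 1000 * 0.669921875 = 669.921875
       r = (UT)x = 669; since x != r the result is r + 1 = 670,
   i.e. about 670 of the 1000 iterations remain unassigned after the first
   three chunks. */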
572
573// Parameters of the guided-iterative algorithm:
574// p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
575// p3 = 1 / ( n * nproc ) // remaining iterations multiplier
 576// By default n = 2. For example, with n = 3 the chunk distribution will be flatter.
 577// With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
578static int guided_int_param = 2;
579static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
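// For example (hypothetical values): with the default n = 2, nproc = 8 and
// chunk = 3, __kmp_dispatch_init below sets
//     parm2 = guided_int_param * nproc * ( chunk + 1 ) = 2 * 8 * 4 = 64
//     parm3 = guided_flt_param / nproc = 0.5 / 8 = 0.0625
// so, per the parameter comments above, each request claims roughly 1/16 of
// the remaining iterations until fewer than 64 remain, after which the
// schedule behaves like dynamic with the given chunk.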
580
581// UT - unsigned flavor of T, ST - signed flavor of T,
582// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
583template< typename T >
584static void
585__kmp_dispatch_init(
586 ident_t * loc,
587 int gtid,
588 enum sched_type schedule,
589 T lb,
590 T ub,
591 typename traits_t< T >::signed_t st,
592 typename traits_t< T >::signed_t chunk,
593 int push_ws
594) {
595 typedef typename traits_t< T >::unsigned_t UT;
596 typedef typename traits_t< T >::signed_t ST;
597 typedef typename traits_t< T >::floating_t DBL;
598 static const int ___kmp_size_type = sizeof( UT );
599
600 int active;
601 T tc;
602 kmp_info_t * th;
603 kmp_team_t * team;
604 kmp_uint32 my_buffer_index;
605 dispatch_private_info_template< T > * pr;
606 dispatch_shared_info_template< UT > volatile * sh;
607
608 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
609 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
610
611 if ( ! TCR_4( __kmp_init_parallel ) )
612 __kmp_parallel_initialize();
613
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000614#if INCLUDE_SSC_MARKS
615 SSC_MARK_DISPATCH_INIT();
616#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000617 #ifdef KMP_DEBUG
618 {
619 const char * buff;
620 // create format specifiers before the debug output
621 buff = __kmp_str_format(
622 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
623 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
624 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
625 __kmp_str_free( &buff );
626 }
627 #endif
628 /* setup data */
629 th = __kmp_threads[ gtid ];
630 team = th -> th.th_team;
631 active = ! team -> t.t_serialized;
632 th->th.th_ident = loc;
633
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000634#if USE_ITT_BUILD
635 kmp_uint64 cur_chunk = chunk;
Andrey Churbanov51aecb82015-05-06 19:22:36 +0000636 int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
637 KMP_MASTER_GTID(gtid) &&
638#if OMP_40_ENABLED
639 th->th.th_teams_microtask == NULL &&
640#endif
641 team->t.t_active_level == 1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000642#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000643 if ( ! active ) {
644 pr = reinterpret_cast< dispatch_private_info_template< T >* >
645 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
646 } else {
647 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
648 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
649
650 my_buffer_index = th->th.th_dispatch->th_disp_index ++;
651
 652 /* What happens when the number of threads changes? Do we need to resize the buffer? */
653 pr = reinterpret_cast< dispatch_private_info_template< T > * >
654 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
655 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
656 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
657 }
658
659 /* Pick up the nomerge/ordered bits from the scheduling type */
660 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
661 pr->nomerge = TRUE;
662 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
663 } else {
664 pr->nomerge = FALSE;
665 }
666 pr->type_size = ___kmp_size_type; // remember the size of variables
667 if ( kmp_ord_lower & schedule ) {
668 pr->ordered = TRUE;
669 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
670 } else {
671 pr->ordered = FALSE;
672 }
Jonathan Peyton45be4502015-08-11 21:36:41 +0000673
Jim Cownie5e8470a2013-09-27 10:38:44 +0000674 if ( schedule == kmp_sch_static ) {
675 schedule = __kmp_static;
676 } else {
677 if ( schedule == kmp_sch_runtime ) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000678 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
679 schedule = team -> t.t_sched.r_sched_type;
680 // Detail the schedule if needed (global controls are differentiated appropriately)
681 if ( schedule == kmp_sch_guided_chunked ) {
682 schedule = __kmp_guided;
683 } else if ( schedule == kmp_sch_static ) {
684 schedule = __kmp_static;
685 }
686 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
687 chunk = team -> t.t_sched.chunk;
Jim Cownie5e8470a2013-09-27 10:38:44 +0000688
689 #ifdef KMP_DEBUG
690 {
691 const char * buff;
692 // create format specifiers before the debug output
693 buff = __kmp_str_format(
694 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
695 traits_t< ST >::spec );
696 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
697 __kmp_str_free( &buff );
698 }
699 #endif
700 } else {
701 if ( schedule == kmp_sch_guided_chunked ) {
702 schedule = __kmp_guided;
703 }
704 if ( chunk <= 0 ) {
705 chunk = KMP_DEFAULT_CHUNK;
706 }
707 }
708
Jim Cownie5e8470a2013-09-27 10:38:44 +0000709 if ( schedule == kmp_sch_auto ) {
710 // mapping and differentiation: in the __kmp_do_serial_initialize()
711 schedule = __kmp_auto;
712 #ifdef KMP_DEBUG
713 {
714 const char * buff;
715 // create format specifiers before the debug output
716 buff = __kmp_str_format(
717 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
718 traits_t< ST >::spec );
719 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
720 __kmp_str_free( &buff );
721 }
722 #endif
723 }
Jim Cownie5e8470a2013-09-27 10:38:44 +0000724
725 /* guided analytical not safe for too many threads */
726 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
727 schedule = kmp_sch_guided_iterative_chunked;
728 KMP_WARNING( DispatchManyThreads );
729 }
730 pr->u.p.parm1 = chunk;
731 }
732 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
733 "unknown scheduling type" );
734
735 pr->u.p.count = 0;
736
737 if ( __kmp_env_consistency_check ) {
738 if ( st == 0 ) {
739 __kmp_error_construct(
740 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
741 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
742 );
743 }
744 }
745
746 tc = ( ub - lb + st );
747 if ( st != 1 ) {
748 if ( st < 0 ) {
749 if ( lb < ub ) {
750 tc = 0; // zero-trip
751 } else { // lb >= ub
752 tc = (ST)tc / st; // convert to signed division
753 }
754 } else { // st > 0
755 if ( ub < lb ) {
756 tc = 0; // zero-trip
757 } else { // lb >= ub
758 tc /= st;
759 }
760 }
761 } else if ( ub < lb ) { // st == 1
762 tc = 0; // zero-trip
763 }
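    /* For example: lb = 0, ub = 9, st = 2 gives tc = ( 9 - 0 + 2 ) = 11 and then
       11 / 2 = 5 iterations (0, 2, 4, 6, 8). With lb = 10, ub = 1, st = -3 the
       unsigned wrap-around is undone by the signed division: tc = 1 - 10 + (-3)
       = -12 (modulo 2^N), and (ST)tc / st = -12 / -3 = 4 iterations (10, 7, 4, 1). */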
764
Jonathan Peyton45be4502015-08-11 21:36:41 +0000765 // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing
766 // when statistics are disabled.
767 if (schedule == __kmp_static)
768 {
769 KMP_COUNT_BLOCK(OMP_FOR_static);
770 KMP_COUNT_VALUE(FOR_static_iterations, tc);
771 }
772 else
773 {
774 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
775 KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
776 }
777
Jim Cownie5e8470a2013-09-27 10:38:44 +0000778 pr->u.p.lb = lb;
779 pr->u.p.ub = ub;
780 pr->u.p.st = st;
781 pr->u.p.tc = tc;
782
783 #if KMP_OS_WINDOWS
784 pr->u.p.last_upper = ub + st;
785 #endif /* KMP_OS_WINDOWS */
786
 787 /* NOTE: only the active parallel region(s) have active ordered sections */
788
789 if ( active ) {
790 if ( pr->ordered == 0 ) {
791 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
792 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
793 } else {
794 pr->ordered_bumped = 0;
795
796 pr->u.p.ordered_lower = 1;
797 pr->u.p.ordered_upper = 0;
798
799 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
800 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
801 }
802 }
803
804 if ( __kmp_env_consistency_check ) {
805 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
806 if ( push_ws ) {
807 __kmp_push_workshare( gtid, ws, loc );
808 pr->pushed_ws = ws;
809 } else {
810 __kmp_check_workshare( gtid, ws, loc );
811 pr->pushed_ws = ct_none;
812 }
813 }
814
815 switch ( schedule ) {
816 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
817 case kmp_sch_static_steal:
818 {
819 T nproc = team->t.t_nproc;
820 T ntc, init;
821
822 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
823
824 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
825 if ( nproc > 1 && ntc >= nproc ) {
826 T id = __kmp_tid_from_gtid(gtid);
827 T small_chunk, extras;
828
829 small_chunk = ntc / nproc;
830 extras = ntc % nproc;
831
832 init = id * small_chunk + ( id < extras ? id : extras );
833 pr->u.p.count = init;
834 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
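            // Worked example (hypothetical values): tc = 100, chunk = 7 gives
            // ntc = 15 chunks; with nproc = 4, small_chunk = 3 and extras = 3,
            // so the initial [count, ub) ranges of chunk indexes are
            //     id 0: [ 0,  4)   id 1: [ 4,  8)   id 2: [ 8, 12)   id 3: [12, 15)
            // (the first 'extras' threads get one extra chunk). Note that count
            // and ub here are measured in chunks, not iterations.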
835
836 pr->u.p.parm2 = lb;
837 //pr->pfields.parm3 = 0; // it's not used in static_steal
838 pr->u.p.parm4 = id;
839 pr->u.p.st = st;
840 break;
841 } else {
842 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
843 gtid ) );
844 schedule = kmp_sch_static_balanced;
845 /* too few iterations: fall-through to kmp_sch_static_balanced */
846 } // if
847 /* FALL-THROUGH to static balanced */
848 } // case
849 #endif
850 case kmp_sch_static_balanced:
851 {
852 T nproc = team->t.t_nproc;
853 T init, limit;
854
855 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
856 gtid ) );
857
858 if ( nproc > 1 ) {
859 T id = __kmp_tid_from_gtid(gtid);
860
861 if ( tc < nproc ) {
862 if ( id < tc ) {
863 init = id;
864 limit = id;
865 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
866 } else {
867 pr->u.p.count = 1; /* means no more chunks to execute */
868 pr->u.p.parm1 = FALSE;
869 break;
870 }
871 } else {
872 T small_chunk = tc / nproc;
873 T extras = tc % nproc;
874 init = id * small_chunk + (id < extras ? id : extras);
875 limit = init + small_chunk - (id < extras ? 0 : 1);
876 pr->u.p.parm1 = (id == nproc - 1);
877 }
878 } else {
879 if ( tc > 0 ) {
880 init = 0;
881 limit = tc - 1;
882 pr->u.p.parm1 = TRUE;
883 } else {
884 // zero trip count
885 pr->u.p.count = 1; /* means no more chunks to execute */
886 pr->u.p.parm1 = FALSE;
887 break;
888 }
889 }
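            // Worked example (hypothetical values): tc = 10, nproc = 4 gives
            // small_chunk = 2 and extras = 2, so the per-thread iteration ranges are
            //     id 0: init 0, limit 2   id 1: init 3, limit 5
            //     id 2: init 6, limit 7   id 3: init 8, limit 9
            // i.e. 3 + 3 + 2 + 2 = 10 iterations, and only id 3 sets parm1 (plastiter).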
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000890#if USE_ITT_BUILD
891 // Calculate chunk for metadata report
Andrey Churbanov51aecb82015-05-06 19:22:36 +0000892 if ( itt_need_metadata_reporting )
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000893 cur_chunk = limit - init + 1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000894#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000895 if ( st == 1 ) {
896 pr->u.p.lb = lb + init;
897 pr->u.p.ub = lb + limit;
898 } else {
899 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
900 pr->u.p.lb = lb + init * st;
901 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
902 if ( st > 0 ) {
903 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
904 } else {
905 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
906 }
907 }
908 if ( pr->ordered ) {
909 pr->u.p.ordered_lower = init;
910 pr->u.p.ordered_upper = limit;
911 }
912 break;
913 } // case
914 case kmp_sch_guided_iterative_chunked :
915 {
916 T nproc = team->t.t_nproc;
917 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
918
919 if ( nproc > 1 ) {
920 if ( (2L * chunk + 1 ) * nproc >= tc ) {
921 /* chunk size too large, switch to dynamic */
922 schedule = kmp_sch_dynamic_chunked;
923 } else {
924 // when remaining iters become less than parm2 - switch to dynamic
925 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
926 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
927 }
928 } else {
929 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
930 schedule = kmp_sch_static_greedy;
931 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
932 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
933 pr->u.p.parm1 = tc;
934 } // if
935 } // case
936 break;
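        // Illustrative numbers (hypothetical): with nproc = 4, chunk = 7 and tc = 1000
        // the guard above keeps the guided schedule because (2*7 + 1) * 4 = 60 < 1000,
        // and this case sets parm2 = 2 * 4 * (7 + 1) = 64 and parm3 = 0.5 / 4 = 0.125;
        // per the parameter comments earlier in this file, each thread then claims
        // roughly 1/8 of the remaining iterations until fewer than 64 remain, after
        // which chunks of 7 are handed out dynamically.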
937 case kmp_sch_guided_analytical_chunked:
938 {
939 T nproc = team->t.t_nproc;
940 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
941
942 if ( nproc > 1 ) {
943 if ( (2L * chunk + 1 ) * nproc >= tc ) {
944 /* chunk size too large, switch to dynamic */
945 schedule = kmp_sch_dynamic_chunked;
946 } else {
947 /* commonly used term: (2 nproc - 1)/(2 nproc) */
948 DBL x;
949
950 #if KMP_OS_WINDOWS && KMP_ARCH_X86
951 /* Linux* OS already has 64-bit computation by default for
952 long double, and on Windows* OS on Intel(R) 64,
953 /Qlong_double doesn't work. On Windows* OS
954 on IA-32 architecture, we need to set precision to
955 64-bit instead of the default 53-bit. Even though long
956 double doesn't work on Windows* OS on Intel(R) 64, the
957 resulting lack of precision is not expected to impact
958 the correctness of the algorithm, but this has not been
959 mathematically proven.
960 */
961 // save original FPCW and set precision to 64-bit, as
962 // Windows* OS on IA-32 architecture defaults to 53-bit
Jim Cownie181b4bb2013-12-23 17:28:57 +0000963 unsigned int oldFpcw = _control87(0,0);
964 _control87(_PC_64,_MCW_PC); // 0,0x30000
Jim Cownie5e8470a2013-09-27 10:38:44 +0000965 #endif
966 /* value used for comparison in solver for cross-over point */
967 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
968
969 /* crossover point--chunk indexes equal to or greater than
970 this point switch to dynamic-style scheduling */
971 UT cross;
972
973 /* commonly used term: (2 nproc - 1)/(2 nproc) */
974 x = (long double)1.0 - (long double)0.5 / nproc;
975
976 #ifdef KMP_DEBUG
977 { // test natural alignment
978 struct _test_a {
979 char a;
980 union {
981 char b;
982 DBL d;
983 };
984 } t;
985 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
986 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
987 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
988 }
989 #endif // KMP_DEBUG
990
991 /* save the term in thread private dispatch structure */
992 *(DBL*)&pr->u.p.parm3 = x;
993
994 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
995 {
996 UT left, right, mid;
997 long double p;
998
999 /* estimate initial upper and lower bound */
1000
1001 /* doesn't matter what value right is as long as it is positive, but
1002 it affects performance of the solver
1003 */
1004 right = 229;
1005 p = __kmp_pow< UT >(x,right);
1006 if ( p > target ) {
1007 do{
1008 p *= p;
1009 right <<= 1;
1010 } while(p>target && right < (1<<27));
1011 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
1012 } else {
1013 left = 0;
1014 }
1015
1016 /* bisection root-finding method */
1017 while ( left + 1 < right ) {
1018 mid = (left + right) / 2;
1019 if ( __kmp_pow< UT >(x,mid) > target ) {
1020 left = mid;
1021 } else {
1022 right = mid;
1023 }
1024 } // while
1025 cross = right;
1026 }
1027 /* assert sanity of computed crossover point */
1028 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
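                /* Worked example (hypothetical values): nproc = 4, chunk = 7, tc = 1000.
                   Then x = 1 - 0.5/4 = 0.875 and target = (2*7 + 1) * 4 / 1000 = 0.06.
                   Since 0.875^21 is roughly 0.0606 > 0.06 and 0.875^22 is roughly
                   0.0530 <= 0.06, the bisection above converges to cross = 22: chunk
                   indexes below 22 use the geometrically shrinking guided sizes, and
                   indexes from 22 on fall back to plain chunks of size 7. */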
1029
1030 /* save the crossover point in thread private dispatch structure */
1031 pr->u.p.parm2 = cross;
1032
1033 // C75803
1034 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1035 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1036 #else
1037 #define GUIDED_ANALYTICAL_WORKAROUND (x)
1038 #endif
1039 /* dynamic-style scheduling offset */
1040 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1041 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1042 // restore FPCW
Jim Cownie181b4bb2013-12-23 17:28:57 +00001043 _control87(oldFpcw,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001044 #endif
1045 } // if
1046 } else {
1047 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1048 gtid ) );
1049 schedule = kmp_sch_static_greedy;
1050 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1051 pr->u.p.parm1 = tc;
1052 } // if
1053 } // case
1054 break;
1055 case kmp_sch_static_greedy:
1056 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1057 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1058 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1059 tc;
1060 break;
1061 case kmp_sch_static_chunked :
1062 case kmp_sch_dynamic_chunked :
Jonathan Peyton70bda912015-11-06 20:32:44 +00001063 if ( pr->u.p.parm1 <= 0 ) {
1064 pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
1065 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00001066 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1067 break;
1068 case kmp_sch_trapezoidal :
1069 {
1070 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1071
1072 T parm1, parm2, parm3, parm4;
1073 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1074
1075 parm1 = chunk;
1076
1077 /* F : size of the first cycle */
1078 parm2 = ( tc / (2 * team->t.t_nproc) );
1079
1080 if ( parm2 < 1 ) {
1081 parm2 = 1;
1082 }
1083
1084 /* L : size of the last cycle. Make sure the last cycle
1085 * is not larger than the first cycle.
1086 */
1087 if ( parm1 < 1 ) {
1088 parm1 = 1;
1089 } else if ( parm1 > parm2 ) {
1090 parm1 = parm2;
1091 }
1092
1093 /* N : number of cycles */
1094 parm3 = ( parm2 + parm1 );
1095 parm3 = ( 2 * tc + parm3 - 1) / parm3;
1096
1097 if ( parm3 < 2 ) {
1098 parm3 = 2;
1099 }
1100
1101 /* sigma : decreasing incr of the trapezoid */
1102 parm4 = ( parm3 - 1 );
1103 parm4 = ( parm2 - parm1 ) / parm4;
1104
1105 // pointless check, because parm4 >= 0 always
1106 //if ( parm4 < 0 ) {
1107 // parm4 = 0;
1108 //}
1109
1110 pr->u.p.parm1 = parm1;
1111 pr->u.p.parm2 = parm2;
1112 pr->u.p.parm3 = parm3;
1113 pr->u.p.parm4 = parm4;
1114 } // case
1115 break;
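    // Worked example (hypothetical values): tc = 1000, nproc = 4, chunk = 1 gives
    // F = parm2 = 1000/8 = 125, L = parm1 = 1, N = parm3 = (2000 + 125)/126 = 16
    // cycles and sigma = parm4 = (125 - 1)/15 = 8, i.e. chunk sizes of roughly
    // 125, 117, 109, ... shrinking by 8 each cycle; 16 * (125 + 5)/2 = 1040 >= 1000,
    // so the final chunk is simply truncated at the trip count.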
1116
1117 default:
1118 {
1119 __kmp_msg(
1120 kmp_ms_fatal, // Severity
1121 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1122 KMP_HNT( GetNewerLibrary ), // Hint
1123 __kmp_msg_null // Variadic argument list terminator
1124 );
1125 }
1126 break;
1127 } // switch
1128 pr->schedule = schedule;
1129 if ( active ) {
 1130 /* The buffer becomes free to use when sh->buffer_index reaches my_buffer_index */
1131
1132 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1133 gtid, my_buffer_index, sh->buffer_index) );
1134 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1135 USE_ITT_BUILD_ARG( NULL )
1136 );
1137 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1138 // *always* 32-bit integers.
1139 KMP_MB(); /* is this necessary? */
1140 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1141 gtid, my_buffer_index, sh->buffer_index) );
1142
1143 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1144 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1145#if USE_ITT_BUILD
1146 if ( pr->ordered ) {
1147 __kmp_itt_ordered_init( gtid );
1148 }; // if
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001149 // Report loop metadata
1150 if ( itt_need_metadata_reporting ) {
1151 // Only report metadata by master of active team at level 1
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001152 kmp_uint64 schedtype = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001153 switch ( schedule ) {
1154 case kmp_sch_static_chunked:
1155 case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1156 break;
1157 case kmp_sch_static_greedy:
1158 cur_chunk = pr->u.p.parm1;
1159 break;
1160 case kmp_sch_dynamic_chunked:
1161 schedtype = 1;
1162 break;
1163 case kmp_sch_guided_iterative_chunked:
1164 case kmp_sch_guided_analytical_chunked:
1165 schedtype = 2;
1166 break;
1167 default:
1168// Should we put this case under "static"?
1169// case kmp_sch_static_steal:
1170 schedtype = 3;
1171 break;
1172 }
1173 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1174 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001175#endif /* USE_ITT_BUILD */
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001176 }; // if
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001177
Jim Cownie5e8470a2013-09-27 10:38:44 +00001178 #ifdef KMP_DEBUG
1179 {
1180 const char * buff;
1181 // create format specifiers before the debug output
1182 buff = __kmp_str_format(
1183 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1184 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1185 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1186 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1187 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1188 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1189 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1190 KD_TRACE(10, ( buff,
1191 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1192 pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1193 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1194 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1195 __kmp_str_free( &buff );
1196 }
1197 #endif
1198 #if ( KMP_STATIC_STEAL_ENABLED )
1199 if ( ___kmp_size_type < 8 ) {
1200 // It cannot be guaranteed that after execution of a loop with some other schedule kind
1201 // all the parm3 variables will contain the same value.
 1202 // Even if all parm3 values were the same, a bad case could still exist, such as using 0 and 1
 1203 // rather than a program-lifetime increment.
 1204 // So a dedicated variable is required; 'static_steal_counter' is used.
1205 if( schedule == kmp_sch_static_steal ) {
1206 // Other threads will inspect this variable when searching for a victim.
1207 // This is a flag showing that other threads may steal from this thread since then.
1208 volatile T * p = &pr->u.p.static_steal_counter;
1209 *p = *p + 1;
1210 }
1211 }
 1212 #endif // ( KMP_STATIC_STEAL_ENABLED )
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001213
1214#if OMPT_SUPPORT && OMPT_TRACE
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001215 if (ompt_enabled &&
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001216 ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1217 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1218 ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1219 ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1220 team_info->parallel_id, task_info->task_id, team_info->microtask);
1221 }
1222#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +00001223}
1224
1225/*
1226 * For ordered loops, either __kmp_dispatch_finish() should be called after
1227 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1228 * every chunk of iterations. If the ordered section(s) were not executed
1229 * for this iteration (or every iteration in this chunk), we need to set the
1230 * ordered iteration counters so that the next thread can proceed.
1231 */
1232template< typename UT >
1233static void
1234__kmp_dispatch_finish( int gtid, ident_t *loc )
1235{
1236 typedef typename traits_t< UT >::signed_t ST;
1237 kmp_info_t *th = __kmp_threads[ gtid ];
1238
1239 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1240 if ( ! th -> th.th_team -> t.t_serialized ) {
1241
1242 dispatch_private_info_template< UT > * pr =
1243 reinterpret_cast< dispatch_private_info_template< UT >* >
1244 ( th->th.th_dispatch->th_dispatch_pr_current );
1245 dispatch_shared_info_template< UT > volatile * sh =
1246 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1247 ( th->th.th_dispatch->th_dispatch_sh_current );
1248 KMP_DEBUG_ASSERT( pr );
1249 KMP_DEBUG_ASSERT( sh );
1250 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1251 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1252
1253 if ( pr->ordered_bumped ) {
1254 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1255 gtid ) );
1256 pr->ordered_bumped = 0;
1257 } else {
1258 UT lower = pr->u.p.ordered_lower;
1259
1260 #ifdef KMP_DEBUG
1261 {
1262 const char * buff;
1263 // create format specifiers before the debug output
1264 buff = __kmp_str_format(
1265 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1266 traits_t< UT >::spec, traits_t< UT >::spec );
1267 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1268 __kmp_str_free( &buff );
1269 }
1270 #endif
1271
1272 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1273 USE_ITT_BUILD_ARG(NULL)
1274 );
1275 KMP_MB(); /* is this necessary? */
1276 #ifdef KMP_DEBUG
1277 {
1278 const char * buff;
1279 // create format specifiers before the debug output
1280 buff = __kmp_str_format(
1281 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1282 traits_t< UT >::spec, traits_t< UT >::spec );
1283 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1284 __kmp_str_free( &buff );
1285 }
1286 #endif
1287
1288 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1289 } // if
1290 } // if
1291 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1292}
1293
1294#ifdef KMP_GOMP_COMPAT
1295
1296template< typename UT >
1297static void
1298__kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1299{
1300 typedef typename traits_t< UT >::signed_t ST;
1301 kmp_info_t *th = __kmp_threads[ gtid ];
1302
1303 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1304 if ( ! th -> th.th_team -> t.t_serialized ) {
1305// int cid;
1306 dispatch_private_info_template< UT > * pr =
1307 reinterpret_cast< dispatch_private_info_template< UT >* >
1308 ( th->th.th_dispatch->th_dispatch_pr_current );
1309 dispatch_shared_info_template< UT > volatile * sh =
1310 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1311 ( th->th.th_dispatch->th_dispatch_sh_current );
1312 KMP_DEBUG_ASSERT( pr );
1313 KMP_DEBUG_ASSERT( sh );
1314 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1315 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1316
1317// for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1318 UT lower = pr->u.p.ordered_lower;
1319 UT upper = pr->u.p.ordered_upper;
1320 UT inc = upper - lower + 1;
1321
1322 if ( pr->ordered_bumped == inc ) {
1323 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1324 gtid ) );
1325 pr->ordered_bumped = 0;
1326 } else {
1327 inc -= pr->ordered_bumped;
1328
1329 #ifdef KMP_DEBUG
1330 {
1331 const char * buff;
1332 // create format specifiers before the debug output
1333 buff = __kmp_str_format(
1334 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1335 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1336 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1337 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1338 __kmp_str_free( &buff );
1339 }
1340 #endif
1341
1342 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1343 USE_ITT_BUILD_ARG(NULL)
1344 );
1345
1346 KMP_MB(); /* is this necessary? */
1347 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1348 gtid ) );
1349 pr->ordered_bumped = 0;
1350//!!!!! TODO check if the inc should be unsigned, or signed???
1351 #ifdef KMP_DEBUG
1352 {
1353 const char * buff;
1354 // create format specifiers before the debug output
1355 buff = __kmp_str_format(
1356 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1357 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1358 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1359 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1360 __kmp_str_free( &buff );
1361 }
1362 #endif
1363
1364 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1365 }
1366// }
1367 }
1368 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1369}
1370
1371#endif /* KMP_GOMP_COMPAT */
1372
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001373/* Define a macro for exiting __kmp_dispatch_next(). If status is 0
1374 * (no more work), then tell OMPT the loop is over. In some cases
1375 * kmp_dispatch_fini() is not called. */
1376#if OMPT_SUPPORT && OMPT_TRACE
1377#define OMPT_LOOP_END \
1378 if (status == 0) { \
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001379 if (ompt_enabled && \
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001380 ompt_callbacks.ompt_callback(ompt_event_loop_end)) { \
1381 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1382 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); \
1383 ompt_callbacks.ompt_callback(ompt_event_loop_end)( \
1384 team_info->parallel_id, task_info->task_id); \
1385 } \
1386 }
1387#else
1388#define OMPT_LOOP_END // no-op
1389#endif
1390
Jim Cownie5e8470a2013-09-27 10:38:44 +00001391template< typename T >
1392static int
1393__kmp_dispatch_next(
1394 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1395) {
1396
1397 typedef typename traits_t< T >::unsigned_t UT;
1398 typedef typename traits_t< T >::signed_t ST;
1399 typedef typename traits_t< T >::floating_t DBL;
Jonathan Peyton2321d572015-06-08 19:25:25 +00001400#if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001401 static const int ___kmp_size_type = sizeof( UT );
Jonathan Peyton2321d572015-06-08 19:25:25 +00001402#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +00001403
Jonathan Peyton45be4502015-08-11 21:36:41 +00001404 // This is potentially slightly misleading: schedule(runtime) will appear here even if the actual runtime schedule
1405 // is static. (Which points out a disadvantage of schedule(runtime): even when static scheduling is used, it costs
1406 // more than a compile-time choice of static scheduling would.)
1407 KMP_TIME_BLOCK(FOR_dynamic_scheduling);
1408
Jim Cownie5e8470a2013-09-27 10:38:44 +00001409 int status;
1410 dispatch_private_info_template< T > * pr;
1411 kmp_info_t * th = __kmp_threads[ gtid ];
1412 kmp_team_t * team = th -> th.th_team;
1413
Andrey Churbanov9ad5c3a2015-07-13 17:52:41 +00001414 KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
Jim Cownie5e8470a2013-09-27 10:38:44 +00001415 #ifdef KMP_DEBUG
1416 {
1417 const char * buff;
1418 // create format specifiers before the debug output
1419 buff = __kmp_str_format(
1420 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1421 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1422 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1423 __kmp_str_free( &buff );
1424 }
1425 #endif
1426
1427 if ( team -> t.t_serialized ) {
 1428 /* NOTE: serialize this dispatch because we are not at the active level */
1429 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1430 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1431 KMP_DEBUG_ASSERT( pr );
1432
1433 if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1434 *p_lb = 0;
1435 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001436// if ( p_last != NULL )
1437// *p_last = 0;
1438 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001439 *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001440 if ( __kmp_env_consistency_check ) {
1441 if ( pr->pushed_ws != ct_none ) {
1442 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1443 }
1444 }
1445 } else if ( pr->nomerge ) {
1446 kmp_int32 last;
1447 T start;
1448 UT limit, trip, init;
1449 ST incr;
1450 T chunk = pr->u.p.parm1;
1451
1452 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1453
1454 init = chunk * pr->u.p.count++;
1455 trip = pr->u.p.tc - 1;
1456
1457 if ( (status = (init <= trip)) == 0 ) {
1458 *p_lb = 0;
1459 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001460// if ( p_last != NULL )
1461// *p_last = 0;
1462 if ( p_st != NULL )
1463 *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001464 if ( __kmp_env_consistency_check ) {
1465 if ( pr->pushed_ws != ct_none ) {
1466 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1467 }
1468 }
1469 } else {
1470 start = pr->u.p.lb;
1471 limit = chunk + init - 1;
1472 incr = pr->u.p.st;
1473
1474 if ( (last = (limit >= trip)) != 0 ) {
1475 limit = trip;
1476 #if KMP_OS_WINDOWS
1477 pr->u.p.last_upper = pr->u.p.ub;
1478 #endif /* KMP_OS_WINDOWS */
1479 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001480 if ( p_last != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001481 *p_last = last;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001482 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001483 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001484 if ( incr == 1 ) {
1485 *p_lb = start + init;
1486 *p_ub = start + limit;
1487 } else {
1488 *p_lb = start + init * incr;
1489 *p_ub = start + limit * incr;
1490 }
1491
1492 if ( pr->ordered ) {
1493 pr->u.p.ordered_lower = init;
1494 pr->u.p.ordered_upper = limit;
1495 #ifdef KMP_DEBUG
1496 {
1497 const char * buff;
1498 // create format specifiers before the debug output
1499 buff = __kmp_str_format(
1500 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1501 traits_t< UT >::spec, traits_t< UT >::spec );
1502 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1503 __kmp_str_free( &buff );
1504 }
1505 #endif
1506 } // if
1507 } // if
1508 } else {
1509 pr->u.p.tc = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001510 *p_lb = pr->u.p.lb;
1511 *p_ub = pr->u.p.ub;
1512 #if KMP_OS_WINDOWS
1513 pr->u.p.last_upper = *p_ub;
1514 #endif /* KMP_OS_WINDOWS */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001515 if ( p_last != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001516 *p_last = TRUE;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001517 if ( p_st != NULL )
1518 *p_st = pr->u.p.st;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001519 } // if
1520 #ifdef KMP_DEBUG
1521 {
1522 const char * buff;
1523 // create format specifiers before the debug output
1524 buff = __kmp_str_format(
1525 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001526 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
Jim Cownie5e8470a2013-09-27 10:38:44 +00001527 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001528 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
Jim Cownie5e8470a2013-09-27 10:38:44 +00001529 __kmp_str_free( &buff );
1530 }
1531 #endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001532#if INCLUDE_SSC_MARKS
1533 SSC_MARK_DISPATCH_NEXT();
1534#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001535 OMPT_LOOP_END;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001536 return status;
1537 } else {
1538 kmp_int32 last = 0;
1539 dispatch_shared_info_template< UT > *sh;
1540 T start;
1541 ST incr;
1542 UT limit, trip, init;
1543
1544 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1545 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1546
1547 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1548 ( th->th.th_dispatch->th_dispatch_pr_current );
1549 KMP_DEBUG_ASSERT( pr );
1550 sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1551 ( th->th.th_dispatch->th_dispatch_sh_current );
1552 KMP_DEBUG_ASSERT( sh );
1553
1554 if ( pr->u.p.tc == 0 ) {
1555 // zero trip count
1556 status = 0;
1557 } else {
1558 switch (pr->schedule) {
1559 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1560 case kmp_sch_static_steal:
1561 {
1562 T chunk = pr->u.p.parm1;
1563
1564 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1565
1566 trip = pr->u.p.tc - 1;
1567
1568 if ( ___kmp_size_type > 4 ) {
1569 // Other threads do not look into the data of this thread,
1570 // so it's not necessary to make volatile casting.
1571 init = ( pr->u.p.count )++;
1572 status = ( init < (UT)pr->u.p.ub );
1573 } else {
1574 typedef union {
1575 struct {
1576 UT count;
1577 T ub;
1578 } p;
1579 kmp_int64 b;
1580 } union_i4;
1581 // All operations on 'count' or 'ub' must be combined atomically together.
1582 // stealing implemented only for 4-byte indexes
1583 {
1584 union_i4 vold, vnew;
1585 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1586 vnew = vold;
1587 vnew.p.count++;
1588 while( ! KMP_COMPARE_AND_STORE_ACQ64(
1589 ( volatile kmp_int64* )&pr->u.p.count,
1590 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1591 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1592 KMP_CPU_PAUSE();
1593 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1594 vnew = vold;
1595 vnew.p.count++;
1596 }
1597 vnew = vold;
1598 init = vnew.p.count;
1599 status = ( init < (UT)vnew.p.ub ) ;
1600 }
1601
1602 if( !status ) {
1603 kmp_info_t **other_threads = team->t.t_threads;
1604 int while_limit = 10;
1605 int while_index = 0;
1606
1607 // TODO: algorithm of searching for a victim
1608 // should be cleaned up and measured
1609 while ( ( !status ) && ( while_limit != ++while_index ) ) {
1610 union_i4 vold, vnew;
1611 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1612 T victimIdx = pr->u.p.parm4;
1613 T oldVictimIdx = victimIdx;
1614 dispatch_private_info_template< T > * victim;
1615
1616 do {
1617 if( !victimIdx ) {
1618 victimIdx = team->t.t_nproc - 1;
1619 } else {
1620 --victimIdx;
1621 }
1622 victim = reinterpret_cast< dispatch_private_info_template< T >* >
1623 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1624 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1625 // TODO: think about a proper place of this test
1626 if ( ( !victim ) ||
1627 ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1628 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1629 // TODO: delay would be nice
1630 continue;
1631 // the victim is not ready yet to participate in stealing
1632 // because the victim is still in kmp_init_dispatch
1633 }
1634 if ( oldVictimIdx == victimIdx ) {
1635 break;
1636 }
1637 pr->u.p.parm4 = victimIdx;
1638
1639 while( 1 ) {
1640 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1641 vnew = vold;
1642
1643 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1644 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1645 break;
1646 }
1647 vnew.p.ub -= (remaining >> 2);
1648 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1649 #pragma warning( push )
1650 // disable warning on pointless comparison of unsigned with 0
1651 #pragma warning( disable: 186 )
1652 KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1653 #pragma warning( pop )
1654 // TODO: Should this be acquire or release?
1655 if ( KMP_COMPARE_AND_STORE_ACQ64(
1656 ( volatile kmp_int64 * )&victim->u.p.count,
1657 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1658 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1659 status = 1;
1660 while_index = 0;
1661 // now update own count and ub
1662 #if KMP_ARCH_X86
1663 // stealing executed on non-KMP_ARCH_X86 only
1664 // Atomic 64-bit write on ia32 is
1665 // unavailable, so we do this in steps.
1666 // This code is not tested.
1667 init = vold.p.count;
1668 pr->u.p.ub = 0;
1669 pr->u.p.count = init + 1;
1670 pr->u.p.ub = vnew.p.count;
1671 #else
1672 init = vnew.p.ub;
1673 vold.p.count = init + 1;
1674 // TODO: is it safe and enough?
1675 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1676 #endif // KMP_ARCH_X86
1677 break;
1678 } // if
1679 KMP_CPU_PAUSE();
1680 } // while (1)
1681 } // while
1682 } // if
1683 } // if
1684 if ( !status ) {
1685 *p_lb = 0;
1686 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001687 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001688 } else {
1689 start = pr->u.p.parm2;
1690 init *= chunk;
1691 limit = chunk + init - 1;
1692 incr = pr->u.p.st;
1693
1694 KMP_DEBUG_ASSERT(init <= trip);
1695 if ( (last = (limit >= trip)) != 0 )
1696 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001697 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001698
1699 if ( incr == 1 ) {
1700 *p_lb = start + init;
1701 *p_ub = start + limit;
1702 } else {
1703 *p_lb = start + init * incr;
1704 *p_ub = start + limit * incr;
1705 }
1706
1707 if ( pr->ordered ) {
1708 pr->u.p.ordered_lower = init;
1709 pr->u.p.ordered_upper = limit;
1710 #ifdef KMP_DEBUG
1711 {
1712 const char * buff;
1713 // create format specifiers before the debug output
1714 buff = __kmp_str_format(
1715 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1716 traits_t< UT >::spec, traits_t< UT >::spec );
1717 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1718 __kmp_str_free( &buff );
1719 }
1720 #endif
1721 } // if
1722 } // if
1723 break;
1724 } // case
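                /*
                   Worked example of the steal above (a sketch of the logic as written,
                   not measured behavior): suppose the victim's descriptor holds
                   count=10 and ub=50, i.e. 40 chunk indices remain.  Then
                   remaining = 40 and the CAS trims the victim's ub by
                   remaining >> 2 = 10, down to 40.  The victim keeps chunk indices
                   10..39, while the thief starts at init = 40 and stores count = 41,
                   ub = 50 into its own descriptor, so it works on chunk indices 40..49.
                */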
1725 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1726 case kmp_sch_static_balanced:
1727 {
1728 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1729 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1730 pr->u.p.count = 1;
1731 *p_lb = pr->u.p.lb;
1732 *p_ub = pr->u.p.ub;
1733 last = pr->u.p.parm1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001734 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001735 *p_st = pr->u.p.st;
1736 } else { /* no iterations to do */
1737 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1738 }
1739 if ( pr->ordered ) {
1740 #ifdef KMP_DEBUG
1741 {
1742 const char * buff;
1743 // create format specifiers before the debug output
1744 buff = __kmp_str_format(
1745 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1746 traits_t< UT >::spec, traits_t< UT >::spec );
1747 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1748 __kmp_str_free( &buff );
1749 }
1750 #endif
1751 } // if
1752 } // case
1753 break;
1754 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1755 case kmp_sch_static_chunked:
1756 {
1757 T parm1;
1758
1759 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1760 gtid ) );
1761 parm1 = pr->u.p.parm1;
1762
1763 trip = pr->u.p.tc - 1;
1764 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1765
1766 if ( (status = (init <= trip)) != 0 ) {
1767 start = pr->u.p.lb;
1768 incr = pr->u.p.st;
1769 limit = parm1 + init - 1;
1770
1771 if ( (last = (limit >= trip)) != 0 )
1772 limit = trip;
1773
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001774 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001775
1776 pr->u.p.count += team->t.t_nproc;
1777
1778 if ( incr == 1 ) {
1779 *p_lb = start + init;
1780 *p_ub = start + limit;
1781 }
1782 else {
1783 *p_lb = start + init * incr;
1784 *p_ub = start + limit * incr;
1785 }
1786
1787 if ( pr->ordered ) {
1788 pr->u.p.ordered_lower = init;
1789 pr->u.p.ordered_upper = limit;
1790 #ifdef KMP_DEBUG
1791 {
1792 const char * buff;
1793 // create format specifiers before the debug output
1794 buff = __kmp_str_format(
1795 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1796 traits_t< UT >::spec, traits_t< UT >::spec );
1797 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1798 __kmp_str_free( &buff );
1799 }
1800 #endif
1801 } // if
1802 } // if
1803 } // case
1804 break;
1805
1806 case kmp_sch_dynamic_chunked:
1807 {
1808 T chunk = pr->u.p.parm1;
1809
1810 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1811 gtid ) );
1812
1813 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1814 trip = pr->u.p.tc - 1;
1815
1816 if ( (status = (init <= trip)) == 0 ) {
1817 *p_lb = 0;
1818 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001819 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001820 } else {
1821 start = pr->u.p.lb;
1822 limit = chunk + init - 1;
1823 incr = pr->u.p.st;
1824
1825 if ( (last = (limit >= trip)) != 0 )
1826 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001827
1828 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001829
1830 if ( incr == 1 ) {
1831 *p_lb = start + init;
1832 *p_ub = start + limit;
1833 } else {
1834 *p_lb = start + init * incr;
1835 *p_ub = start + limit * incr;
1836 }
1837
1838 if ( pr->ordered ) {
1839 pr->u.p.ordered_lower = init;
1840 pr->u.p.ordered_upper = limit;
1841 #ifdef KMP_DEBUG
1842 {
1843 const char * buff;
1844 // create format specifiers before the debug output
1845 buff = __kmp_str_format(
1846 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1847 traits_t< UT >::spec, traits_t< UT >::spec );
1848 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1849 __kmp_str_free( &buff );
1850 }
1851 #endif
1852 } // if
1853 } // if
1854 } // case
1855 break;
1856
1857 case kmp_sch_guided_iterative_chunked:
1858 {
1859 T chunkspec = pr->u.p.parm1;
1860 KD_TRACE(100,
1861 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1862 trip = pr->u.p.tc;
1863 // Start atomic part of calculations
1864 while(1) {
1865 ST remaining; // signed, because can be < 0
1866 init = sh->u.s.iteration; // shared value
1867 remaining = trip - init;
1868 if ( remaining <= 0 ) { // AC: need to compare with 0 first
1869 // nothing to do, don't try atomic op
1870 status = 0;
1871 break;
1872 }
1873 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1874                             // use dynamic-style schedule
1875                             // atomically increment iterations, get old value
1876 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1877 remaining = trip - init;
1878 if (remaining <= 0) {
1879 status = 0; // all iterations got by other threads
1880 } else {
1881 // got some iterations to work on
1882 status = 1;
1883 if ( (T)remaining > chunkspec ) {
1884 limit = init + chunkspec - 1;
1885 } else {
1886 last = 1; // the last chunk
1887 limit = init + remaining - 1;
1888 } // if
1889 } // if
1890 break;
1891 } // if
1892 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1893 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1894 // CAS was successful, chunk obtained
1895 status = 1;
1896 --limit;
1897 break;
1898 } // if
1899 } // while
1900 if ( status != 0 ) {
1901 start = pr->u.p.lb;
1902 incr = pr->u.p.st;
1903 if ( p_st != NULL )
1904 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001905 *p_lb = start + init * incr;
1906 *p_ub = start + limit * incr;
1907 if ( pr->ordered ) {
1908 pr->u.p.ordered_lower = init;
1909 pr->u.p.ordered_upper = limit;
1910 #ifdef KMP_DEBUG
1911 {
1912 const char * buff;
1913 // create format specifiers before the debug output
1914 buff = __kmp_str_format(
1915 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1916 traits_t< UT >::spec, traits_t< UT >::spec );
1917 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1918 __kmp_str_free( &buff );
1919 }
1920 #endif
1921 } // if
1922 } else {
1923 *p_lb = 0;
1924 *p_ub = 0;
1925 if ( p_st != NULL )
1926 *p_st = 0;
1927 } // if
1928 } // case
1929 break;
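            /*
               Rough arithmetic sketch of the chunks produced above, assuming the
               defaults noted in the comments (K = 2, so parm3 ~ 1/(K*nproc)):
               with trip = 1000 and nproc = 4 the first thread to grab work takes
               about 1000/8 = 125 iterations, the next about (1000-125)/8 ~ 109,
               and so on, until fewer than parm2 iterations remain and the code
               falls back to plain dynamic chunks of size chunkspec.
            */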
1930
1931 case kmp_sch_guided_analytical_chunked:
1932 {
1933 T chunkspec = pr->u.p.parm1;
1934 UT chunkIdx;
1935 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1936 /* for storing original FPCW value for Windows* OS on
1937 IA-32 architecture 8-byte version */
1938 unsigned int oldFpcw;
Jim Cownie181b4bb2013-12-23 17:28:57 +00001939 unsigned int fpcwSet = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001940 #endif
1941 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1942 gtid ) );
1943
1944 trip = pr->u.p.tc;
1945
1946 KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1947 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1948
1949 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1950 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1951 if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1952 --trip;
1953 /* use dynamic-style scheduling */
1954 init = chunkIdx * chunkspec + pr->u.p.count;
1955 /* need to verify init > 0 in case of overflow in the above calculation */
1956 if ( (status = (init > 0 && init <= trip)) != 0 ) {
1957 limit = init + chunkspec -1;
1958
1959 if ( (last = (limit >= trip)) != 0 )
1960 limit = trip;
1961 }
1962 break;
1963 } else {
1964 /* use exponential-style scheduling */
1965                     /* The following check is to work around the lack of long double precision on Windows* OS.
1966 This check works around the possible effect that init != 0 for chunkIdx == 0.
1967 */
1968 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1969 /* If we haven't already done so, save original
1970 FPCW and set precision to 64-bit, as Windows* OS
1971 on IA-32 architecture defaults to 53-bit */
1972 if ( !fpcwSet ) {
Jim Cownie181b4bb2013-12-23 17:28:57 +00001973 oldFpcw = _control87(0,0);
1974 _control87(_PC_64,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001975 fpcwSet = 0x30000;
1976 }
1977 #endif
1978 if ( chunkIdx ) {
1979 init = __kmp_dispatch_guided_remaining< T >(
1980 trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1981 KMP_DEBUG_ASSERT(init);
1982 init = trip - init;
1983 } else
1984 init = 0;
1985 limit = trip - __kmp_dispatch_guided_remaining< T >(
1986 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1987 KMP_ASSERT(init <= limit);
1988 if ( init < limit ) {
1989 KMP_DEBUG_ASSERT(limit <= trip);
1990 --limit;
1991 status = 1;
1992 break;
1993 } // if
1994 } // if
1995 } // while (1)
1996 #if KMP_OS_WINDOWS && KMP_ARCH_X86
Jim Cownie181b4bb2013-12-23 17:28:57 +00001997 /* restore FPCW if necessary
1998 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1999 */
2000 if ( fpcwSet && ( oldFpcw & fpcwSet ) )
2001 _control87(oldFpcw,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002002 #endif
2003 if ( status != 0 ) {
2004 start = pr->u.p.lb;
2005 incr = pr->u.p.st;
2006 if ( p_st != NULL )
2007 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002008 *p_lb = start + init * incr;
2009 *p_ub = start + limit * incr;
2010 if ( pr->ordered ) {
2011 pr->u.p.ordered_lower = init;
2012 pr->u.p.ordered_upper = limit;
2013 #ifdef KMP_DEBUG
2014 {
2015 const char * buff;
2016 // create format specifiers before the debug output
2017 buff = __kmp_str_format(
2018 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2019 traits_t< UT >::spec, traits_t< UT >::spec );
2020 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2021 __kmp_str_free( &buff );
2022 }
2023 #endif
2024 }
2025 } else {
2026 *p_lb = 0;
2027 *p_ub = 0;
2028 if ( p_st != NULL )
2029 *p_st = 0;
2030 }
2031 } // case
2032 break;
2033
2034 case kmp_sch_trapezoidal:
2035 {
2036 UT index;
2037 T parm2 = pr->u.p.parm2;
2038 T parm3 = pr->u.p.parm3;
2039 T parm4 = pr->u.p.parm4;
2040 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2041 gtid ) );
2042
2043 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2044
2045 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2046 trip = pr->u.p.tc - 1;
2047
2048 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2049 *p_lb = 0;
2050 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002051 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002052 } else {
2053 start = pr->u.p.lb;
2054 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2055 incr = pr->u.p.st;
2056
2057 if ( (last = (limit >= trip)) != 0 )
2058 limit = trip;
2059
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002060 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002061
2062 if ( incr == 1 ) {
2063 *p_lb = start + init;
2064 *p_ub = start + limit;
2065 } else {
2066 *p_lb = start + init * incr;
2067 *p_ub = start + limit * incr;
2068 }
2069
2070 if ( pr->ordered ) {
2071 pr->u.p.ordered_lower = init;
2072 pr->u.p.ordered_upper = limit;
2073 #ifdef KMP_DEBUG
2074 {
2075 const char * buff;
2076 // create format specifiers before the debug output
2077 buff = __kmp_str_format(
2078 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2079 traits_t< UT >::spec, traits_t< UT >::spec );
2080 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2081 __kmp_str_free( &buff );
2082 }
2083 #endif
2084 } // if
2085 } // if
2086 } // case
2087 break;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002088 default:
2089 {
2090 status = 0; // to avoid complaints on uninitialized variable use
2091 __kmp_msg(
2092 kmp_ms_fatal, // Severity
2093 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2094 KMP_HNT( GetNewerLibrary ), // Hint
2095 __kmp_msg_null // Variadic argument list terminator
2096 );
2097 }
2098 break;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002099 } // switch
2100 } // if tc == 0;
2101
2102 if ( status == 0 ) {
2103 UT num_done;
2104
2105 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2106 #ifdef KMP_DEBUG
2107 {
2108 const char * buff;
2109 // create format specifiers before the debug output
2110 buff = __kmp_str_format(
2111 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2112 traits_t< UT >::spec );
2113 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2114 __kmp_str_free( &buff );
2115 }
2116 #endif
2117
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002118 if ( (ST)num_done == team->t.t_nproc-1 ) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00002119 /* NOTE: release this buffer to be reused */
2120
2121 KMP_MB(); /* Flush all pending memory write invalidates. */
2122
2123 sh->u.s.num_done = 0;
2124 sh->u.s.iteration = 0;
2125
2126 /* TODO replace with general release procedure? */
2127 if ( pr->ordered ) {
2128 sh->u.s.ordered_iteration = 0;
2129 }
2130
2131 KMP_MB(); /* Flush all pending memory write invalidates. */
2132
2133 sh -> buffer_index += KMP_MAX_DISP_BUF;
2134 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2135 gtid, sh->buffer_index) );
2136
2137 KMP_MB(); /* Flush all pending memory write invalidates. */
2138
2139 } // if
2140 if ( __kmp_env_consistency_check ) {
2141 if ( pr->pushed_ws != ct_none ) {
2142 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2143 }
2144 }
2145
2146 th -> th.th_dispatch -> th_deo_fcn = NULL;
2147 th -> th.th_dispatch -> th_dxo_fcn = NULL;
2148 th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2149 th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2150 } // if (status == 0)
2151#if KMP_OS_WINDOWS
2152 else if ( last ) {
2153 pr->u.p.last_upper = pr->u.p.ub;
2154 }
2155#endif /* KMP_OS_WINDOWS */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002156 if ( p_last != NULL && status != 0 )
2157 *p_last = last;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002158 } // if
2159
2160 #ifdef KMP_DEBUG
2161 {
2162 const char * buff;
2163 // create format specifiers before the debug output
2164 buff = __kmp_str_format(
2165 "__kmp_dispatch_next: T#%%d normal case: " \
2166 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2167 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2168 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2169 __kmp_str_free( &buff );
2170 }
2171 #endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002172#if INCLUDE_SSC_MARKS
2173 SSC_MARK_DISPATCH_NEXT();
2174#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00002175 OMPT_LOOP_END;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002176 return status;
2177}
2178
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002179template< typename T >
2180static void
2181__kmp_dist_get_bounds(
2182 ident_t *loc,
2183 kmp_int32 gtid,
2184 kmp_int32 *plastiter,
2185 T *plower,
2186 T *pupper,
2187 typename traits_t< T >::signed_t incr
2188) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002189 typedef typename traits_t< T >::unsigned_t UT;
2190 typedef typename traits_t< T >::signed_t ST;
2191 register kmp_uint32 team_id;
2192 register kmp_uint32 nteams;
2193 register UT trip_count;
2194 register kmp_team_t *team;
2195 kmp_info_t * th;
2196
2197 KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2198 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2199 #ifdef KMP_DEBUG
2200 {
2201 const char * buff;
2202 // create format specifiers before the debug output
2203 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2204 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2205 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2206 traits_t< T >::spec );
2207 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2208 __kmp_str_free( &buff );
2209 }
2210 #endif
2211
2212 if( __kmp_env_consistency_check ) {
2213 if( incr == 0 ) {
2214 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2215 }
2216 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2217 // The loop is illegal.
2218                     // Some zero-trip loops are maintained by the compiler, e.g.:
2219 // for(i=10;i<0;++i) // lower >= upper - run-time check
2220 // for(i=0;i>10;--i) // lower <= upper - run-time check
2221 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2222 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2223                     // The compiler does not check the following illegal loops:
2224 // for(i=0;i<10;i+=incr) // where incr<0
2225 // for(i=10;i>0;i-=incr) // where incr<0
2226 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2227 }
2228 }
2229 th = __kmp_threads[gtid];
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002230 team = th->th.th_team;
2231 #if OMP_40_ENABLED
Jonathan Peyton441f3372015-09-21 17:24:46 +00002232 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002233 nteams = th->th.th_teams_size.nteams;
2234 #endif
2235 team_id = team->t.t_master_tid;
2236 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2237
2238 // compute global trip count
2239 if( incr == 1 ) {
2240 trip_count = *pupper - *plower + 1;
2241 } else if(incr == -1) {
2242 trip_count = *plower - *pupper + 1;
2243 } else {
2244 trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2245 }
Jonathan Peyton45be4502015-08-11 21:36:41 +00002246
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002247 if( trip_count <= nteams ) {
2248 KMP_DEBUG_ASSERT(
2249 __kmp_static == kmp_sch_static_greedy || \
2250 __kmp_static == kmp_sch_static_balanced
2251 ); // Unknown static scheduling type.
2252        // only some teams get a single iteration, others get nothing
2253 if( team_id < trip_count ) {
2254 *pupper = *plower = *plower + team_id * incr;
2255 } else {
2256 *plower = *pupper + incr; // zero-trip loop
2257 }
2258 if( plastiter != NULL )
2259 *plastiter = ( team_id == trip_count - 1 );
2260 } else {
2261 if( __kmp_static == kmp_sch_static_balanced ) {
2262 register UT chunk = trip_count / nteams;
2263 register UT extras = trip_count % nteams;
2264 *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2265 *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2266 if( plastiter != NULL )
2267 *plastiter = ( team_id == nteams - 1 );
2268 } else {
2269 register T chunk_inc_count =
2270 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2271 register T upper = *pupper;
2272 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2273 // Unknown static scheduling type.
2274 *plower += team_id * chunk_inc_count;
2275 *pupper = *plower + chunk_inc_count - incr;
2276 // Check/correct bounds if needed
2277 if( incr > 0 ) {
2278 if( *pupper < *plower )
2279 *pupper = i_maxmin< T >::mx;
2280 if( plastiter != NULL )
2281 *plastiter = *plower <= upper && *pupper > upper - incr;
2282 if( *pupper > upper )
2283 *pupper = upper; // tracker C73258
2284 } else {
2285 if( *pupper > *plower )
2286 *pupper = i_maxmin< T >::mn;
2287 if( plastiter != NULL )
2288 *plastiter = *plower >= upper && *pupper < upper - incr;
2289 if( *pupper < upper )
2290 *pupper = upper; // tracker C73258
2291 }
2292 }
2293 }
2294}
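/*
   Worked example of the static_balanced split above (a sketch, not taken from
   the sources): with incr = 1, *plower = 0, *pupper = 9 the trip_count is 10;
   for nteams = 4 this gives chunk = 2 and extras = 2, so teams 0..3 receive
   the bounds [0,2], [3,5], [6,7], [8,9] -- the first 'extras' teams get one
   extra iteration and only the last team sets *plastiter.
*/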
2295
Jim Cownie5e8470a2013-09-27 10:38:44 +00002296//-----------------------------------------------------------------------------------------
2297// Dispatch routines
2298// Transfer call to template< type T >
2299// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2300// T lb, T ub, ST st, ST chunk )
2301extern "C" {
2302
2303/*!
2304@ingroup WORK_SHARING
2305@{
2306@param loc Source location
2307@param gtid Global thread id
2308@param schedule Schedule type
2309@param lb Lower bound
2310@param ub Upper bound
2311@param st Step (or increment if you prefer)
2312@param chunk The chunk size to block with
2313
2314This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2315These functions are all identical apart from the types of the arguments.
2316*/
2317
2318void
2319__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2320 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2321{
2322 KMP_DEBUG_ASSERT( __kmp_init_serial );
2323 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2324}
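/*
   A minimal sketch (not part of the runtime) of the call a compiler might emit
   for
       #pragma omp for schedule(dynamic, 4)
       for ( int i = 0; i < 100; ++i ) ...
   assuming the inclusive bounds used throughout this file:

       __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked,
                               0, 99, 1, 4 );   // lb=0, ub=99, st=1, chunk=4
*/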
2325/*!
2326See @ref __kmpc_dispatch_init_4
2327*/
2328void
2329__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2330 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2331{
2332 KMP_DEBUG_ASSERT( __kmp_init_serial );
2333 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2334}
2335
2336/*!
2337See @ref __kmpc_dispatch_init_4
2338*/
2339void
2340__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2341 kmp_int64 lb, kmp_int64 ub,
2342 kmp_int64 st, kmp_int64 chunk )
2343{
2344 KMP_DEBUG_ASSERT( __kmp_init_serial );
2345 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2346}
2347
2348/*!
2349See @ref __kmpc_dispatch_init_4
2350*/
2351void
2352__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2353 kmp_uint64 lb, kmp_uint64 ub,
2354 kmp_int64 st, kmp_int64 chunk )
2355{
2356 KMP_DEBUG_ASSERT( __kmp_init_serial );
2357 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2358}
2359
2360/*!
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002361See @ref __kmpc_dispatch_init_4
2362
2363These functions differ from the __kmpc_dispatch_init set in that they are
2364called for the composite distribute parallel for construct, so the per-team
2365iteration space must be computed before regular iteration dispatching starts.
2366
2367These functions are all identical apart from the types of the arguments.
2368*/
2369void
2370__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2371 kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2372{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002373 KMP_DEBUG_ASSERT( __kmp_init_serial );
2374 __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2375 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2376}
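/*
   A minimal sketch (not taken from any compiler) of how this entry point might
   be used when lowering the composite construct
       #pragma omp distribute parallel for schedule(dynamic, 4)
   for a 32-bit signed induction variable:

       kmp_int32 last = 0;
       __kmpc_dist_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked,
                                    &last, 0, 999, 1, 4 );

   after which chunks are retrieved with __kmpc_dispatch_next_4 exactly as for
   the non-dist variants.
*/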
2377
2378void
2379__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2380 kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2381{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002382 KMP_DEBUG_ASSERT( __kmp_init_serial );
2383 __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2384 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2385}
2386
2387void
2388__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2389 kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2390{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002391 KMP_DEBUG_ASSERT( __kmp_init_serial );
2392 __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2393 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2394}
2395
2396void
2397__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2398 kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2399{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002400 KMP_DEBUG_ASSERT( __kmp_init_serial );
2401 __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2402 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2403}
2404
2405/*!
Jim Cownie5e8470a2013-09-27 10:38:44 +00002406@param loc Source code location
2407@param gtid Global thread id
2408@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2409@param p_lb Pointer to the lower bound for the next chunk of work
2410@param p_ub Pointer to the upper bound for the next chunk of work
2411@param p_st Pointer to the stride for the next chunk of work
2412@return one if there is work to be done, zero otherwise
2413
2414Get the next dynamically allocated chunk of work for this thread.
2415If there is no more work, then the lb, ub and stride need not be modified.
2416*/
2417int
2418__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2419 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2420{
2421 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2422}
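#if 0
/* A minimal sketch (never compiled) of the dispatch protocol the entry points
   above implement together: initialize once, then pull chunks until
   __kmpc_dispatch_next_4 returns zero.  "loc", "gtid" and "body" are
   hypothetical names assumed to be supplied by the caller. */
static void example_dynamic_loop( ident_t *loc, kmp_int32 gtid, kmp_int32 n )
{
    kmp_int32 lb, ub, st, last;
    __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, 4 );
    while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
        for ( kmp_int32 i = lb; i <= ub; i += st )
            body( i );          // hypothetical loop body
    }
    // once next() returns 0 the runtime has already reset this thread's
    // dispatch buffers (see __kmp_dispatch_next above)
}
#endif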
2423
2424/*!
2425See @ref __kmpc_dispatch_next_4
2426*/
2427int
2428__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2429 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2430{
2431 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2432}
2433
2434/*!
2435See @ref __kmpc_dispatch_next_4
2436*/
2437int
2438__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2439 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2440{
2441 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2442}
2443
2444/*!
2445See @ref __kmpc_dispatch_next_4
2446*/
2447int
2448__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2449 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2450{
2451 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2452}
2453
2454/*!
2455@param loc Source code location
2456@param gtid Global thread id
2457
2458Mark the end of a dynamic loop.
2459*/
2460void
2461__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2462{
2463 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2464}
2465
2466/*!
2467See @ref __kmpc_dispatch_fini_4
2468*/
2469void
2470__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2471{
2472 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2473}
2474
2475/*!
2476See @ref __kmpc_dispatch_fini_4
2477*/
2478void
2479__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2480{
2481 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2482}
2483
2484/*!
2485See @ref __kmpc_dispatch_fini_4
2486*/
2487void
2488__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2489{
2490 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2491}
2492/*! @} */
2493
2494//-----------------------------------------------------------------------------------------
2495// Non-template routines from kmp_dispatch.cpp used in other sources
2496
2497kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2498 return value == checker;
2499}
2500
2501kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2502 return value != checker;
2503}
2504
2505kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2506 return value < checker;
2507}
2508
2509kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2510 return value >= checker;
2511}
2512
2513kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2514 return value <= checker;
2515}
2516kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2517 return value == checker;
2518}
2519
2520kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2521 return value != checker;
2522}
2523
2524kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2525 return value < checker;
2526}
2527
2528kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2529 return value >= checker;
2530}
2531
2532kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2533 return value <= checker;
2534}
2535
2536kmp_uint32
2537__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2538 kmp_uint32 checker,
2539 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2540 , void * obj // Higher-level synchronization object, or NULL.
2541 )
2542{
2543 // note: we may not belong to a team at this point
2544 register volatile kmp_uint32 * spin = spinner;
2545 register kmp_uint32 check = checker;
2546 register kmp_uint32 spins;
2547 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2548 register kmp_uint32 r;
2549
2550 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2551 KMP_INIT_YIELD( spins );
2552 // main wait spin loop
2553 while(!f(r = TCR_4(*spin), check)) {
2554 KMP_FSYNC_SPIN_PREPARE( obj );
2555 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2556 It causes problems with infinite recursion because of exit lock */
2557 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2558 __kmp_abort_thread(); */
2559
Jim Cownie5e8470a2013-09-27 10:38:44 +00002560 /* if we have waited a bit, or are oversubscribed, yield */
2561 /* pause is in the following code */
2562 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2563 KMP_YIELD_SPIN( spins );
2564 }
2565 KMP_FSYNC_SPIN_ACQUIRED( obj );
2566 return r;
2567}
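/*
   A minimal usage sketch (hypothetical names): spin, yielding when
   oversubscribed, until another thread sets "flag" to 1.

       volatile kmp_uint32 flag = 0;
       ...
       (void) __kmp_wait_yield_4( &flag, 1, __kmp_eq_4, NULL );
*/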
2568
2569kmp_uint64
2570__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2571 kmp_uint64 checker,
2572 kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2573 , void * obj // Higher-level synchronization object, or NULL.
2574 )
2575{
2576 // note: we may not belong to a team at this point
2577 register volatile kmp_uint64 * spin = spinner;
2578 register kmp_uint64 check = checker;
2579 register kmp_uint32 spins;
2580 register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2581 register kmp_uint64 r;
2582
2583 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2584 KMP_INIT_YIELD( spins );
2585 // main wait spin loop
2586 while(!f(r = *spin, check))
2587 {
2588 KMP_FSYNC_SPIN_PREPARE( obj );
2589 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2590 It causes problems with infinite recursion because of exit lock */
2591 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2592 __kmp_abort_thread(); */
2593
Jim Cownie5e8470a2013-09-27 10:38:44 +00002594 // if we are oversubscribed,
2595     // or have waited a bit (and KMP_LIBRARY=throughput), then yield
2596 // pause is in the following code
2597 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2598 KMP_YIELD_SPIN( spins );
2599 }
2600 KMP_FSYNC_SPIN_ACQUIRED( obj );
2601 return r;
2602}
2603
2604} // extern "C"
2605
2606#ifdef KMP_GOMP_COMPAT
2607
2608void
2609__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2610 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2611 kmp_int32 chunk, int push_ws )
2612{
2613 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2614 push_ws );
2615}
2616
2617void
2618__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2619 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2620 kmp_int32 chunk, int push_ws )
2621{
2622 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2623 push_ws );
2624}
2625
2626void
2627__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2628 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2629 kmp_int64 chunk, int push_ws )
2630{
2631 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2632 push_ws );
2633}
2634
2635void
2636__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2637 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2638 kmp_int64 chunk, int push_ws )
2639{
2640 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2641 push_ws );
2642}
2643
2644void
2645__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2646{
2647 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2648}
2649
2650void
2651__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2652{
2653 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2654}
2655
2656void
2657__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2658{
2659 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2660}
2661
2662void
2663__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2664{
2665 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2666}
2667
2668#endif /* KMP_GOMP_COMPAT */
2669
2670/* ------------------------------------------------------------------------ */
2671/* ------------------------------------------------------------------------ */
2672