/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */


//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


/*
 * Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 *
 */

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_str.h"
#include "kmp_error.h"
#include "kmp_stats.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
    #include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

// template for type limits
template< typename T >
struct i_maxmin {
    static const T mx;
    static const T mn;
};
template<>
struct i_maxmin< int > {
    static const int mx = 0x7fffffff;
    static const int mn = 0x80000000;
};
template<>
struct i_maxmin< unsigned int > {
    static const unsigned int mx = 0xffffffff;
    static const unsigned int mn = 0x00000000;
};
template<>
struct i_maxmin< long long > {
    static const long long mx = 0x7fffffffffffffffLL;
    static const long long mn = 0x8000000000000000LL;
};
template<>
struct i_maxmin< unsigned long long > {
    static const unsigned long long mx = 0xffffffffffffffffLL;
    static const unsigned long long mn = 0x0000000000000000LL;
};
//-------------------------------------------------------------------------

#ifdef KMP_STATIC_STEAL_ENABLED

    // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
    template< typename T >
    struct dispatch_private_infoXX_template {
        typedef typename traits_t< T >::unsigned_t UT;
        typedef typename traits_t< T >::signed_t   ST;
        UT count;   // unsigned
        T  ub;
        /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
        T  lb;
        ST st;      // signed
        UT tc;      // unsigned
        T  static_steal_counter; // for static_steal only; maybe better to put after ub

        /* parm[1-4] are used in different ways by different scheduling algorithms */

        // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
        //    a) parm3 is properly aligned and
        //    b) all parm1-4 are in the same cache line.
        // Because parm1-4 are used together, performance seems to be better
        // if they are in the same line (not measured though).

        struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
            T parm1;
            T parm2;
            T parm3;
            T parm4;
        };

        UT ordered_lower; // unsigned
        UT ordered_upper; // unsigned
        #if KMP_OS_WINDOWS
        T  last_upper;
        #endif /* KMP_OS_WINDOWS */
    };

#else /* KMP_STATIC_STEAL_ENABLED */

    // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
    template< typename T >
    struct dispatch_private_infoXX_template {
        typedef typename traits_t< T >::unsigned_t UT;
        typedef typename traits_t< T >::signed_t   ST;
        T  lb;
        T  ub;
        ST st;      // signed
        UT tc;      // unsigned

        T  parm1;
        T  parm2;
        T  parm3;
        T  parm4;

        UT count;   // unsigned

        UT ordered_lower; // unsigned
        UT ordered_upper; // unsigned
        #if KMP_OS_WINDOWS
        T  last_upper;
        #endif /* KMP_OS_WINDOWS */
    };

#endif /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info structure and dispatch_private_info_t type
template< typename T >
struct KMP_ALIGN_CACHE dispatch_private_info_template {
    // duplicate alignment here, otherwise size of structure is not correct in our compiler
    union KMP_ALIGN_CACHE private_info_tmpl {
        dispatch_private_infoXX_template< T > p;
        dispatch_private_info64_t             p64;
    } u;
    enum sched_type schedule;   /* scheduling algorithm */
    kmp_uint32      ordered;    /* ordered clause specified */
    kmp_uint32      ordered_bumped;
    kmp_int32       ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
    dispatch_private_info * next; /* stack of buffers for nest of serial regions */
    kmp_uint32      nomerge;    /* don't merge iters if serialized */
    kmp_uint32      type_size;
    enum cons_type  pushed_ws;
};


// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
template< typename UT >
struct dispatch_shared_infoXX_template {
    /* chunk index under dynamic, number of idle threads under static-steal;
       iteration index otherwise */
    volatile UT iteration;
    volatile UT num_done;
    volatile UT ordered_iteration;
    UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template< typename UT >
struct dispatch_shared_info_template {
    // we need union here to keep the structure size
    union shared_info_tmpl {
        dispatch_shared_infoXX_template< UT > s;
        dispatch_shared_info64_t              s64;
    } u;
    volatile kmp_uint32 buffer_index;
};

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_ADD32( p, d );
    return r;
}

template<>
__forceinline kmp_int64
test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_ADD64( p, d );
    return r;
}

// test_then_inc_acq template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC_ACQ32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC_ACQ64( p );
    return r;
}

// test_then_inc template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_inc( volatile T *p ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC64( p );
    return r;
}

// compare_and_swap template (general template should NOT be used)
template< typename T >
static __forceinline kmp_int32
compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
{
    return KMP_COMPARE_AND_STORE_REL32( p, c, s );
}

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
{
    return KMP_COMPARE_AND_STORE_REL64( p, c, s );
}

/*
    Spin wait loop that first does pause, then yield.
    Waits until function returns non-zero when called with *spinner and check.
    Does NOT put threads to sleep.
#if USE_ITT_BUILD
    Arguments:
        obj -- is a higher-level synchronization object to report to ittnotify.  It is used to report
            locks consistently.  For example, if the lock is acquired immediately, its address is
            reported to ittnotify via KMP_FSYNC_ACQUIRED().  However, if the lock cannot be acquired
            immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
            address, not an address of the low-level spinner.
#endif // USE_ITT_BUILD
*/
template< typename UT >
// ToDo: make inline function (move to header file for icl)
static UT  // unsigned 4- or 8-byte type
__kmp_wait_yield( volatile UT * spinner,
                  UT            checker,
                  kmp_uint32 (* pred)( UT, UT )
                  USE_ITT_BUILD_ARG(void * obj)   // Higher-level synchronization object, or NULL.
                  )
{
    // note: we may not belong to a team at this point
    register volatile UT   * spin  = spinner;
    register UT              check = checker;
    register kmp_uint32      spins;
    register kmp_uint32 (*f) ( UT, UT ) = pred;
    register UT              r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = *spin, check))
    {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        // if we are oversubscribed,
        // or have waited a bit (and KMP_LIBRARY=throughput), then yield
        // pause is in the following code
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}

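// Illustrative sketch (not compiled, variable names are hypothetical): typical use of
// __kmp_wait_yield() with one of the predicate templates defined below, similar to how
// __kmp_dispatch_init() later waits for a dispatch buffer to become free.
#if 0
    volatile kmp_uint32 *flag = &sh->buffer_index;   // shared location being watched
    kmp_uint32 expected       = my_buffer_index;     // value we are waiting for
    __kmp_wait_yield< kmp_uint32 >( flag, expected, __kmp_eq< kmp_uint32 >
                                    USE_ITT_BUILD_ARG( NULL ) );
#endif
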
template< typename UT >
static kmp_uint32 __kmp_eq( UT value, UT checker) {
    return value == checker;
}

template< typename UT >
static kmp_uint32 __kmp_neq( UT value, UT checker) {
    return value != checker;
}

template< typename UT >
static kmp_uint32 __kmp_lt( UT value, UT checker) {
    return value < checker;
}

template< typename UT >
static kmp_uint32 __kmp_ge( UT value, UT checker) {
    return value >= checker;
}

template< typename UT >
static kmp_uint32 __kmp_le( UT value, UT checker) {
    return value <= checker;
}


/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

static void
__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    kmp_info_t *th;

    KMP_DEBUG_ASSERT( gtid_ref );

    if ( __kmp_env_consistency_check ) {
        th = __kmp_threads[*gtid_ref];
        if ( th -> th.th_root -> r.r_active
          && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
#if KMP_USE_DYNAMIC_LOCK
            __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
#else
            __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
#endif
        }
    }
}

template< typename UT >
static void
__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    typedef typename traits_t< UT >::signed_t ST;
    dispatch_private_info_template< UT > * pr;

    int gtid = *gtid_ref;
//  int  cid = *cid_ref;
    kmp_info_t *th = __kmp_threads[ gtid ];
    KMP_DEBUG_ASSERT( th -> th.th_dispatch );

    KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
    if ( __kmp_env_consistency_check ) {
        pr = reinterpret_cast< dispatch_private_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_pr_current );
        if ( pr -> pushed_ws != ct_none ) {
#if KMP_USE_DYNAMIC_LOCK
            __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
#else
            __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
#endif
        }
    }

    if ( ! th -> th.th_team -> t.t_serialized ) {
        dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_sh_current );
        UT lower;

        if ( ! __kmp_env_consistency_check ) {
            pr = reinterpret_cast< dispatch_private_info_template< UT >* >
                ( th -> th.th_dispatch -> th_dispatch_pr_current );
        }
        lower = pr->u.p.ordered_lower;

        #if ! defined( KMP_GOMP_COMPAT )
            if ( __kmp_env_consistency_check ) {
                if ( pr->ordered_bumped ) {
                    struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
                    __kmp_error_construct2(
                        kmp_i18n_msg_CnsMultipleNesting,
                        ct_ordered_in_pdo, loc_ref,
                        & p->stack_data[ p->w_top ]
                    );
                }
            }
        #endif /* !defined(KMP_GOMP_COMPAT) */

        KMP_MB();
        #ifdef KMP_DEBUG
        {
            const char * buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format(
                "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
                traits_t< UT >::spec, traits_t< UT >::spec );
            KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
            __kmp_str_free( &buff );
        }
        #endif

        __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
                                USE_ITT_BUILD_ARG( NULL )
                                );
        KMP_MB();  /* is this necessary? */
        #ifdef KMP_DEBUG
        {
            const char * buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format(
                "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
                traits_t< UT >::spec, traits_t< UT >::spec );
            KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
            __kmp_str_free( &buff );
        }
        #endif
    }
    KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
}

static void
__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    kmp_info_t *th;

    if ( __kmp_env_consistency_check ) {
        th = __kmp_threads[*gtid_ref];
        if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
            __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
        }
    }
}

template< typename UT >
static void
__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    typedef typename traits_t< UT >::signed_t ST;
    dispatch_private_info_template< UT > * pr;

    int gtid = *gtid_ref;
//  int  cid = *cid_ref;
    kmp_info_t *th = __kmp_threads[ gtid ];
    KMP_DEBUG_ASSERT( th -> th.th_dispatch );

    KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
    if ( __kmp_env_consistency_check ) {
        pr = reinterpret_cast< dispatch_private_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_pr_current );
        if ( pr -> pushed_ws != ct_none ) {
            __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
        }
    }

    if ( ! th -> th.th_team -> t.t_serialized ) {
        dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_sh_current );

        if ( ! __kmp_env_consistency_check ) {
            pr = reinterpret_cast< dispatch_private_info_template< UT >* >
                ( th -> th.th_dispatch -> th_dispatch_pr_current );
        }

        KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
        #if ! defined( KMP_GOMP_COMPAT )
            if ( __kmp_env_consistency_check ) {
                if ( pr->ordered_bumped != 0 ) {
                    struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
                    /* How to test it? - OM */
                    __kmp_error_construct2(
                        kmp_i18n_msg_CnsMultipleNesting,
                        ct_ordered_in_pdo, loc_ref,
                        & p->stack_data[ p->w_top ]
                    );
                }
            }
        #endif /* !defined(KMP_GOMP_COMPAT) */

        KMP_MB();       /* Flush all pending memory write invalidates.  */

        pr->ordered_bumped += 1;

        KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
                        gtid, pr->ordered_bumped ) );

        KMP_MB();       /* Flush all pending memory write invalidates.  */

        /* TODO use general release procedure? */
        test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );

        KMP_MB();       /* Flush all pending memory write invalidates.  */
    }
    KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
}

/* Computes and returns x to the power of y, where y must be a non-negative integer */
template< typename UT >
static __forceinline long double
__kmp_pow(long double x, UT y) {
    long double s=1.0L;

    KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
    //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
    while(y) {
        if ( y & 1 )
            s *= x;
        x *= x;
        y >>= 1;
    }
    return s;
}

/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
   (the total number of unassigned iterations in chunks with index greater than or equal to idx).
   __forceinline seems to be broken, so if we __forceinline this function the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails).
*/
template< typename T >
static __inline typename traits_t< T >::unsigned_t
__kmp_dispatch_guided_remaining(
    T                                  tc,
    typename traits_t< T >::floating_t base,
    typename traits_t< T >::unsigned_t idx
) {
    /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
       least for ICL 8.1, long double arithmetic may not really have
       long double precision, even with /Qlong_double.  Currently, we
       workaround that in the caller code, by manipulating the FPCW for
       Windows* OS on IA-32 architecture.  The lack of precision is not
       expected to be a correctness issue, though.
    */
    typedef typename traits_t< T >::unsigned_t UT;

    long double x = tc * __kmp_pow< UT >(base, idx);
    UT r = (UT) x;
    if ( x == r )
        return r;
    return r + 1;
}

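// Worked example (comment only, the numbers are hypothetical): with tc = 1000 iterations,
// base = (2*nproc - 1)/(2*nproc) = 0.875 for nproc = 4, and idx = 10 chunks already handed out,
// the estimate is 1000 * 0.875^10 ~= 263.1, so the function returns 264 -- the result is always
// rounded up so the remaining work is never underestimated.
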
// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// by default n = 2.  For example, with n = 3 the chunk distribution will be flatter.
// With n = 1 the first chunk is the same as for the static schedule, i.e. trip / nproc.
static int guided_int_param = 2;
static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;

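// Illustrative sketch (not compiled, names are hypothetical): roughly how the two parameters
// above are used by the guided-iterative scheme in __kmp_dispatch_next().  Once the remaining
// iteration count drops below p2, the schedule degenerates to plain dynamic,chunk; before that,
// each grab takes about remaining/(n*nproc) iterations (which is then always larger than chunk).
#if 0
static kmp_uint64
example_guided_iterative_chunk( kmp_uint64 remaining, kmp_uint64 nproc, kmp_uint64 chunk )
{
    kmp_uint64 p2 = guided_int_param * nproc * ( chunk + 1 ); // switch-over point (kept in parm2)
    double     p3 = guided_flt_param / nproc;                 // multiplier (kept in parm3)
    if ( remaining < p2 )
        return chunk;                         // tail of the loop: plain dynamic,chunk behavior
    return (kmp_uint64)( remaining * p3 );    // ~ remaining/(2*nproc)
}
#endif
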
// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
template< typename T >
static void
__kmp_dispatch_init(
    ident_t * loc,
    int gtid,
    enum sched_type schedule,
    T lb,
    T ub,
    typename traits_t< T >::signed_t st,
    typename traits_t< T >::signed_t chunk,
    int push_ws
) {
    typedef typename traits_t< T >::unsigned_t UT;
    typedef typename traits_t< T >::signed_t   ST;
    typedef typename traits_t< T >::floating_t DBL;
    static const int ___kmp_size_type = sizeof( UT );

    int          active;
    T            tc;
    kmp_info_t * th;
    kmp_team_t * team;
    kmp_uint32   my_buffer_index;
    dispatch_private_info_template< T > * pr;
    dispatch_shared_info_template< UT > volatile * sh;

    KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
    KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );

    if ( ! TCR_4( __kmp_init_parallel ) )
        __kmp_parallel_initialize();

#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_INIT();
#endif
    #ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
            traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
        KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
        __kmp_str_free( &buff );
    }
    #endif
    /* setup data */
    th     = __kmp_threads[ gtid ];
    team   = th -> th.th_team;
    active = ! team -> t.t_serialized;
    th->th.th_ident = loc;

#if USE_ITT_BUILD
    kmp_uint64 cur_chunk = chunk;
    int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
        KMP_MASTER_GTID(gtid) &&
#if OMP_40_ENABLED
        th->th.th_teams_microtask == NULL &&
#endif
        team->t.t_active_level == 1;
#endif
    if ( ! active ) {
        pr = reinterpret_cast< dispatch_private_info_template< T >* >
            ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
    } else {
        KMP_DEBUG_ASSERT( th->th.th_dispatch ==
                &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );

        my_buffer_index = th->th.th_dispatch->th_disp_index ++;

        /* What happens when number of threads changes, need to resize buffer? */
        pr = reinterpret_cast< dispatch_private_info_template< T > * >
            ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
        sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
            ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
    }

    /* Pick up the nomerge/ordered bits from the scheduling type */
    if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
        pr->nomerge = TRUE;
        schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
    } else {
        pr->nomerge = FALSE;
    }
    pr->type_size = ___kmp_size_type; // remember the size of variables
    if ( kmp_ord_lower & schedule ) {
        pr->ordered = TRUE;
        schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
    } else {
        pr->ordered = FALSE;
    }
    if ( schedule == kmp_sch_static ) {
        schedule = __kmp_static;
    } else {
        if ( schedule == kmp_sch_runtime ) {
            // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
            schedule = team -> t.t_sched.r_sched_type;
            // Detail the schedule if needed (global controls are differentiated appropriately)
            if ( schedule == kmp_sch_guided_chunked ) {
                schedule = __kmp_guided;
            } else if ( schedule == kmp_sch_static ) {
                schedule = __kmp_static;
            }
            // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
            chunk = team -> t.t_sched.chunk;

            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
                    traits_t< ST >::spec );
                KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
                __kmp_str_free( &buff );
            }
            #endif
        } else {
            if ( schedule == kmp_sch_guided_chunked ) {
                schedule = __kmp_guided;
            }
            if ( chunk <= 0 ) {
                chunk = KMP_DEFAULT_CHUNK;
            }
        }

        if ( schedule == kmp_sch_auto ) {
            // mapping and differentiation: in the __kmp_do_serial_initialize()
            schedule = __kmp_auto;
            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
                    traits_t< ST >::spec );
                KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
                __kmp_str_free( &buff );
            }
            #endif
        }

        /* guided analytical not safe for too many threads */
        if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
            schedule = kmp_sch_guided_iterative_chunked;
            KMP_WARNING( DispatchManyThreads );
        }
        pr->u.p.parm1 = chunk;
    }
    KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
                 "unknown scheduling type" );

    pr->u.p.count = 0;

    if ( __kmp_env_consistency_check ) {
        if ( st == 0 ) {
            __kmp_error_construct(
                kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
            );
        }
    }

    tc = ( ub - lb + st );
    if ( st != 1 ) {
        if ( st < 0 ) {
            if ( lb < ub ) {
                tc = 0;            // zero-trip
            } else {   // lb >= ub
                tc = (ST)tc / st;  // convert to signed division
            }
        } else {       // st > 0
            if ( ub < lb ) {
                tc = 0;            // zero-trip
            } else {   // ub >= lb
                tc /= st;
            }
        }
    } else if ( ub < lb ) { // st == 1
        tc = 0;            // zero-trip
    }

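    // Worked example (comment only, values are hypothetical): for lb = 0, ub = 9, st = 2 the loop
    // runs over 0,2,4,6,8 and the code above computes tc = (9 - 0 + 2) / 2 = 5.  For a downward
    // loop with lb = 10, ub = 1, st = -3 (iterations 10,7,4,1) it computes
    // tc = (1 - 10 + (-3)) / (-3) = (-12) / (-3) = 4.
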
    pr->u.p.lb = lb;
    pr->u.p.ub = ub;
    pr->u.p.st = st;
    pr->u.p.tc = tc;

    #if KMP_OS_WINDOWS
    pr->u.p.last_upper = ub + st;
    #endif /* KMP_OS_WINDOWS */

    /* NOTE: only the active parallel region(s) have active ordered sections */

    if ( active ) {
        if ( pr->ordered == 0 ) {
            th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
            th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
        } else {
            pr->ordered_bumped = 0;

            pr->u.p.ordered_lower = 1;
            pr->u.p.ordered_upper = 0;

            th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
            th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
        }
    }

    if ( __kmp_env_consistency_check ) {
        enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
        if ( push_ws ) {
            __kmp_push_workshare( gtid, ws, loc );
            pr->pushed_ws = ws;
        } else {
            __kmp_check_workshare( gtid, ws, loc );
            pr->pushed_ws = ct_none;
        }
    }

    switch ( schedule ) {
    #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
    case kmp_sch_static_steal:
        {
            T nproc = team->t.t_nproc;
            T ntc, init;

            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );

            ntc = (tc % chunk ? 1 : 0) + tc / chunk;
            if ( nproc > 1 && ntc >= nproc ) {
                T id = __kmp_tid_from_gtid(gtid);
                T small_chunk, extras;

                small_chunk = ntc / nproc;
                extras = ntc % nproc;

                init = id * small_chunk + ( id < extras ? id : extras );
                pr->u.p.count = init;
                pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );

                pr->u.p.parm2 = lb;
                //pr->pfields.parm3 = 0; // it's not used in static_steal
                pr->u.p.parm4 = id;
                pr->u.p.st = st;
                break;
            } else {
                KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
                               gtid ) );
                schedule = kmp_sch_static_balanced;
                /* too few iterations: fall-through to kmp_sch_static_balanced */
            } // if
            /* FALL-THROUGH to static balanced */
        } // case
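        // Worked example (comment only, numbers are hypothetical): tc = 100, chunk = 7 gives
        // ntc = 15 chunks; with nproc = 4, small_chunk = 3 and extras = 3, so threads 0..3 start
        // at chunk indices 0, 4, 8, 12 and initially own 4, 4, 4, 3 chunks respectively, before
        // any stealing takes place.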
    #endif
    case kmp_sch_static_balanced:
        {
            T nproc = team->t.t_nproc;
            T init, limit;

            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
                           gtid ) );

            if ( nproc > 1 ) {
                T id = __kmp_tid_from_gtid(gtid);

                if ( tc < nproc ) {
                    if ( id < tc ) {
                        init = id;
                        limit = id;
                        pr->u.p.parm1 = (id == tc - 1);  /* parm1 stores *plastiter */
                    } else {
                        pr->u.p.count = 1;  /* means no more chunks to execute */
                        pr->u.p.parm1 = FALSE;
                        break;
                    }
                } else {
                    T small_chunk = tc / nproc;
                    T extras = tc % nproc;
                    init = id * small_chunk + (id < extras ? id : extras);
                    limit = init + small_chunk - (id < extras ? 0 : 1);
                    pr->u.p.parm1 = (id == nproc - 1);
                }
            } else {
                if ( tc > 0 ) {
                    init = 0;
                    limit = tc - 1;
                    pr->u.p.parm1 = TRUE;
                } else {
                    // zero trip count
                    pr->u.p.count = 1;  /* means no more chunks to execute */
                    pr->u.p.parm1 = FALSE;
                    break;
                }
            }
#if USE_ITT_BUILD
            // Calculate chunk for metadata report
            if ( itt_need_metadata_reporting )
                cur_chunk = limit - init + 1;
#endif
            if ( st == 1 ) {
                pr->u.p.lb = lb + init;
                pr->u.p.ub = lb + limit;
            } else {
                T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
                pr->u.p.lb = lb + init * st;
                // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
                if ( st > 0 ) {
                    pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
                } else {
                    pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
                }
            }
            if ( pr->ordered ) {
                pr->u.p.ordered_lower = init;
                pr->u.p.ordered_upper = limit;
            }
            break;
        } // case
    case kmp_sch_guided_iterative_chunked :
        {
            T nproc = team->t.t_nproc;
            KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));

            if ( nproc > 1 ) {
                if ( (2L * chunk + 1 ) * nproc >= tc ) {
                    /* chunk size too large, switch to dynamic */
                    schedule = kmp_sch_dynamic_chunked;
                } else {
                    // when remaining iters become less than parm2 - switch to dynamic
                    pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
                    *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
                }
            } else {
                KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
                schedule = kmp_sch_static_greedy;
                /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
                KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
                pr->u.p.parm1 = tc;
            } // if
        } // case
        break;
    case kmp_sch_guided_analytical_chunked:
        {
            T nproc = team->t.t_nproc;
            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));

            if ( nproc > 1 ) {
                if ( (2L * chunk + 1 ) * nproc >= tc ) {
                    /* chunk size too large, switch to dynamic */
                    schedule = kmp_sch_dynamic_chunked;
                } else {
                    /* commonly used term: (2 nproc - 1)/(2 nproc) */
                    DBL x;

                    #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* Linux* OS already has 64-bit computation by default for
                       long double, and on Windows* OS on Intel(R) 64,
                       /Qlong_double doesn't work.  On Windows* OS
                       on IA-32 architecture, we need to set precision to
                       64-bit instead of the default 53-bit.  Even though long
                       double doesn't work on Windows* OS on Intel(R) 64, the
                       resulting lack of precision is not expected to impact
                       the correctness of the algorithm, but this has not been
                       mathematically proven.
                    */
                    // save original FPCW and set precision to 64-bit, as
                    // Windows* OS on IA-32 architecture defaults to 53-bit
                    unsigned int oldFpcw = _control87(0,0);
                    _control87(_PC_64,_MCW_PC); // 0,0x30000
                    #endif
                    /* value used for comparison in solver for cross-over point */
                    long double target = ((long double)chunk * 2 + 1) * nproc / tc;

                    /* crossover point--chunk indexes equal to or greater than
                       this point switch to dynamic-style scheduling */
                    UT cross;

                    /* commonly used term: (2 nproc - 1)/(2 nproc) */
                    x = (long double)1.0 - (long double)0.5 / nproc;

                    #ifdef KMP_DEBUG
                    { // test natural alignment
                        struct _test_a {
                            char a;
                            union {
                                char b;
                                DBL  d;
                            };
                        } t;
                        ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
                        //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
                        KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
                    }
                    #endif // KMP_DEBUG

                    /* save the term in thread private dispatch structure */
                    *(DBL*)&pr->u.p.parm3 = x;

                    /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
                    {
                        UT          left, right, mid;
                        long double p;

                        /* estimate initial upper and lower bound */

                        /* doesn't matter what value right is as long as it is positive, but
                           it affects performance of the solver
                        */
                        right = 229;
                        p = __kmp_pow< UT >(x,right);
                        if ( p > target ) {
                            do{
                                p *= p;
                                right <<= 1;
                            } while(p>target && right < (1<<27));
                            left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
                        } else {
                            left = 0;
                        }

                        /* bisection root-finding method */
                        while ( left + 1 < right ) {
                            mid = (left + right) / 2;
                            if ( __kmp_pow< UT >(x,mid) > target ) {
                                left = mid;
                            } else {
                                right = mid;
                            }
                        } // while
                        cross = right;
                    }
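                    // Worked example (comment only, numbers are hypothetical): for nproc = 4,
                    // chunk = 8 and tc = 1000, x = 1 - 0.5/4 = 0.875 and
                    // target = (2*8 + 1) * 4 / 1000 = 0.068; the solver finds the smallest i with
                    // 0.875^i <= 0.068, which is cross = 21 (0.875^20 ~= 0.069, 0.875^21 ~= 0.061).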
                    /* assert sanity of computed crossover point */
                    KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);

                    /* save the crossover point in thread private dispatch structure */
                    pr->u.p.parm2 = cross;

                    // C75803
                    #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
                        #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
                    #else
                        #define GUIDED_ANALYTICAL_WORKAROUND (x)
                    #endif
                    /* dynamic-style scheduling offset */
                    pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
                    #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    // restore FPCW
                    _control87(oldFpcw,_MCW_PC);
                    #endif
                } // if
            } else {
                KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
                               gtid ) );
                schedule = kmp_sch_static_greedy;
                /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
                pr->u.p.parm1 = tc;
            } // if
        } // case
        break;
    case kmp_sch_static_greedy:
        KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
        pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
            ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
            tc;
        break;
    case kmp_sch_static_chunked :
    case kmp_sch_dynamic_chunked :
        KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
        break;
    case kmp_sch_trapezoidal :
        {
            /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */

            T parm1, parm2, parm3, parm4;
            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );

            parm1 = chunk;

            /* F : size of the first cycle */
            parm2 = ( tc / (2 * team->t.t_nproc) );

            if ( parm2 < 1 ) {
                parm2 = 1;
            }

            /* L : size of the last cycle.  Make sure the last cycle
             *     is not larger than the first cycle.
             */
            if ( parm1 < 1 ) {
                parm1 = 1;
            } else if ( parm1 > parm2 ) {
                parm1 = parm2;
            }

            /* N : number of cycles */
            parm3 = ( parm2 + parm1 );
            parm3 = ( 2 * tc + parm3 - 1) / parm3;

            if ( parm3 < 2 ) {
                parm3 = 2;
            }

            /* sigma : decreasing incr of the trapezoid */
            parm4 = ( parm3 - 1 );
            parm4 = ( parm2 - parm1 ) / parm4;

            // pointless check, because parm4 >= 0 always
            //if ( parm4 < 0 ) {
            //    parm4 = 0;
            //}

            pr->u.p.parm1 = parm1;
            pr->u.p.parm2 = parm2;
            pr->u.p.parm3 = parm3;
            pr->u.p.parm4 = parm4;
        } // case
        break;
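        // Worked example (comment only, numbers are hypothetical): tc = 1000, nproc = 4 and a
        // requested chunk of 1 give F = parm2 = 1000/8 = 125, L = parm1 = 1,
        // N = parm3 = (2*1000 + 126 - 1) / 126 = 16 cycles and sigma = parm4 = (125 - 1)/15 = 8,
        // so successive chunks are roughly 125, 117, 109, ... shrinking by 8 per cycle.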

    default:
        {
            __kmp_msg(
                kmp_ms_fatal,                        // Severity
                KMP_MSG( UnknownSchedTypeDetected ), // Primary message
                KMP_HNT( GetNewerLibrary ),          // Hint
                __kmp_msg_null                       // Variadic argument list terminator
            );
        }
        break;
    } // switch
    pr->schedule = schedule;
    if ( active ) {
        /* The name of this buffer should be my_buffer_index when it's free to use it */

        KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
                       gtid, my_buffer_index, sh->buffer_index) );
        __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
                                        USE_ITT_BUILD_ARG( NULL )
                                        );
        // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
        // *always* 32-bit integers.
        KMP_MB();  /* is this necessary? */
        KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
                       gtid, my_buffer_index, sh->buffer_index) );

        th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
        th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*)  sh;
#if USE_ITT_BUILD
        if ( pr->ordered ) {
            __kmp_itt_ordered_init( gtid );
        }; // if
        // Report loop metadata
        if ( itt_need_metadata_reporting ) {
            // Only report metadata by master of active team at level 1
            kmp_uint64 schedtype = 0;
            switch ( schedule ) {
            case kmp_sch_static_chunked:
            case kmp_sch_static_balanced:// Chunk is calculated in the switch above
                break;
            case kmp_sch_static_greedy:
                cur_chunk = pr->u.p.parm1;
                break;
            case kmp_sch_dynamic_chunked:
                schedtype = 1;
                break;
            case kmp_sch_guided_iterative_chunked:
            case kmp_sch_guided_analytical_chunked:
                schedtype = 2;
                break;
            default:
//              Should we put this case under "static"?
//              case kmp_sch_static_steal:
                schedtype = 3;
                break;
            }
            __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
        }
#endif /* USE_ITT_BUILD */
    }; // if

    #ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
            " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
            " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
            traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
            traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
            traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
            traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
        KD_TRACE(10, ( buff,
            gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
            pr->u.p.st, pr->u.p.tc, pr->u.p.count,
            pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
            pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
        __kmp_str_free( &buff );
    }
    #endif
    #if ( KMP_STATIC_STEAL_ENABLED )
    if ( ___kmp_size_type < 8 ) {
        // It cannot be guaranteed that after execution of a loop with some other schedule kind
        // all the parm3 variables will contain the same value.
        // Even if all parm3 were the same, there still exists a bad case like using 0 and 1
        // rather than a program life-time increment.
        // So the dedicated variable is required.  The 'static_steal_counter' is used.
        if ( schedule == kmp_sch_static_steal ) {
            // Other threads will inspect this variable when searching for a victim.
            // This is a flag showing that other threads may steal from this thread since then.
            volatile T * p = &pr->u.p.static_steal_counter;
            *p = *p + 1;
        }
    }
    #endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_TRACE
    if ((ompt_status == ompt_status_track_callback) &&
        ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
        ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
        ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
        ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
            team_info->parallel_id, task_info->task_id, team_info->microtask);
    }
#endif
}

/*
 * For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations.  If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed.
 */
template< typename UT >
static void
__kmp_dispatch_finish( int gtid, ident_t *loc )
{
    typedef typename traits_t< UT >::signed_t ST;
    kmp_info_t *th = __kmp_threads[ gtid ];

    KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
    if ( ! th -> th.th_team -> t.t_serialized ) {

        dispatch_private_info_template< UT > * pr =
            reinterpret_cast< dispatch_private_info_template< UT >* >
            ( th->th.th_dispatch->th_dispatch_pr_current );
        dispatch_shared_info_template< UT > volatile * sh =
            reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
            ( th->th.th_dispatch->th_dispatch_sh_current );
        KMP_DEBUG_ASSERT( pr );
        KMP_DEBUG_ASSERT( sh );
        KMP_DEBUG_ASSERT( th->th.th_dispatch ==
                &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );

        if ( pr->ordered_bumped ) {
            KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
                            gtid ) );
            pr->ordered_bumped = 0;
        } else {
            UT lower = pr->u.p.ordered_lower;

            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
                    traits_t< UT >::spec, traits_t< UT >::spec );
                KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
                __kmp_str_free( &buff );
            }
            #endif

            __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
                                   USE_ITT_BUILD_ARG(NULL)
                                   );
            KMP_MB();  /* is this necessary? */
            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
                    traits_t< UT >::spec, traits_t< UT >::spec );
                KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
                __kmp_str_free( &buff );
            }
            #endif

            test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
        } // if
    } // if
    KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
}

#ifdef KMP_GOMP_COMPAT

template< typename UT >
static void
__kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
{
    typedef typename traits_t< UT >::signed_t ST;
    kmp_info_t *th = __kmp_threads[ gtid ];

    KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
    if ( ! th -> th.th_team -> t.t_serialized ) {
//        int cid;
        dispatch_private_info_template< UT > * pr =
            reinterpret_cast< dispatch_private_info_template< UT >* >
            ( th->th.th_dispatch->th_dispatch_pr_current );
        dispatch_shared_info_template< UT > volatile * sh =
            reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
            ( th->th.th_dispatch->th_dispatch_sh_current );
        KMP_DEBUG_ASSERT( pr );
        KMP_DEBUG_ASSERT( sh );
        KMP_DEBUG_ASSERT( th->th.th_dispatch ==
                &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );

//        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
            UT lower = pr->u.p.ordered_lower;
            UT upper = pr->u.p.ordered_upper;
            UT inc = upper - lower + 1;

            if ( pr->ordered_bumped == inc ) {
                KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
                                gtid ) );
                pr->ordered_bumped = 0;
            } else {
                inc -= pr->ordered_bumped;

                #ifdef KMP_DEBUG
                {
                    const char * buff;
                    // create format specifiers before the debug output
                    buff = __kmp_str_format(
                        "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
                        "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
                        traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
                    KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
                    __kmp_str_free( &buff );
                }
                #endif

                __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
                                       USE_ITT_BUILD_ARG(NULL)
                                       );

                KMP_MB();  /* is this necessary? */
                KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
                                gtid ) );
                pr->ordered_bumped = 0;
//!!!!! TODO check if the inc should be unsigned, or signed???
                #ifdef KMP_DEBUG
                {
                    const char * buff;
                    // create format specifiers before the debug output
                    buff = __kmp_str_format(
                        "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
                        "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
                        traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
                    KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
                    __kmp_str_free( &buff );
                }
                #endif

                test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
            }
//        }
    }
    KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
}

#endif /* KMP_GOMP_COMPAT */

/* Define a macro for exiting __kmp_dispatch_next().  If status is 0
 * (no more work), then tell OMPT the loop is over.  In some cases
 * kmp_dispatch_fini() is not called. */
#if OMPT_SUPPORT && OMPT_TRACE
#define OMPT_LOOP_END                                                        \
    if (status == 0) {                                                       \
        if ((ompt_status == ompt_status_track_callback) &&                   \
            ompt_callbacks.ompt_callback(ompt_event_loop_end)) {             \
            ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);      \
            ompt_task_info_t *task_info = __ompt_get_taskinfo(0);            \
            ompt_callbacks.ompt_callback(ompt_event_loop_end)(               \
                team_info->parallel_id, task_info->task_id);                 \
        }                                                                    \
    }
#else
#define OMPT_LOOP_END // no-op
#endif

template< typename T >
static int
__kmp_dispatch_next(
    ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
) {

    typedef typename traits_t< T >::unsigned_t UT;
    typedef typename traits_t< T >::signed_t   ST;
    typedef typename traits_t< T >::floating_t DBL;
#if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
    static const int ___kmp_size_type = sizeof( UT );
#endif

    int status;
    dispatch_private_info_template< T > * pr;
    kmp_info_t * th   = __kmp_threads[ gtid ];
    kmp_team_t * team = th -> th.th_team;

    KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
    #ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
        KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
        __kmp_str_free( &buff );
    }
    #endif

    if ( team -> t.t_serialized ) {
        /* NOTE: serialize this dispatch because we are not at the active level */
        pr = reinterpret_cast< dispatch_private_info_template< T >* >
            ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
        KMP_DEBUG_ASSERT( pr );

        if ( (status = (pr->u.p.tc != 0)) == 0 ) {
            *p_lb = 0;
            *p_ub = 0;
//            if ( p_last != NULL )
//                *p_last = 0;
            if ( p_st != NULL )
                *p_st = 0;
            if ( __kmp_env_consistency_check ) {
                if ( pr->pushed_ws != ct_none ) {
                    pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
                }
            }
        } else if ( pr->nomerge ) {
            kmp_int32 last;
            T         start;
            UT        limit, trip, init;
            ST        incr;
            T         chunk = pr->u.p.parm1;

            KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );

            init = chunk * pr->u.p.count++;
            trip = pr->u.p.tc - 1;

            if ( (status = (init <= trip)) == 0 ) {
                *p_lb = 0;
                *p_ub = 0;
//                if ( p_last != NULL )
//                    *p_last = 0;
                if ( p_st != NULL )
                    *p_st = 0;
                if ( __kmp_env_consistency_check ) {
                    if ( pr->pushed_ws != ct_none ) {
                        pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
                    }
                }
            } else {
                start = pr->u.p.lb;
                limit = chunk + init - 1;
                incr  = pr->u.p.st;

                if ( (last = (limit >= trip)) != 0 ) {
                    limit = trip;
                    #if KMP_OS_WINDOWS
                    pr->u.p.last_upper = pr->u.p.ub;
                    #endif /* KMP_OS_WINDOWS */
                }
                if ( p_last != NULL )
                    *p_last = last;
                if ( p_st != NULL )
                    *p_st = incr;
                if ( incr == 1 ) {
                    *p_lb = start + init;
                    *p_ub = start + limit;
                } else {
                    *p_lb = start + init * incr;
                    *p_ub = start + limit * incr;
                }

                if ( pr->ordered ) {
                    pr->u.p.ordered_lower = init;
                    pr->u.p.ordered_upper = limit;
                    #ifdef KMP_DEBUG
                    {
                        const char * buff;
                        // create format specifiers before the debug output
                        buff = __kmp_str_format(
                            "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
                            traits_t< UT >::spec, traits_t< UT >::spec );
                        KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
                        __kmp_str_free( &buff );
                    }
                    #endif
                } // if
            } // if
        } else {
            pr->u.p.tc = 0;
            *p_lb = pr->u.p.lb;
            *p_ub = pr->u.p.ub;
            #if KMP_OS_WINDOWS
            pr->u.p.last_upper = *p_ub;
            #endif /* KMP_OS_WINDOWS */
            if ( p_last != NULL )
                *p_last = TRUE;
            if ( p_st != NULL )
                *p_st = pr->u.p.st;
        } // if
        #ifdef KMP_DEBUG
        {
            const char * buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format(
                "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
                "p_ub:%%%s p_st:%%%s p_last:%%p %%d  returning:%%d\n",
                traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
            KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
            __kmp_str_free( &buff );
        }
        #endif
#if INCLUDE_SSC_MARKS
        SSC_MARK_DISPATCH_NEXT();
#endif
        OMPT_LOOP_END;
        return status;
1515 } else {
1516 kmp_int32 last = 0;
1517 dispatch_shared_info_template< UT > *sh;
1518 T start;
1519 ST incr;
1520 UT limit, trip, init;
1521
1522 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1523 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1524
1525 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1526 ( th->th.th_dispatch->th_dispatch_pr_current );
1527 KMP_DEBUG_ASSERT( pr );
1528 sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1529 ( th->th.th_dispatch->th_dispatch_sh_current );
1530 KMP_DEBUG_ASSERT( sh );
1531
1532 if ( pr->u.p.tc == 0 ) {
1533 // zero trip count
1534 status = 0;
1535 } else {
1536 switch (pr->schedule) {
1537 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1538 case kmp_sch_static_steal:
1539 {
1540 T chunk = pr->u.p.parm1;
1541
1542 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1543
1544 trip = pr->u.p.tc - 1;
1545
1546 if ( ___kmp_size_type > 4 ) {
1547 // Other threads do not look into the data of this thread,
1548 // so it's not necessary to make volatile casting.
1549 init = ( pr->u.p.count )++;
1550 status = ( init < (UT)pr->u.p.ub );
1551 } else {
1552 typedef union {
1553 struct {
1554 UT count;
1555 T ub;
1556 } p;
1557 kmp_int64 b;
1558 } union_i4;
1559 // All operations on 'count' or 'ub' must be combined atomically together.
1560 // stealing implemented only for 4-byte indexes
1561 {
1562 union_i4 vold, vnew;
1563 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1564 vnew = vold;
1565 vnew.p.count++;
1566 while( ! KMP_COMPARE_AND_STORE_ACQ64(
1567 ( volatile kmp_int64* )&pr->u.p.count,
1568 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1569 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1570 KMP_CPU_PAUSE();
1571 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1572 vnew = vold;
1573 vnew.p.count++;
1574 }
1575 vnew = vold;
1576 init = vnew.p.count;
1577 status = ( init < (UT)vnew.p.ub ) ;
1578 }
1579
1580 if( !status ) {
1581 kmp_info_t **other_threads = team->t.t_threads;
1582 int while_limit = 10;
1583 int while_index = 0;
1584
1585 // TODO: algorithm of searching for a victim
1586 // should be cleaned up and measured
1587 while ( ( !status ) && ( while_limit != ++while_index ) ) {
1588 union_i4 vold, vnew;
1589 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1590 T victimIdx = pr->u.p.parm4;
1591 T oldVictimIdx = victimIdx;
1592 dispatch_private_info_template< T > * victim;
1593
1594 do {
1595 if( !victimIdx ) {
1596 victimIdx = team->t.t_nproc - 1;
1597 } else {
1598 --victimIdx;
1599 }
1600 victim = reinterpret_cast< dispatch_private_info_template< T >* >
1601 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1602 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1603 // TODO: think about a proper place of this test
1604 if ( ( !victim ) ||
1605 ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1606 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1607 // TODO: delay would be nice
1608 continue;
1609 // the victim is not ready yet to participate in stealing
1610 // because the victim is still in kmp_init_dispatch
1611 }
1612 if ( oldVictimIdx == victimIdx ) {
1613 break;
1614 }
1615 pr->u.p.parm4 = victimIdx;
1616
1617 while( 1 ) {
1618 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1619 vnew = vold;
1620
1621 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1622 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1623 break;
1624 }
1625 vnew.p.ub -= (remaining >> 2);
1626 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1627 #pragma warning( push )
1628 // disable warning on pointless comparison of unsigned with 0
1629 #pragma warning( disable: 186 )
1630 KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1631 #pragma warning( pop )
1632 // TODO: Should this be acquire or release?
1633 if ( KMP_COMPARE_AND_STORE_ACQ64(
1634 ( volatile kmp_int64 * )&victim->u.p.count,
1635 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1636 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1637 status = 1;
1638 while_index = 0;
1639 // now update own count and ub
1640 #if KMP_ARCH_X86
1641 // stealing executed on non-KMP_ARCH_X86 only
1642 // Atomic 64-bit write on ia32 is
1643 // unavailable, so we do this in steps.
1644 // This code is not tested.
1645 init = vold.p.count;
1646 pr->u.p.ub = 0;
1647 pr->u.p.count = init + 1;
1648 pr->u.p.ub = vnew.p.count;
1649 #else
1650 init = vnew.p.ub;
1651 vold.p.count = init + 1;
1652 // TODO: is it safe and enough?
1653 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1654 #endif // KMP_ARCH_X86
1655 break;
1656 } // if
1657 KMP_CPU_PAUSE();
1658 } // while (1)
1659 } // while
1660 } // if
1661 } // if
1662 if ( !status ) {
1663 *p_lb = 0;
1664 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001665 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001666 } else {
1667 start = pr->u.p.parm2;
1668 init *= chunk;
1669 limit = chunk + init - 1;
1670 incr = pr->u.p.st;
1671
1672 KMP_DEBUG_ASSERT(init <= trip);
1673 if ( (last = (limit >= trip)) != 0 )
1674 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001675 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001676
1677 if ( incr == 1 ) {
1678 *p_lb = start + init;
1679 *p_ub = start + limit;
1680 } else {
1681 *p_lb = start + init * incr;
1682 *p_ub = start + limit * incr;
1683 }
1684
1685 if ( pr->ordered ) {
1686 pr->u.p.ordered_lower = init;
1687 pr->u.p.ordered_upper = limit;
1688 #ifdef KMP_DEBUG
1689 {
1690 const char * buff;
1691 // create format specifiers before the debug output
1692 buff = __kmp_str_format(
1693 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1694 traits_t< UT >::spec, traits_t< UT >::spec );
1695 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1696 __kmp_str_free( &buff );
1697 }
1698 #endif
1699 } // if
1700 } // if
1701 break;
1702 } // case
1703 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1704 case kmp_sch_static_balanced:
1705 {
1706 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1707 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1708 pr->u.p.count = 1;
1709 *p_lb = pr->u.p.lb;
1710 *p_ub = pr->u.p.ub;
1711 last = pr->u.p.parm1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001712 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001713 *p_st = pr->u.p.st;
1714 } else { /* no iterations to do */
1715 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1716 }
1717 if ( pr->ordered ) {
1718 #ifdef KMP_DEBUG
1719 {
1720 const char * buff;
1721 // create format specifiers before the debug output
1722 buff = __kmp_str_format(
1723 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1724 traits_t< UT >::spec, traits_t< UT >::spec );
1725 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1726 __kmp_str_free( &buff );
1727 }
1728 #endif
1729 } // if
1730 } // case
1731 break;
1732 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1733 case kmp_sch_static_chunked:
1734 {
1735 T parm1;
1736
1737 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[greedy|chunked] case\n",
1738 gtid ) );
1739 parm1 = pr->u.p.parm1;
1740
1741 trip = pr->u.p.tc - 1;
1742 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1743
1744 if ( (status = (init <= trip)) != 0 ) {
1745 start = pr->u.p.lb;
1746 incr = pr->u.p.st;
1747 limit = parm1 + init - 1;
1748
1749 if ( (last = (limit >= trip)) != 0 )
1750 limit = trip;
1751
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001752 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001753
1754 pr->u.p.count += team->t.t_nproc;
1755
1756 if ( incr == 1 ) {
1757 *p_lb = start + init;
1758 *p_ub = start + limit;
1759 }
1760 else {
1761 *p_lb = start + init * incr;
1762 *p_ub = start + limit * incr;
1763 }
1764
1765 if ( pr->ordered ) {
1766 pr->u.p.ordered_lower = init;
1767 pr->u.p.ordered_upper = limit;
1768 #ifdef KMP_DEBUG
1769 {
1770 const char * buff;
1771 // create format specifiers before the debug output
1772 buff = __kmp_str_format(
1773 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1774 traits_t< UT >::spec, traits_t< UT >::spec );
1775 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1776 __kmp_str_free( &buff );
1777 }
1778 #endif
1779 } // if
1780 } // if
1781 } // case
1782 break;
1783
1784 case kmp_sch_dynamic_chunked:
1785 {
1786 T chunk = pr->u.p.parm1;
1787
1788 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1789 gtid ) );
1790
1791 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1792 trip = pr->u.p.tc - 1;
1793
1794 if ( (status = (init <= trip)) == 0 ) {
1795 *p_lb = 0;
1796 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001797 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001798 } else {
1799 start = pr->u.p.lb;
1800 limit = chunk + init - 1;
1801 incr = pr->u.p.st;
1802
1803 if ( (last = (limit >= trip)) != 0 )
1804 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001805
1806 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001807
1808 if ( incr == 1 ) {
1809 *p_lb = start + init;
1810 *p_ub = start + limit;
1811 } else {
1812 *p_lb = start + init * incr;
1813 *p_ub = start + limit * incr;
1814 }
1815
1816 if ( pr->ordered ) {
1817 pr->u.p.ordered_lower = init;
1818 pr->u.p.ordered_upper = limit;
1819 #ifdef KMP_DEBUG
1820 {
1821 const char * buff;
1822 // create format specifiers before the debug output
1823 buff = __kmp_str_format(
1824 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1825 traits_t< UT >::spec, traits_t< UT >::spec );
1826 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1827 __kmp_str_free( &buff );
1828 }
1829 #endif
1830 } // if
1831 } // if
1832 } // case
1833 break;
1834
1835 case kmp_sch_guided_iterative_chunked:
1836 {
1837 T chunkspec = pr->u.p.parm1;
1838 KD_TRACE(100,
1839 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1840 trip = pr->u.p.tc;
1841 // Start atomic part of calculations
1842 while(1) {
1843 ST remaining; // signed, because can be < 0
1844 init = sh->u.s.iteration; // shared value
1845 remaining = trip - init;
1846 if ( remaining <= 0 ) { // AC: need to compare with 0 first
1847 // nothing to do, don't try atomic op
1848 status = 0;
1849 break;
1850 }
1851 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1852 // use dynamic-style schedule
1853 // atomically increment iterations, get old value
1854 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1855 remaining = trip - init;
1856 if (remaining <= 0) {
1857 status = 0; // all iterations got by other threads
1858 } else {
1859 // got some iterations to work on
1860 status = 1;
1861 if ( (T)remaining > chunkspec ) {
1862 limit = init + chunkspec - 1;
1863 } else {
1864 last = 1; // the last chunk
1865 limit = init + remaining - 1;
1866 } // if
1867 } // if
1868 break;
1869 } // if
1870 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1871 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1872 // CAS was successful, chunk obtained
1873 status = 1;
1874 --limit;
1875 break;
1876 } // if
1877 } // while
1878 if ( status != 0 ) {
1879 start = pr->u.p.lb;
1880 incr = pr->u.p.st;
1881 if ( p_st != NULL )
1882 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001883 *p_lb = start + init * incr;
1884 *p_ub = start + limit * incr;
1885 if ( pr->ordered ) {
1886 pr->u.p.ordered_lower = init;
1887 pr->u.p.ordered_upper = limit;
1888 #ifdef KMP_DEBUG
1889 {
1890 const char * buff;
1891 // create format specifiers before the debug output
1892 buff = __kmp_str_format(
1893 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1894 traits_t< UT >::spec, traits_t< UT >::spec );
1895 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1896 __kmp_str_free( &buff );
1897 }
1898 #endif
1899 } // if
1900 } else {
1901 *p_lb = 0;
1902 *p_ub = 0;
1903 if ( p_st != NULL )
1904 *p_st = 0;
1905 } // if
1906 } // case
1907 break;
1908
1909 case kmp_sch_guided_analytical_chunked:
1910 {
1911 T chunkspec = pr->u.p.parm1;
1912 UT chunkIdx;
1913 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1914 /* for storing original FPCW value for Windows* OS on
1915 IA-32 architecture 8-byte version */
1916 unsigned int oldFpcw;
Jim Cownie181b4bb2013-12-23 17:28:57 +00001917 unsigned int fpcwSet = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001918 #endif
1919 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1920 gtid ) );
1921
1922 trip = pr->u.p.tc;
1923
1924 KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1925 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1926
1927 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1928 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1929 if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1930 --trip;
1931 /* use dynamic-style scheduling */
1932 init = chunkIdx * chunkspec + pr->u.p.count;
1933 /* need to verify init > 0 in case of overflow in the above calculation */
1934 if ( (status = (init > 0 && init <= trip)) != 0 ) {
1935 limit = init + chunkspec -1;
1936
1937 if ( (last = (limit >= trip)) != 0 )
1938 limit = trip;
1939 }
1940 break;
1941 } else {
1942 /* use exponential-style scheduling */
1943 /* The following check is to work around the lack of long double precision on Windows* OS.
1944 This check works around the possible effect that init != 0 for chunkIdx == 0.
1945 */
1946 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1947 /* If we haven't already done so, save original
1948 FPCW and set precision to 64-bit, as Windows* OS
1949 on IA-32 architecture defaults to 53-bit */
1950 if ( !fpcwSet ) {
Jim Cownie181b4bb2013-12-23 17:28:57 +00001951 oldFpcw = _control87(0,0);
1952 _control87(_PC_64,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001953 fpcwSet = 0x30000;
1954 }
1955 #endif
1956 if ( chunkIdx ) {
1957 init = __kmp_dispatch_guided_remaining< T >(
1958 trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1959 KMP_DEBUG_ASSERT(init);
1960 init = trip - init;
1961 } else
1962 init = 0;
1963 limit = trip - __kmp_dispatch_guided_remaining< T >(
1964 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1965 KMP_ASSERT(init <= limit);
1966 if ( init < limit ) {
1967 KMP_DEBUG_ASSERT(limit <= trip);
1968 --limit;
1969 status = 1;
1970 break;
1971 } // if
1972 } // if
1973 } // while (1)
1974 #if KMP_OS_WINDOWS && KMP_ARCH_X86
Jim Cownie181b4bb2013-12-23 17:28:57 +00001975 /* restore FPCW if necessary
1976 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1977 */
1978 if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1979 _control87(oldFpcw,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001980 #endif
1981 if ( status != 0 ) {
1982 start = pr->u.p.lb;
1983 incr = pr->u.p.st;
1984 if ( p_st != NULL )
1985 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001986 *p_lb = start + init * incr;
1987 *p_ub = start + limit * incr;
1988 if ( pr->ordered ) {
1989 pr->u.p.ordered_lower = init;
1990 pr->u.p.ordered_upper = limit;
1991 #ifdef KMP_DEBUG
1992 {
1993 const char * buff;
1994 // create format specifiers before the debug output
1995 buff = __kmp_str_format(
1996 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1997 traits_t< UT >::spec, traits_t< UT >::spec );
1998 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1999 __kmp_str_free( &buff );
2000 }
2001 #endif
2002 }
2003 } else {
2004 *p_lb = 0;
2005 *p_ub = 0;
2006 if ( p_st != NULL )
2007 *p_st = 0;
2008 }
2009 } // case
2010 break;
2011
2012 case kmp_sch_trapezoidal:
2013 {
2014 UT index;
2015 T parm2 = pr->u.p.parm2;
2016 T parm3 = pr->u.p.parm3;
2017 T parm4 = pr->u.p.parm4;
2018 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2019 gtid ) );
2020
2021 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2022
2023 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2024 trip = pr->u.p.tc - 1;
2025
2026 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2027 *p_lb = 0;
2028 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002029 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002030 } else {
2031 start = pr->u.p.lb;
2032 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2033 incr = pr->u.p.st;
2034
2035 if ( (last = (limit >= trip)) != 0 )
2036 limit = trip;
2037
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002038 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002039
2040 if ( incr == 1 ) {
2041 *p_lb = start + init;
2042 *p_ub = start + limit;
2043 } else {
2044 *p_lb = start + init * incr;
2045 *p_ub = start + limit * incr;
2046 }
2047
2048 if ( pr->ordered ) {
2049 pr->u.p.ordered_lower = init;
2050 pr->u.p.ordered_upper = limit;
2051 #ifdef KMP_DEBUG
2052 {
2053 const char * buff;
2054 // create format specifiers before the debug output
2055 buff = __kmp_str_format(
2056 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2057 traits_t< UT >::spec, traits_t< UT >::spec );
2058 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2059 __kmp_str_free( &buff );
2060 }
2061 #endif
2062 } // if
2063 } // if
2064 } // case
2065 break;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002066 default:
2067 {
2068 status = 0; // to avoid complaints on uninitialized variable use
2069 __kmp_msg(
2070 kmp_ms_fatal, // Severity
2071 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2072 KMP_HNT( GetNewerLibrary ), // Hint
2073 __kmp_msg_null // Variadic argument list terminator
2074 );
2075 }
2076 break;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002077 } // switch
2078 } // if tc == 0;
2079
2080 if ( status == 0 ) {
2081 UT num_done;
2082
2083 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2084 #ifdef KMP_DEBUG
2085 {
2086 const char * buff;
2087 // create format specifiers before the debug output
2088 buff = __kmp_str_format(
2089 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2090 traits_t< UT >::spec );
2091 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2092 __kmp_str_free( &buff );
2093 }
2094 #endif
2095
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002096 if ( (ST)num_done == team->t.t_nproc-1 ) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00002097 /* NOTE: release this buffer to be reused */
2098
2099 KMP_MB(); /* Flush all pending memory write invalidates. */
2100
2101 sh->u.s.num_done = 0;
2102 sh->u.s.iteration = 0;
2103
2104 /* TODO replace with general release procedure? */
2105 if ( pr->ordered ) {
2106 sh->u.s.ordered_iteration = 0;
2107 }
2108
2109 KMP_MB(); /* Flush all pending memory write invalidates. */
2110
2111 sh -> buffer_index += KMP_MAX_DISP_BUF;
2112 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2113 gtid, sh->buffer_index) );
2114
2115 KMP_MB(); /* Flush all pending memory write invalidates. */
2116
2117 } // if
2118 if ( __kmp_env_consistency_check ) {
2119 if ( pr->pushed_ws != ct_none ) {
2120 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2121 }
2122 }
2123
2124 th -> th.th_dispatch -> th_deo_fcn = NULL;
2125 th -> th.th_dispatch -> th_dxo_fcn = NULL;
2126 th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2127 th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2128 } // if (status == 0)
2129#if KMP_OS_WINDOWS
2130 else if ( last ) {
2131 pr->u.p.last_upper = pr->u.p.ub;
2132 }
2133#endif /* KMP_OS_WINDOWS */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002134 if ( p_last != NULL && status != 0 )
2135 *p_last = last;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002136 } // if
2137
2138 #ifdef KMP_DEBUG
2139 {
2140 const char * buff;
2141 // create format specifiers before the debug output
2142 buff = __kmp_str_format(
2143 "__kmp_dispatch_next: T#%%d normal case: " \
2144 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2145 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2146 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2147 __kmp_str_free( &buff );
2148 }
2149 #endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002150#if INCLUDE_SSC_MARKS
2151 SSC_MARK_DISPATCH_NEXT();
2152#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00002153 OMPT_LOOP_END;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002154 return status;
2155}
2156
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002157template< typename T >
2158static void
2159__kmp_dist_get_bounds(
2160 ident_t *loc,
2161 kmp_int32 gtid,
2162 kmp_int32 *plastiter,
2163 T *plower,
2164 T *pupper,
2165 typename traits_t< T >::signed_t incr
2166) {
2167 KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
2168 typedef typename traits_t< T >::unsigned_t UT;
2169 typedef typename traits_t< T >::signed_t ST;
2170 register kmp_uint32 team_id;
2171 register kmp_uint32 nteams;
2172 register UT trip_count;
2173 register kmp_team_t *team;
2174 kmp_info_t * th;
2175
2176 KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2177 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2178 #ifdef KMP_DEBUG
2179 {
2180 const char * buff;
2181 // create format specifiers before the debug output
2182 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2183 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2184 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2185 traits_t< T >::spec );
2186 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2187 __kmp_str_free( &buff );
2188 }
2189 #endif
2190
2191 if( __kmp_env_consistency_check ) {
2192 if( incr == 0 ) {
2193 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2194 }
2195 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2196 // The loop is illegal.
2197 // Some zero-trip loops maintained by compiler, e.g.:
2198 // for(i=10;i<0;++i) // lower >= upper - run-time check
2199 // for(i=0;i>10;--i) // lower <= upper - run-time check
2200 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2201 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2202 // Compiler does not check the following illegal loops:
2203 // for(i=0;i<10;i+=incr) // where incr<0
2204 // for(i=10;i>0;i-=incr) // where incr<0
2205 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2206 }
2207 }
2208 th = __kmp_threads[gtid];
2209 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2210 team = th->th.th_team;
2211 #if OMP_40_ENABLED
2212 nteams = th->th.th_teams_size.nteams;
2213 #endif
2214 team_id = team->t.t_master_tid;
2215 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2216
2217 // compute global trip count
2218 if( incr == 1 ) {
2219 trip_count = *pupper - *plower + 1;
2220 } else if(incr == -1) {
2221 trip_count = *plower - *pupper + 1;
2222 } else {
2223 trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2224 }
2225 if( trip_count <= nteams ) {
2226 KMP_DEBUG_ASSERT(
2227 __kmp_static == kmp_sch_static_greedy || \
2228 __kmp_static == kmp_sch_static_balanced
2229 ); // Unknown static scheduling type.
2230 // only some teams get single iteration, others get nothing
2231 if( team_id < trip_count ) {
2232 *pupper = *plower = *plower + team_id * incr;
2233 } else {
2234 *plower = *pupper + incr; // zero-trip loop
2235 }
2236 if( plastiter != NULL )
2237 *plastiter = ( team_id == trip_count - 1 );
2238 } else {
2239 if( __kmp_static == kmp_sch_static_balanced ) {
2240 register UT chunk = trip_count / nteams;
2241 register UT extras = trip_count % nteams;
2242 *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2243 *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2244 if( plastiter != NULL )
2245 *plastiter = ( team_id == nteams - 1 );
2246 } else {
2247 register T chunk_inc_count =
2248 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2249 register T upper = *pupper;
2250 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2251 // Unknown static scheduling type.
2252 *plower += team_id * chunk_inc_count;
2253 *pupper = *plower + chunk_inc_count - incr;
2254 // Check/correct bounds if needed
2255 if( incr > 0 ) {
2256 if( *pupper < *plower )
2257 *pupper = i_maxmin< T >::mx;
2258 if( plastiter != NULL )
2259 *plastiter = *plower <= upper && *pupper > upper - incr;
2260 if( *pupper > upper )
2261 *pupper = upper; // tracker C73258
2262 } else {
2263 if( *pupper > *plower )
2264 *pupper = i_maxmin< T >::mn;
2265 if( plastiter != NULL )
2266 *plastiter = *plower >= upper && *pupper < upper - incr;
2267 if( *pupper < upper )
2268 *pupper = upper; // tracker C73258
2269 }
2270 }
2271 }
2272}
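// A worked example of the kmp_sch_static_balanced split above (the numbers are
// illustrative assumptions, not taken from any particular run): with *plower = 0,
// *pupper = 9, incr = 1 and nteams = 4, trip_count = 10, so chunk = 10/4 = 2 and
// extras = 10%4 = 2. Teams 0 and 1 (team_id < extras) get chunk+1 = 3 iterations
// and teams 2 and 3 get chunk = 2, giving per-team bounds [0,2], [3,5], [6,7], [8,9];
// only team nteams-1 = 3 has *plastiter set.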
2273
Jim Cownie5e8470a2013-09-27 10:38:44 +00002274//-----------------------------------------------------------------------------------------
2275// Dispatch routines
2276// Transfer call to template< type T >
2277// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2278// T lb, T ub, ST st, ST chunk )
2279extern "C" {
2280
2281/*!
2282@ingroup WORK_SHARING
2283@{
2284@param loc Source location
2285@param gtid Global thread id
2286@param schedule Schedule type
2287@param lb Lower bound
2288@param ub Upper bound
2289@param st Step (or increment if you prefer)
2290@param chunk The chunk size to block with
2291
2292This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2293These functions are all identical apart from the types of the arguments.
2294*/
2295
2296void
2297__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2298 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2299{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002300 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002301 KMP_DEBUG_ASSERT( __kmp_init_serial );
2302 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2303}
2304/*!
2305See @ref __kmpc_dispatch_init_4
2306*/
2307void
2308__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2309 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2310{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002311 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002312 KMP_DEBUG_ASSERT( __kmp_init_serial );
2313 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2314}
2315
2316/*!
2317See @ref __kmpc_dispatch_init_4
2318*/
2319void
2320__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2321 kmp_int64 lb, kmp_int64 ub,
2322 kmp_int64 st, kmp_int64 chunk )
2323{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002324 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002325 KMP_DEBUG_ASSERT( __kmp_init_serial );
2326 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2327}
2328
2329/*!
2330See @ref __kmpc_dispatch_init_4
2331*/
2332void
2333__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2334 kmp_uint64 lb, kmp_uint64 ub,
2335 kmp_int64 st, kmp_int64 chunk )
2336{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002337 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002338 KMP_DEBUG_ASSERT( __kmp_init_serial );
2339 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2340}
2341
2342/*!
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002343See @ref __kmpc_dispatch_init_4
2344
2345These functions differ from the __kmpc_dispatch_init set in that they are called for
2346the composite distribute parallel for construct: the per-team iteration space is
2347computed before the regular iterations are dispatched (a call is sketched after the 8u variant below).
2348
2349These functions are all identical apart from the types of the arguments.
2350*/
2351void
2352__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2353 kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2354{
2355 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2356 KMP_DEBUG_ASSERT( __kmp_init_serial );
2357 __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2358 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2359}
2360
2361void
2362__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2363 kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2364{
2365 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2366 KMP_DEBUG_ASSERT( __kmp_init_serial );
2367 __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2368 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2369}
2370
2371void
2372__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2373 kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2374{
2375 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2376 KMP_DEBUG_ASSERT( __kmp_init_serial );
2377 __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2378 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2379}
2380
2381void
2382__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2383 kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2384{
2385 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2386 KMP_DEBUG_ASSERT( __kmp_init_serial );
2387 __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2388 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2389}
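// A minimal sketch (kept under "#if 0", not compiled) of how the 4-byte variant
// above might be invoked for the loop of a composite distribute parallel for with
// schedule(dynamic, 4). The function name, "loc", "gtid" and the bounds 0..99 are
// assumptions made up for illustration only.
#if 0
static void example_dist_dispatch( ident_t *loc, kmp_int32 gtid )
{
    kmp_int32 dist_last = 0;
    // First narrows [0,99] to this team's sub-range, then initializes regular
    // dynamic dispatching over that sub-range.
    __kmpc_dist_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked,
                                 &dist_last, 0, 99, 1, 4 );
}
#endif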
2390
2391/*!
Jim Cownie5e8470a2013-09-27 10:38:44 +00002392@param loc Source code location
2393@param gtid Global thread id
2394@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2395@param p_lb Pointer to the lower bound for the next chunk of work
2396@param p_ub Pointer to the upper bound for the next chunk of work
2397@param p_st Pointer to the stride for the next chunk of work
2398@return one if there is work to be done, zero otherwise
2399
2400Get the next dynamically allocated chunk of work for this thread.
2401If there is no more work, then the lb,ub and stride need not be modified.
2402*/
2403int
2404__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2405 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2406{
2407 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2408}
2409
2410/*!
2411See @ref __kmpc_dispatch_next_4
2412*/
2413int
2414__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2415 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2416{
2417 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2418}
2419
2420/*!
2421See @ref __kmpc_dispatch_next_4
2422*/
2423int
2424__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2425 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2426{
2427 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2428}
2429
2430/*!
2431See @ref __kmpc_dispatch_next_4
2432*/
2433int
2434__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2435 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2436{
2437 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2438}
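// A minimal usage sketch (kept under "#if 0", not compiled) of the init/next
// pattern documented above, roughly the call sequence a compiler emits for
// "#pragma omp for schedule(dynamic, 4)". The function name, "loc", "gtid",
// the bounds 0..99 and the empty body are assumptions made up for this example.
#if 0
static void example_dynamic_loop( ident_t *loc, kmp_int32 gtid )
{
    kmp_int32 last = 0, lb = 0, ub = 0, st = 0;
    __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked, 0, 99, 1, 4 );
    // Each successful call hands back one chunk as [lb, ub] with stride st.
    while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
        for ( kmp_int32 i = lb; i <= ub; i += st ) {
            ; // loop body
        }
    }
    // Ordered loops would additionally call __kmpc_dispatch_fini_4 per chunk (see below).
}
#endif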
2439
2440/*!
2441@param loc Source code location
2442@param gtid Global thread id
2443
2444Mark the end of a dynamic loop.
2445*/
2446void
2447__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2448{
2449 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2450}
2451
2452/*!
2453See @ref __kmpc_dispatch_fini_4
2454*/
2455void
2456__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2457{
2458 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2459}
2460
2461/*!
2462See @ref __kmpc_dispatch_fini_4
2463*/
2464void
2465__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2466{
2467 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2468}
2469
2470/*!
2471See @ref __kmpc_dispatch_fini_4
2472*/
2473void
2474__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2475{
2476 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2477}
2478/*! @} */
2479
2480//-----------------------------------------------------------------------------------------
2481//Non-template routines from kmp_dispatch.c used in other sources
2482
2483kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2484 return value == checker;
2485}
2486
2487kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2488 return value != checker;
2489}
2490
2491kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2492 return value < checker;
2493}
2494
2495kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2496 return value >= checker;
2497}
2498
2499kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2500 return value <= checker;
2501}
2502kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2503 return value == checker;
2504}
2505
2506kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2507 return value != checker;
2508}
2509
2510kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2511 return value < checker;
2512}
2513
2514kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2515 return value >= checker;
2516}
2517
2518kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2519 return value <= checker;
2520}
2521
2522kmp_uint32
2523__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2524 kmp_uint32 checker,
2525 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2526 , void * obj // Higher-level synchronization object, or NULL.
2527 )
2528{
2529 // note: we may not belong to a team at this point
2530 register volatile kmp_uint32 * spin = spinner;
2531 register kmp_uint32 check = checker;
2532 register kmp_uint32 spins;
2533 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2534 register kmp_uint32 r;
2535
2536 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2537 KMP_INIT_YIELD( spins );
2538 // main wait spin loop
2539 while(!f(r = TCR_4(*spin), check)) {
2540 KMP_FSYNC_SPIN_PREPARE( obj );
2541 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2542 It causes problems with infinite recursion because of exit lock */
2543 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2544 __kmp_abort_thread(); */
2545
Jim Cownie5e8470a2013-09-27 10:38:44 +00002546 /* if we have waited a bit, or are oversubscribed, yield */
2547 /* pause is in the following code */
2548 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2549 KMP_YIELD_SPIN( spins );
2550 }
2551 KMP_FSYNC_SPIN_ACQUIRED( obj );
2552 return r;
2553}
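// A minimal sketch (kept under "#if 0", not compiled) of the intended usage:
// spin on a 32-bit location until the predicate holds, yielding according to
// the library's policy. "example_flag" is a hypothetical variable invented for
// illustration, not a runtime field.
#if 0
static void example_wait( void )
{
    static volatile kmp_uint32 example_flag = 0;
    // Returns the value observed once __kmp_eq_4( value, 1 ) becomes true.
    kmp_uint32 seen = __kmp_wait_yield_4( &example_flag, 1, __kmp_eq_4, NULL );
    (void) seen;
}
#endif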
2554
2555kmp_uint64
2556__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2557 kmp_uint64 checker,
2558 kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2559 , void * obj // Higher-level synchronization object, or NULL.
2560 )
2561{
2562 // note: we may not belong to a team at this point
2563 register volatile kmp_uint64 * spin = spinner;
2564 register kmp_uint64 check = checker;
2565 register kmp_uint32 spins;
2566 register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2567 register kmp_uint64 r;
2568
2569 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2570 KMP_INIT_YIELD( spins );
2571 // main wait spin loop
2572 while(!f(r = *spin, check))
2573 {
2574 KMP_FSYNC_SPIN_PREPARE( obj );
2575 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2576 It causes problems with infinite recursion because of exit lock */
2577 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2578 __kmp_abort_thread(); */
2579
Jim Cownie5e8470a2013-09-27 10:38:44 +00002580 // if we are oversubscribed,
2581 // or have waited a bit (and KMP_LIBRARY=throughput), then yield
2582 // pause is in the following code
2583 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2584 KMP_YIELD_SPIN( spins );
2585 }
2586 KMP_FSYNC_SPIN_ACQUIRED( obj );
2587 return r;
2588}
2589
2590} // extern "C"
2591
2592#ifdef KMP_GOMP_COMPAT
2593
2594void
2595__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2596 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2597 kmp_int32 chunk, int push_ws )
2598{
2599 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2600 push_ws );
2601}
2602
2603void
2604__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2605 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2606 kmp_int32 chunk, int push_ws )
2607{
2608 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2609 push_ws );
2610}
2611
2612void
2613__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2614 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2615 kmp_int64 chunk, int push_ws )
2616{
2617 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2618 push_ws );
2619}
2620
2621void
2622__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2623 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2624 kmp_int64 chunk, int push_ws )
2625{
2626 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2627 push_ws );
2628}
2629
2630void
2631__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2632{
2633 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2634}
2635
2636void
2637__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2638{
2639 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2640}
2641
2642void
2643__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2644{
2645 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2646}
2647
2648void
2649__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2650{
2651 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2652}
2653
2654#endif /* KMP_GOMP_COMPAT */
2655
2656/* ------------------------------------------------------------------------ */
2657/* ------------------------------------------------------------------------ */
2658