/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */


//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


/*
 * Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop; however,
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 *
 */

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_str.h"
#include "kmp_error.h"
#include "kmp_stats.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
  #include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

// template for type limits
template< typename T >
struct i_maxmin {
    static const T mx;
    static const T mn;
};
template<>
struct i_maxmin< int > {
    static const int mx = 0x7fffffff;
    static const int mn = 0x80000000;
};
template<>
struct i_maxmin< unsigned int > {
    static const unsigned int mx = 0xffffffff;
    static const unsigned int mn = 0x00000000;
};
template<>
struct i_maxmin< long long > {
    static const long long mx = 0x7fffffffffffffffLL;
    static const long long mn = 0x8000000000000000LL;
};
template<>
struct i_maxmin< unsigned long long > {
    static const unsigned long long mx = 0xffffffffffffffffLL;
    static const unsigned long long mn = 0x0000000000000000LL;
};
//-------------------------------------------------------------------------

#ifdef KMP_STATIC_STEAL_ENABLED

    // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
    template< typename T >
    struct dispatch_private_infoXX_template {
        typedef typename traits_t< T >::unsigned_t UT;
        typedef typename traits_t< T >::signed_t   ST;
        UT count; // unsigned
        T  ub;
        /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
        T  lb;
        ST st; // signed
        UT tc; // unsigned
        T  static_steal_counter; // for static_steal only; maybe better to put after ub

        /* parm[1-4] are used in different ways by different scheduling algorithms */

        // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
        //    a) parm3 is properly aligned and
        //    b) all parm1-4 are in the same cache line.
        // Because parm1-4 are used together, performance seems to be better
        // if they are in the same line (not measured though).

        struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
            T  parm1;
            T  parm2;
            T  parm3;
            T  parm4;
        };

        UT ordered_lower; // unsigned
        UT ordered_upper; // unsigned
        #if KMP_OS_WINDOWS
        T  last_upper;
        #endif /* KMP_OS_WINDOWS */
    };

#else /* KMP_STATIC_STEAL_ENABLED */

    // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
    template< typename T >
    struct dispatch_private_infoXX_template {
        typedef typename traits_t< T >::unsigned_t UT;
        typedef typename traits_t< T >::signed_t   ST;
        T  lb;
        T  ub;
        ST st; // signed
        UT tc; // unsigned

        T  parm1;
        T  parm2;
        T  parm3;
        T  parm4;

        UT count; // unsigned

        UT ordered_lower; // unsigned
        UT ordered_upper; // unsigned
        #if KMP_OS_WINDOWS
        T  last_upper;
        #endif /* KMP_OS_WINDOWS */
    };

#endif /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info structure and dispatch_private_info_t type
template< typename T >
struct KMP_ALIGN_CACHE dispatch_private_info_template {
    // duplicate alignment here, otherwise size of structure is not correct in our compiler
    union KMP_ALIGN_CACHE private_info_tmpl {
        dispatch_private_infoXX_template< T > p;
        dispatch_private_info64_t             p64;
    } u;
    enum sched_type schedule;  /* scheduling algorithm */
    kmp_uint32      ordered;   /* ordered clause specified */
    kmp_uint32      ordered_bumped;
    kmp_int32       ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
    dispatch_private_info * next; /* stack of buffers for nest of serial regions */
    kmp_uint32      nomerge;   /* don't merge iters if serialized */
    kmp_uint32      type_size;
    enum cons_type  pushed_ws;
};
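
// Illustrative usage (mirroring __kmp_dispatch_init() below): the typed template view is simply
// overlaid on the generic per-thread dispatch buffer, which is why the layout above must stay
// compatible with the plain dispatch_private_info structure (KMP_BUILD_ASSERT-ed in
// __kmp_dispatch_init):
//
//     dispatch_private_info_template< T > * pr =
//         reinterpret_cast< dispatch_private_info_template< T >* >
//             ( th -> th.th_dispatch -> th_disp_buffer );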


// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
template< typename UT >
struct dispatch_shared_infoXX_template {
    /* chunk index under dynamic, number of idle threads under static-steal;
       iteration index otherwise */
    volatile UT     iteration;
    volatile UT     num_done;
    volatile UT     ordered_iteration;
    UT   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size making ordered_iteration scalar
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template< typename UT >
struct dispatch_shared_info_template {
    // we need union here to keep the structure size
    union shared_info_tmpl {
        dispatch_shared_infoXX_template< UT > s;
        dispatch_shared_info64_t              s64;
    } u;
    volatile kmp_uint32     buffer_index;
#if OMP_41_ENABLED
    volatile kmp_int32      doacross_buf_idx;  // teamwise index
    kmp_uint32             *doacross_flags;    // array of iteration flags (0/1)
    kmp_int32               doacross_num_done; // count finished threads
#endif
};

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_ADD32( p, d );
    return r;
}

template<>
__forceinline kmp_int64
test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_ADD64( p, d );
    return r;
}

// test_then_inc_acq template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC_ACQ32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC_ACQ64( p );
    return r;
}

// test_then_inc template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_inc( volatile T *p ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC64( p );
    return r;
}

// compare_and_swap template (general template should NOT be used)
template< typename T >
static __forceinline kmp_int32
compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
{
    return KMP_COMPARE_AND_STORE_REL32( p, c, s );
}

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
{
    return KMP_COMPARE_AND_STORE_REL64( p, c, s );
}

/*
    Spin wait loop that first does pause, then yield.
    Waits until function returns non-zero when called with *spinner and check.
    Does NOT put threads to sleep.
#if USE_ITT_BUILD
    Arguments:
        obj -- is higher-level synchronization object to report to ittnotify. It is used to report
            locks consistently. For example, if a lock is acquired immediately, its address is
            reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
            immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
            address, not an address of the low-level spinner.
#endif // USE_ITT_BUILD
*/
template< typename UT >
// ToDo: make inline function (move to header file for icl)
static UT  // unsigned 4- or 8-byte type
__kmp_wait_yield( volatile UT * spinner,
                  UT            checker,
                  kmp_uint32 (* pred)( UT, UT )
                  USE_ITT_BUILD_ARG(void * obj)   // Higher-level synchronization object, or NULL.
                  )
{
    // note: we may not belong to a team at this point
    register volatile UT * spin = spinner;
    register UT            check = checker;
    register kmp_uint32    spins;
    register kmp_uint32 (*f) ( UT, UT ) = pred;
    register UT            r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = *spin, check))
    {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        // if we are oversubscribed,
        // or have waited a bit (and KMP_LIBRARY=throughput), then yield
        // pause is in the following code
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}
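
// Usage sketch (illustrative, mirroring the call sites later in this file): wait until the
// team's shared buffer index reaches the value this thread expects, spinning/yielding but never
// sleeping:
//
//     __kmp_wait_yield< kmp_uint32 >( &sh->buffer_index, my_buffer_index,
//                                     __kmp_eq< kmp_uint32 >
//                                     USE_ITT_BUILD_ARG( NULL ) );
//
// The predicate argument (here __kmp_eq) decides when the wait is satisfied.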

template< typename UT >
static kmp_uint32 __kmp_eq( UT value, UT checker) {
    return value == checker;
}

template< typename UT >
static kmp_uint32 __kmp_neq( UT value, UT checker) {
    return value != checker;
}

template< typename UT >
static kmp_uint32 __kmp_lt( UT value, UT checker) {
    return value < checker;
}

template< typename UT >
static kmp_uint32 __kmp_ge( UT value, UT checker) {
    return value >= checker;
}

template< typename UT >
static kmp_uint32 __kmp_le( UT value, UT checker) {
    return value <= checker;
}


/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

static void
__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    kmp_info_t *th;

    KMP_DEBUG_ASSERT( gtid_ref );

    if ( __kmp_env_consistency_check ) {
        th = __kmp_threads[*gtid_ref];
        if ( th -> th.th_root -> r.r_active
          && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
#if KMP_USE_DYNAMIC_LOCK
            __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
#else
            __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
#endif
        }
    }
}

template< typename UT >
static void
__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    typedef typename traits_t< UT >::signed_t ST;
    dispatch_private_info_template< UT > * pr;

    int gtid = *gtid_ref;
//  int  cid = *cid_ref;
    kmp_info_t *th = __kmp_threads[ gtid ];
    KMP_DEBUG_ASSERT( th -> th.th_dispatch );

    KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
    if ( __kmp_env_consistency_check ) {
        pr = reinterpret_cast< dispatch_private_info_template< UT >* >
                ( th -> th.th_dispatch -> th_dispatch_pr_current );
        if ( pr -> pushed_ws != ct_none ) {
#if KMP_USE_DYNAMIC_LOCK
            __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
#else
            __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
#endif
        }
    }

    if ( ! th -> th.th_team -> t.t_serialized ) {
        dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_sh_current );
        UT lower;

        if ( ! __kmp_env_consistency_check ) {
            pr = reinterpret_cast< dispatch_private_info_template< UT >* >
                    ( th -> th.th_dispatch -> th_dispatch_pr_current );
        }
        lower = pr->u.p.ordered_lower;

        #if ! defined( KMP_GOMP_COMPAT )
        if ( __kmp_env_consistency_check ) {
            if ( pr->ordered_bumped ) {
                struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
                __kmp_error_construct2(
                    kmp_i18n_msg_CnsMultipleNesting,
                    ct_ordered_in_pdo, loc_ref,
                    & p->stack_data[ p->w_top ]
                );
            }
        }
        #endif /* !defined(KMP_GOMP_COMPAT) */

        KMP_MB();
        #ifdef KMP_DEBUG
        {
            const char * buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format(
                "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
                traits_t< UT >::spec, traits_t< UT >::spec );
            KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
            __kmp_str_free( &buff );
        }
        #endif

        __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
                                USE_ITT_BUILD_ARG( NULL )
                                );
        KMP_MB();  /* is this necessary? */
        #ifdef KMP_DEBUG
        {
            const char * buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format(
                "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
                traits_t< UT >::spec, traits_t< UT >::spec );
            KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
            __kmp_str_free( &buff );
        }
        #endif
    }
    KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
}

static void
__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    kmp_info_t *th;

    if ( __kmp_env_consistency_check ) {
        th = __kmp_threads[*gtid_ref];
        if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
            __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
        }
    }
}

template< typename UT >
static void
__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    typedef typename traits_t< UT >::signed_t ST;
    dispatch_private_info_template< UT > * pr;

    int gtid = *gtid_ref;
//  int  cid = *cid_ref;
    kmp_info_t *th = __kmp_threads[ gtid ];
    KMP_DEBUG_ASSERT( th -> th.th_dispatch );

    KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
    if ( __kmp_env_consistency_check ) {
        pr = reinterpret_cast< dispatch_private_info_template< UT >* >
                ( th -> th.th_dispatch -> th_dispatch_pr_current );
        if ( pr -> pushed_ws != ct_none ) {
            __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
        }
    }

    if ( ! th -> th.th_team -> t.t_serialized ) {
        dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_sh_current );

        if ( ! __kmp_env_consistency_check ) {
            pr = reinterpret_cast< dispatch_private_info_template< UT >* >
                    ( th -> th.th_dispatch -> th_dispatch_pr_current );
        }

        KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
        #if ! defined( KMP_GOMP_COMPAT )
        if ( __kmp_env_consistency_check ) {
            if ( pr->ordered_bumped != 0 ) {
                struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
                /* How to test it? - OM */
                __kmp_error_construct2(
                    kmp_i18n_msg_CnsMultipleNesting,
                    ct_ordered_in_pdo, loc_ref,
                    & p->stack_data[ p->w_top ]
                );
            }
        }
        #endif /* !defined(KMP_GOMP_COMPAT) */

        KMP_MB();       /* Flush all pending memory write invalidates.  */

        pr->ordered_bumped += 1;

        KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
                        gtid, pr->ordered_bumped ) );

        KMP_MB();       /* Flush all pending memory write invalidates.  */

        /* TODO use general release procedure? */
        test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );

        KMP_MB();       /* Flush all pending memory write invalidates.  */
    }
    KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
}

/* Computes and returns x to the power of y, where y must be a non-negative integer */
template< typename UT >
static __forceinline long double
__kmp_pow(long double x, UT y) {
    long double s = 1.0L;

    KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
    //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
    while(y) {
        if ( y & 1 )
            s *= x;
        x *= x;
        y >>= 1;
    }
    return s;
}
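
// Worked example (illustrative): __kmp_pow(0.875, 5) runs the exponentiation-by-squaring loop
// with y = 5 -> 2 -> 1, accumulating s = x * x^4 = 0.875^5 ~= 0.5129, i.e. O(log y)
// multiplications instead of y - 1.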

/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
   (the total number of unassigned iterations in chunks with index greater than or equal to idx).
   __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
*/
template< typename T >
static __inline typename traits_t< T >::unsigned_t
__kmp_dispatch_guided_remaining(
    T                                  tc,
    typename traits_t< T >::floating_t base,
    typename traits_t< T >::unsigned_t idx
) {
    /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
       least for ICL 8.1, long double arithmetic may not really have
       long double precision, even with /Qlong_double.  Currently, we
       workaround that in the caller code, by manipulating the FPCW for
       Windows* OS on IA-32 architecture.  The lack of precision is not
       expected to be a correctness issue, though.
    */
    typedef typename traits_t< T >::unsigned_t UT;

    long double x = tc * __kmp_pow< UT >(base, idx);
    UT r = (UT) x;
    if ( x == r )
        return r;
    return r + 1;
}
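
// Worked example (illustrative values): with tc = 1000 iterations, base = 0.875 (i.e.
// 1 - 1/(2*nproc) for nproc = 4) and idx = 5 chunks already handed out, the remaining count is
// ceil( 1000 * 0.875^5 ) = ceil( 512.9... ) = 513 unassigned iterations.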

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// by default n = 2. For example, with n = 3 the chunk distribution will be flatter.
// With n = 1 the first chunk is the same as for a static schedule, e.g. trip / nproc.
static int guided_int_param = 2;
static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
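
// Worked example (illustrative values): for nproc = 4 threads and chunk = 7, the init code below
// stores p2 = 2 * 4 * (7 + 1) = 64 in parm2 (switch to plain dynamic once fewer than 64
// iterations remain) and p3 = 0.5 / 4 = 0.125 in parm3 (each grab takes a fraction of the
// remaining iterations, subject to the chunk lower bound).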

// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
template< typename T >
static void
__kmp_dispatch_init(
    ident_t                        * loc,
    int                              gtid,
    enum sched_type                  schedule,
    T                                lb,
    T                                ub,
    typename traits_t< T >::signed_t st,
    typename traits_t< T >::signed_t chunk,
    int                              push_ws
) {
    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    typedef typename traits_t< T >::floating_t  DBL;
    static const int ___kmp_size_type = sizeof( UT );

    int    active;
    T      tc;
    kmp_info_t *   th;
    kmp_team_t *   team;
    kmp_uint32     my_buffer_index;
    dispatch_private_info_template< T >         * pr;
    dispatch_shared_info_template< UT > volatile * sh;

    KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
    KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );

    if ( ! TCR_4( __kmp_init_parallel ) )
        __kmp_parallel_initialize();

#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_INIT();
#endif
    #ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
            traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
        KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
        __kmp_str_free( &buff );
    }
    #endif
    /* setup data */
    th     = __kmp_threads[ gtid ];
    team   = th -> th.th_team;
    active = ! team -> t.t_serialized;
    th->th.th_ident = loc;

#if USE_ITT_BUILD
    kmp_uint64 cur_chunk = chunk;
    int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
        KMP_MASTER_GTID(gtid) &&
#if OMP_40_ENABLED
        th->th.th_teams_microtask == NULL &&
#endif
        team->t.t_active_level == 1;
#endif
    if ( ! active ) {
        pr = reinterpret_cast< dispatch_private_info_template< T >* >
            ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
    } else {
        KMP_DEBUG_ASSERT( th->th.th_dispatch ==
                &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );

        my_buffer_index = th->th.th_dispatch->th_disp_index ++;

        /* What happens when number of threads changes, need to resize buffer? */
        pr = reinterpret_cast< dispatch_private_info_template< T > * >
            ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] );
        sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
            ( &team -> t.t_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] );
    }

    /* Currently just ignore the monotonic and non-monotonic modifiers (the compiler isn't producing them
     * yet anyway).
     * When it is we'll want to look at them somewhere here and use that information to add to our
     * schedule choice. We shouldn't need to pass them on, they merely affect which schedule we can
     * legally choose for various dynamic cases. (In particular, whether or not a stealing scheme is legal).
     */
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

    /* Pick up the nomerge/ordered bits from the scheduling type */
    if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
        pr->nomerge = TRUE;
        schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
    } else {
        pr->nomerge = FALSE;
    }
    pr->type_size = ___kmp_size_type; // remember the size of variables
    if ( kmp_ord_lower & schedule ) {
        pr->ordered = TRUE;
        schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
    } else {
        pr->ordered = FALSE;
    }

    if ( schedule == kmp_sch_static ) {
        schedule = __kmp_static;
    } else {
        if ( schedule == kmp_sch_runtime ) {
            // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
            schedule = team -> t.t_sched.r_sched_type;
            // Detail the schedule if needed (global controls are differentiated appropriately)
            if ( schedule == kmp_sch_guided_chunked ) {
                schedule = __kmp_guided;
            } else if ( schedule == kmp_sch_static ) {
                schedule = __kmp_static;
            }
            // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
            chunk = team -> t.t_sched.chunk;
#if USE_ITT_BUILD
            cur_chunk = chunk;
#endif
            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
                    traits_t< ST >::spec );
                KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
                __kmp_str_free( &buff );
            }
            #endif
        } else {
            if ( schedule == kmp_sch_guided_chunked ) {
                schedule = __kmp_guided;
            }
            if ( chunk <= 0 ) {
                chunk = KMP_DEFAULT_CHUNK;
            }
        }

        if ( schedule == kmp_sch_auto ) {
            // mapping and differentiation: in the __kmp_do_serial_initialize()
            schedule = __kmp_auto;
            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
                    traits_t< ST >::spec );
                KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
                __kmp_str_free( &buff );
            }
            #endif
        }

        /* guided analytical not safe for too many threads */
        if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
            schedule = kmp_sch_guided_iterative_chunked;
            KMP_WARNING( DispatchManyThreads );
        }
        pr->u.p.parm1 = chunk;
    }
    KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
                "unknown scheduling type" );

    pr->u.p.count = 0;

    if ( __kmp_env_consistency_check ) {
        if ( st == 0 ) {
            __kmp_error_construct(
                kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
            );
        }
    }
    // compute trip count
    if ( st == 1 ) {   // most common case
        if ( ub >= lb ) {
            tc = ub - lb + 1;
        } else {   // ub < lb
            tc = 0;            // zero-trip
        }
    } else if ( st < 0 ) {
        if ( lb >= ub ) {
            // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
            //     where the division needs to be unsigned regardless of the result type
            tc = (UT)(lb - ub) / (-st) + 1;
        } else {   // lb < ub
            tc = 0;            // zero-trip
        }
    } else {       // st > 0
        if ( ub >= lb ) {
            // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
            //     where the division needs to be unsigned regardless of the result type
            tc = (UT)(ub - lb) / st + 1;
        } else {   // ub < lb
            tc = 0;            // zero-trip
        }
    }
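
    // Worked examples of the trip count computation above (illustrative values):
    //   lb = 0,  ub = 9, st = 3   ->  tc = (9 - 0) / 3 + 1 = 4   (iterations 0, 3, 6, 9)
    //   lb = 10, ub = 1, st = -2  ->  tc = (10 - 1) / 2 + 1 = 5  (iterations 10, 8, 6, 4, 2)
    //   lb = 5,  ub = 4, st = 1   ->  tc = 0                     (zero-trip loop)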

    // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing
    // when statistics are disabled.
    if (schedule == __kmp_static)
    {
        KMP_COUNT_BLOCK(OMP_FOR_static);
        KMP_COUNT_VALUE(FOR_static_iterations, tc);
    }
    else
    {
        KMP_COUNT_BLOCK(OMP_FOR_dynamic);
        KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
    }

    pr->u.p.lb = lb;
    pr->u.p.ub = ub;
    pr->u.p.st = st;
    pr->u.p.tc = tc;

    #if KMP_OS_WINDOWS
    pr->u.p.last_upper = ub + st;
    #endif /* KMP_OS_WINDOWS */

    /* NOTE: only the active parallel region(s) has active ordered sections */

    if ( active ) {
        if ( pr->ordered == 0 ) {
            th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
            th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
        } else {
            pr->ordered_bumped = 0;

            pr->u.p.ordered_lower = 1;
            pr->u.p.ordered_upper = 0;

            th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
            th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
        }
    }

    if ( __kmp_env_consistency_check ) {
        enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
        if ( push_ws ) {
            __kmp_push_workshare( gtid, ws, loc );
            pr->pushed_ws = ws;
        } else {
            __kmp_check_workshare( gtid, ws, loc );
            pr->pushed_ws = ct_none;
        }
    }

    switch ( schedule ) {
    #if  ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
    case kmp_sch_static_steal:
        {
            T nproc = team->t.t_nproc;
            T ntc, init;

            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );

            ntc = (tc % chunk ? 1 : 0) + tc / chunk;
            if ( nproc > 1 && ntc >= nproc ) {
                T id = __kmp_tid_from_gtid(gtid);
                T small_chunk, extras;

                small_chunk = ntc / nproc;
                extras = ntc % nproc;

                init = id * small_chunk + ( id < extras ? id : extras );
                pr->u.p.count = init;
                pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );

                pr->u.p.parm2 = lb;
                //pr->pfields.parm3 = 0; // it's not used in static_steal
                pr->u.p.parm4 = id;
                pr->u.p.st = st;
                break;
            } else {
                KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
                               gtid ) );
                schedule = kmp_sch_static_balanced;
                /* too few iterations: fall-through to kmp_sch_static_balanced */
            } // if
            /* FALL-THROUGH to static balanced */
        } // case
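    // Worked example of the static_steal split above (illustrative values): tc = 100 iterations,
    // chunk = 10 -> ntc = 10 chunks; with nproc = 4 threads, small_chunk = 2 and extras = 2, so
    // threads 0 and 1 start with 3 chunks each and threads 2 and 3 with 2 chunks each; count/ub
    // hold chunk indexes [init, init + size), and idle threads later steal from the tail of a
    // victim's range.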
    #endif
    case kmp_sch_static_balanced:
        {
            T nproc = team->t.t_nproc;
            T init, limit;

            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
                            gtid ) );

            if ( nproc > 1 ) {
                T id = __kmp_tid_from_gtid(gtid);

                if ( tc < nproc ) {
                    if ( id < tc ) {
                        init = id;
                        limit = id;
                        pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
                    } else {
                        pr->u.p.count = 1;  /* means no more chunks to execute */
                        pr->u.p.parm1 = FALSE;
                        break;
                    }
                } else {
                    T small_chunk = tc / nproc;
                    T extras = tc % nproc;
                    init = id * small_chunk + (id < extras ? id : extras);
                    limit = init + small_chunk - (id < extras ? 0 : 1);
                    pr->u.p.parm1 = (id == nproc - 1);
                }
            } else {
                if ( tc > 0 ) {
                    init = 0;
                    limit = tc - 1;
                    pr->u.p.parm1 = TRUE;
                } else {
                    // zero trip count
                    pr->u.p.count = 1;  /* means no more chunks to execute */
                    pr->u.p.parm1 = FALSE;
                    break;
                }
            }
#if USE_ITT_BUILD
            // Calculate chunk for metadata report
            if ( itt_need_metadata_reporting )
                cur_chunk = limit - init + 1;
#endif
            if ( st == 1 ) {
                pr->u.p.lb = lb + init;
                pr->u.p.ub = lb + limit;
            } else {
                T ub_tmp = lb + limit * st;   // calculated upper bound, "ub" is user-defined upper bound
                pr->u.p.lb = lb + init * st;
                // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
                if ( st > 0 ) {
                    pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
                } else {
                    pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
                }
            }
            if ( pr->ordered ) {
                pr->u.p.ordered_lower = init;
                pr->u.p.ordered_upper = limit;
            }
            break;
        } // case
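    // Worked example of the balanced split above (illustrative values): tc = 10, nproc = 4,
    // lb = 0, st = 1 -> small_chunk = 2, extras = 2, so the iteration ranges handed out are
    // thread 0: [0,2], thread 1: [3,5], thread 2: [6,7], thread 3: [8,9]; only thread
    // nproc - 1 = 3 gets parm1 (the lastprivate flag) set.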
    case kmp_sch_guided_iterative_chunked :
        {
            T nproc = team->t.t_nproc;
            KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));

            if ( nproc > 1 ) {
                if ( (2L * chunk + 1 ) * nproc >= tc ) {
                    /* chunk size too large, switch to dynamic */
                    schedule = kmp_sch_dynamic_chunked;
                } else {
                    // when remaining iters become less than parm2 - switch to dynamic
                    pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
                    *(double*)&pr->u.p.parm3 = guided_flt_param / nproc;    // may occupy parm3 and parm4
                }
            } else {
                KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
                schedule = kmp_sch_static_greedy;
                /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
                KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
                pr->u.p.parm1 = tc;
            } // if
        } // case
        break;
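    // Worked example of the guided-iterative setup above (illustrative values): tc = 1000,
    // chunk = 7, nproc = 4 -> (2*7 + 1) * 4 = 60 < 1000, so the guided path is kept with
    // parm2 = 2 * 4 * 8 = 64 and parm3 = 0.5 / 4 = 0.125; had tc been 50, the loop would have
    // been demoted to kmp_sch_dynamic_chunked.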
    case kmp_sch_guided_analytical_chunked:
        {
            T nproc = team->t.t_nproc;
            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));

            if ( nproc > 1 ) {
                if ( (2L * chunk + 1 ) * nproc >= tc ) {
                    /* chunk size too large, switch to dynamic */
                    schedule = kmp_sch_dynamic_chunked;
                } else {
                    /* commonly used term: (2 nproc - 1)/(2 nproc) */
                    DBL x;

                    #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* Linux* OS already has 64-bit computation by default for
                       long double, and on Windows* OS on Intel(R) 64,
                       /Qlong_double doesn't work.  On Windows* OS
                       on IA-32 architecture, we need to set precision to
                       64-bit instead of the default 53-bit. Even though long
                       double doesn't work on Windows* OS on Intel(R) 64, the
                       resulting lack of precision is not expected to impact
                       the correctness of the algorithm, but this has not been
                       mathematically proven.
                    */
                    // save original FPCW and set precision to 64-bit, as
                    // Windows* OS on IA-32 architecture defaults to 53-bit
                    unsigned int oldFpcw = _control87(0,0);
                    _control87(_PC_64,_MCW_PC); // 0,0x30000
                    #endif
                    /* value used for comparison in solver for cross-over point */
                    long double target = ((long double)chunk * 2 + 1) * nproc / tc;

                    /* crossover point--chunk indexes equal to or greater than
                       this point switch to dynamic-style scheduling */
                    UT cross;

                    /* commonly used term: (2 nproc - 1)/(2 nproc) */
                    x = (long double)1.0 - (long double)0.5 / nproc;

                    #ifdef KMP_DEBUG
                    { // test natural alignment
                        struct _test_a {
                            char a;
                            union {
                                char b;
                                DBL  d;
                            };
                        } t;
                        ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
                        //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
                        KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
                    }
                    #endif // KMP_DEBUG

                    /* save the term in thread private dispatch structure */
                    *(DBL*)&pr->u.p.parm3 = x;

                    /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
                    {
                        UT          left, right, mid;
                        long double p;

                        /* estimate initial upper and lower bound */

                        /* doesn't matter what value right is as long as it is positive, but
                           it affects performance of the solver
                        */
                        right = 229;
                        p = __kmp_pow< UT >(x,right);
                        if ( p > target ) {
                            do{
                                p *= p;
                                right <<= 1;
                            } while(p>target && right < (1<<27));
                            left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
                        } else {
                            left = 0;
                        }

                        /* bisection root-finding method */
                        while ( left + 1 < right ) {
                            mid = (left + right) / 2;
                            if ( __kmp_pow< UT >(x,mid) > target ) {
                                left = mid;
                            } else {
                                right = mid;
                            }
                        } // while
                        cross = right;
                    }
                    /* assert sanity of computed crossover point */
                    KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);

                    /* save the crossover point in thread private dispatch structure */
                    pr->u.p.parm2 = cross;

                    // C75803
                    #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
                        #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
                    #else
                        #define GUIDED_ANALYTICAL_WORKAROUND (x)
                    #endif
                    /* dynamic-style scheduling offset */
                    pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
                    #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    // restore FPCW
                    _control87(oldFpcw,_MCW_PC);
                    #endif
                } // if
            } else {
                KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
                               gtid ) );
                schedule = kmp_sch_static_greedy;
                /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
                pr->u.p.parm1 = tc;
            } // if
        } // case
        break;
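    // Worked example of the crossover solve above (illustrative values): nproc = 4, chunk = 7,
    // tc = 1000 give x = 1 - 0.5/4 = 0.875 and target = (2*7 + 1) * 4 / 1000 = 0.06; the bisection
    // finds the smallest i with 0.875^i <= 0.06, i.e. cross = 22, so chunk indexes >= 22 switch
    // to plain dynamic-style chunks of size 7.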
    case kmp_sch_static_greedy:
        KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
        pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
            ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
            tc;
        break;
    case kmp_sch_static_chunked :
    case kmp_sch_dynamic_chunked :
        if ( pr->u.p.parm1 <= 0 ) {
            pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
        }
        KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
        break;
    case kmp_sch_trapezoidal :
        {
            /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */

            T parm1, parm2, parm3, parm4;
            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );

            parm1 = chunk;

            /* F : size of the first cycle */
            parm2 = ( tc / (2 * team->t.t_nproc) );

            if ( parm2 < 1 ) {
                parm2 = 1;
            }

            /* L : size of the last cycle.  Make sure the last cycle
             *     is not larger than the first cycle.
             */
            if ( parm1 < 1 ) {
                parm1 = 1;
            } else if ( parm1 > parm2 ) {
                parm1 = parm2;
            }

            /* N : number of cycles */
            parm3 = ( parm2 + parm1 );
            parm3 = ( 2 * tc + parm3 - 1) / parm3;

            if ( parm3 < 2 ) {
                parm3 = 2;
            }

            /* sigma : decreasing incr of the trapezoid */
            parm4 = ( parm3 - 1 );
            parm4 = ( parm2 - parm1 ) / parm4;

            // pointless check, because parm4 >= 0 always
            //if ( parm4 < 0 ) {
            //    parm4 = 0;
            //}

            pr->u.p.parm1 = parm1;
            pr->u.p.parm2 = parm2;
            pr->u.p.parm3 = parm3;
            pr->u.p.parm4 = parm4;
        } // case
        break;
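    // Worked example of the trapezoid parameters above (illustrative values): tc = 1000,
    // nproc = 4, chunk = 1 -> first-cycle size parm2 = 1000 / 8 = 125, minimum (last) size
    // parm1 = 1, number of cycles parm3 = (2000 + 126 - 1) / 126 = 16, and decrement
    // parm4 = (125 - 1) / 15 = 8, so successive grabs shrink roughly 125, 117, 109, ... down
    // toward the minimum chunk size.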

    default:
        {
            __kmp_msg(
                kmp_ms_fatal,                        // Severity
                KMP_MSG( UnknownSchedTypeDetected ), // Primary message
                KMP_HNT( GetNewerLibrary ),          // Hint
                __kmp_msg_null                       // Variadic argument list terminator
            );
        }
        break;
    } // switch
    pr->schedule = schedule;
    if ( active ) {
        /* The name of this buffer should be my_buffer_index when it's free to use it */

        KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
                        gtid, my_buffer_index, sh->buffer_index) );
        __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
                                        USE_ITT_BUILD_ARG( NULL )
                                        );
            // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
            // *always* 32-bit integers.
        KMP_MB();  /* is this necessary? */
        KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
                        gtid, my_buffer_index, sh->buffer_index) );

        th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
        th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
#if USE_ITT_BUILD
        if ( pr->ordered ) {
            __kmp_itt_ordered_init( gtid );
        }; // if
        // Report loop metadata
        if ( itt_need_metadata_reporting ) {
            // Only report metadata by master of active team at level 1
            kmp_uint64 schedtype = 0;
            switch ( schedule ) {
            case kmp_sch_static_chunked:
            case kmp_sch_static_balanced:// Chunk is calculated in the switch above
                break;
            case kmp_sch_static_greedy:
                cur_chunk = pr->u.p.parm1;
                break;
            case kmp_sch_dynamic_chunked:
                schedtype = 1;
                break;
            case kmp_sch_guided_iterative_chunked:
            case kmp_sch_guided_analytical_chunked:
                schedtype = 2;
                break;
            default:
//            Should we put this case under "static"?
//            case kmp_sch_static_steal:
                schedtype = 3;
                break;
            }
            __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
        }
#endif /* USE_ITT_BUILD */
    }; // if

    #ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
            " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
            " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
            traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
            traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
            traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
            traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
        KD_TRACE(10, ( buff,
            gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
            pr->u.p.st, pr->u.p.tc, pr->u.p.count,
            pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
            pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
        __kmp_str_free( &buff );
    }
    #endif
    #if ( KMP_STATIC_STEAL_ENABLED )
    if ( ___kmp_size_type < 8 ) {
        // It cannot be guaranteed that after execution of a loop with some other schedule kind
        // all the parm3 variables will contain the same value.
        // Even if all parm3 values were the same, there would still be a bad case, like using
        // 0 and 1 rather than a program life-time increment.
        // So a dedicated variable is required. The 'static_steal_counter' is used.
        if( schedule == kmp_sch_static_steal ) {
            // Other threads will inspect this variable when searching for a victim.
            // This is a flag showing that other threads may steal from this thread since then.
            volatile T * p = &pr->u.p.static_steal_counter;
            *p = *p + 1;
        }
    }
    #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING )

#if OMPT_SUPPORT && OMPT_TRACE
    if (ompt_enabled &&
        ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
        ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
        ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
        ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
            team_info->parallel_id, task_info->task_id, team_info->microtask);
    }
#endif
}

/*
 * For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations.  If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed.
 */
template< typename UT >
static void
__kmp_dispatch_finish( int gtid, ident_t *loc )
{
    typedef typename traits_t< UT >::signed_t ST;
    kmp_info_t *th = __kmp_threads[ gtid ];

    KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
    if ( ! th -> th.th_team -> t.t_serialized ) {

        dispatch_private_info_template< UT > * pr =
            reinterpret_cast< dispatch_private_info_template< UT >* >
            ( th->th.th_dispatch->th_dispatch_pr_current );
        dispatch_shared_info_template< UT > volatile * sh =
            reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
            ( th->th.th_dispatch->th_dispatch_sh_current );
        KMP_DEBUG_ASSERT( pr );
        KMP_DEBUG_ASSERT( sh );
        KMP_DEBUG_ASSERT( th->th.th_dispatch ==
                &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );

        if ( pr->ordered_bumped ) {
            KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
                            gtid ) );
            pr->ordered_bumped = 0;
        } else {
            UT lower = pr->u.p.ordered_lower;

            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
                    traits_t< UT >::spec, traits_t< UT >::spec );
                KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
                __kmp_str_free( &buff );
            }
            #endif

            __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
                                   USE_ITT_BUILD_ARG(NULL)
                                   );
            KMP_MB();  /* is this necessary? */
            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
                    traits_t< UT >::spec, traits_t< UT >::spec );
                KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
                __kmp_str_free( &buff );
            }
            #endif

            test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
        } // if
    } // if
    KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
}

#ifdef KMP_GOMP_COMPAT

template< typename UT >
static void
__kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
{
    typedef typename traits_t< UT >::signed_t ST;
    kmp_info_t *th = __kmp_threads[ gtid ];

    KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
    if ( ! th -> th.th_team -> t.t_serialized ) {
//        int cid;
        dispatch_private_info_template< UT > * pr =
            reinterpret_cast< dispatch_private_info_template< UT >* >
            ( th->th.th_dispatch->th_dispatch_pr_current );
        dispatch_shared_info_template< UT > volatile * sh =
            reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
            ( th->th.th_dispatch->th_dispatch_sh_current );
        KMP_DEBUG_ASSERT( pr );
        KMP_DEBUG_ASSERT( sh );
        KMP_DEBUG_ASSERT( th->th.th_dispatch ==
                &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );

//        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
        UT lower = pr->u.p.ordered_lower;
        UT upper = pr->u.p.ordered_upper;
        UT inc = upper - lower + 1;

        if ( pr->ordered_bumped == inc ) {
            KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
                            gtid ) );
            pr->ordered_bumped = 0;
        } else {
            inc -= pr->ordered_bumped;

            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
                    "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
                    traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
                KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
                __kmp_str_free( &buff );
            }
            #endif

            __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
                                   USE_ITT_BUILD_ARG(NULL)
                                   );

            KMP_MB();  /* is this necessary? */
            KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
                            gtid ) );
            pr->ordered_bumped = 0;
//!!!!!      TODO check if the inc should be unsigned, or signed???
            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
                    "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
                    traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
                KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
                __kmp_str_free( &buff );
            }
            #endif

            test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
        }
//        }
    }
    KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
}

#endif /* KMP_GOMP_COMPAT */
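
// Worked example of the chunk-finish bookkeeping above (illustrative values): if the current
// chunk covers ordered iterations lower = 5 .. upper = 8, then inc starts at 4; if the thread
// already executed one ordered section (ordered_bumped == 1), the remaining inc = 3 is added to
// sh->u.s.ordered_iteration after waiting for it to reach lower, letting the next thread proceed.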
1392
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001393/* Define a macro for exiting __kmp_dispatch_next(). If status is 0
1394 * (no more work), then tell OMPT the loop is over. In some cases
1395 * kmp_dispatch_fini() is not called. */
1396#if OMPT_SUPPORT && OMPT_TRACE
1397#define OMPT_LOOP_END \
1398 if (status == 0) { \
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001399 if (ompt_enabled && \
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001400 ompt_callbacks.ompt_callback(ompt_event_loop_end)) { \
1401 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1402 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); \
1403 ompt_callbacks.ompt_callback(ompt_event_loop_end)( \
1404 team_info->parallel_id, task_info->task_id); \
1405 } \
1406 }
1407#else
1408#define OMPT_LOOP_END // no-op
1409#endif
1410
Jim Cownie5e8470a2013-09-27 10:38:44 +00001411template< typename T >
1412static int
1413__kmp_dispatch_next(
1414 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1415) {
1416
1417 typedef typename traits_t< T >::unsigned_t UT;
1418 typedef typename traits_t< T >::signed_t ST;
1419 typedef typename traits_t< T >::floating_t DBL;
Jonathan Peyton2321d572015-06-08 19:25:25 +00001420#if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001421 static const int ___kmp_size_type = sizeof( UT );
Jonathan Peyton2321d572015-06-08 19:25:25 +00001422#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +00001423
Jonathan Peyton45be4502015-08-11 21:36:41 +00001424 // This is potentially slightly misleading, schedule(runtime) will appear here even if the actual runtme schedule
1425 // is static. (Which points out a disadavantage of schedule(runtime): even when static scheduling is used it costs
1426 // more than a compile time choice to use static scheduling would.)
Jonathan Peyton11dc82f2016-05-05 16:15:57 +00001427 KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
Jonathan Peyton45be4502015-08-11 21:36:41 +00001428
Jim Cownie5e8470a2013-09-27 10:38:44 +00001429 int status;
1430 dispatch_private_info_template< T > * pr;
1431 kmp_info_t * th = __kmp_threads[ gtid ];
1432 kmp_team_t * team = th -> th.th_team;
1433
Andrey Churbanov9ad5c3a2015-07-13 17:52:41 +00001434 KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
Jim Cownie5e8470a2013-09-27 10:38:44 +00001435 #ifdef KMP_DEBUG
1436 {
1437 const char * buff;
1438 // create format specifiers before the debug output
1439 buff = __kmp_str_format(
1440 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1441 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1442 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1443 __kmp_str_free( &buff );
1444 }
1445 #endif
1446
1447 if ( team -> t.t_serialized ) {
1448 /* NOTE: serialize this dispatch becase we are not at the active level */
1449 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1450 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1451 KMP_DEBUG_ASSERT( pr );
1452
1453 if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1454 *p_lb = 0;
1455 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001456// if ( p_last != NULL )
1457// *p_last = 0;
1458 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001459 *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001460 if ( __kmp_env_consistency_check ) {
1461 if ( pr->pushed_ws != ct_none ) {
1462 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1463 }
1464 }
1465 } else if ( pr->nomerge ) {
1466 kmp_int32 last;
1467 T start;
1468 UT limit, trip, init;
1469 ST incr;
1470 T chunk = pr->u.p.parm1;
1471
1472 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1473
1474 init = chunk * pr->u.p.count++;
1475 trip = pr->u.p.tc - 1;
1476
1477 if ( (status = (init <= trip)) == 0 ) {
1478 *p_lb = 0;
1479 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001480// if ( p_last != NULL )
1481// *p_last = 0;
1482 if ( p_st != NULL )
1483 *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001484 if ( __kmp_env_consistency_check ) {
1485 if ( pr->pushed_ws != ct_none ) {
1486 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1487 }
1488 }
1489 } else {
1490 start = pr->u.p.lb;
1491 limit = chunk + init - 1;
1492 incr = pr->u.p.st;
1493
1494 if ( (last = (limit >= trip)) != 0 ) {
1495 limit = trip;
1496 #if KMP_OS_WINDOWS
1497 pr->u.p.last_upper = pr->u.p.ub;
1498 #endif /* KMP_OS_WINDOWS */
1499 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001500 if ( p_last != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001501 *p_last = last;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001502 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001503 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001504 if ( incr == 1 ) {
1505 *p_lb = start + init;
1506 *p_ub = start + limit;
1507 } else {
1508 *p_lb = start + init * incr;
1509 *p_ub = start + limit * incr;
1510 }
1511
1512 if ( pr->ordered ) {
1513 pr->u.p.ordered_lower = init;
1514 pr->u.p.ordered_upper = limit;
1515 #ifdef KMP_DEBUG
1516 {
1517 const char * buff;
1518 // create format specifiers before the debug output
1519 buff = __kmp_str_format(
1520 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1521 traits_t< UT >::spec, traits_t< UT >::spec );
1522 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1523 __kmp_str_free( &buff );
1524 }
1525 #endif
1526 } // if
1527 } // if
1528 } else {
1529 pr->u.p.tc = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001530 *p_lb = pr->u.p.lb;
1531 *p_ub = pr->u.p.ub;
1532 #if KMP_OS_WINDOWS
1533 pr->u.p.last_upper = *p_ub;
1534 #endif /* KMP_OS_WINDOWS */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001535 if ( p_last != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001536 *p_last = TRUE;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001537 if ( p_st != NULL )
1538 *p_st = pr->u.p.st;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001539 } // if
1540 #ifdef KMP_DEBUG
1541 {
1542 const char * buff;
1543 // create format specifiers before the debug output
1544 buff = __kmp_str_format(
1545 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001546 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
Jim Cownie5e8470a2013-09-27 10:38:44 +00001547 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001548 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
Jim Cownie5e8470a2013-09-27 10:38:44 +00001549 __kmp_str_free( &buff );
1550 }
1551 #endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001552#if INCLUDE_SSC_MARKS
1553 SSC_MARK_DISPATCH_NEXT();
1554#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001555 OMPT_LOOP_END;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001556 return status;
1557 } else {
1558 kmp_int32 last = 0;
1559 dispatch_shared_info_template< UT > *sh;
1560 T start;
1561 ST incr;
1562 UT limit, trip, init;
1563
1564 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1565 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1566
1567 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1568 ( th->th.th_dispatch->th_dispatch_pr_current );
1569 KMP_DEBUG_ASSERT( pr );
1570 sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1571 ( th->th.th_dispatch->th_dispatch_sh_current );
1572 KMP_DEBUG_ASSERT( sh );
1573
1574 if ( pr->u.p.tc == 0 ) {
1575 // zero trip count
1576 status = 0;
1577 } else {
1578 switch (pr->schedule) {
1579 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1580 case kmp_sch_static_steal:
1581 {
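                    // Descriptive summary of the code below (added note): each thread keeps a
                    // private (count, ub) pair describing its remaining range of chunks.  For
                    // 4-byte index types the pair is packed into one 64-bit word so that it can
                    // be read and updated with a single compare-and-swap; a thread that has
                    // exhausted its own range scans the other threads and tries to steal about
                    // a quarter of a victim's remaining chunks by atomically lowering the
                    // victim's ub.  For larger index types stealing is not implemented and each
                    // thread simply consumes its own range.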
1582 T chunk = pr->u.p.parm1;
1583
1584 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1585
1586 trip = pr->u.p.tc - 1;
1587
1588                    if ( traits_t< T >::type_size > 4 ) {
1589                        // Other threads do not look into the data of this thread,
1590                        // so no volatile cast is needed here.
1591 init = ( pr->u.p.count )++;
1592 status = ( init < (UT)pr->u.p.ub );
1593 } else {
1594 typedef union {
1595 struct {
1596 UT count;
1597 T ub;
1598 } p;
1599 kmp_int64 b;
1600 } union_i4;
1601 // All operations on 'count' or 'ub' must be combined atomically together.
1602 // stealing implemented only for 4-byte indexes
1603 {
1604 union_i4 vold, vnew;
1605 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1606 vnew = vold;
1607 vnew.p.count++;
1608 while( ! KMP_COMPARE_AND_STORE_ACQ64(
1609 ( volatile kmp_int64* )&pr->u.p.count,
1610 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1611 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1612 KMP_CPU_PAUSE();
1613 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1614 vnew = vold;
1615 vnew.p.count++;
1616 }
1617 vnew = vold;
1618 init = vnew.p.count;
1619 status = ( init < (UT)vnew.p.ub ) ;
1620 }
1621
1622 if( !status ) {
1623 kmp_info_t **other_threads = team->t.t_threads;
1624 int while_limit = 10;
1625 int while_index = 0;
1626
1627                            // TODO: the algorithm for choosing a victim
1628                            // should be cleaned up and measured
1629 while ( ( !status ) && ( while_limit != ++while_index ) ) {
1630 union_i4 vold, vnew;
1631 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1632 T victimIdx = pr->u.p.parm4;
1633 T oldVictimIdx = victimIdx;
1634 dispatch_private_info_template< T > * victim;
1635
1636 do {
1637 if( !victimIdx ) {
1638 victimIdx = team->t.t_nproc - 1;
1639 } else {
1640 --victimIdx;
1641 }
1642 victim = reinterpret_cast< dispatch_private_info_template< T >* >
1643 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1644 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1645 // TODO: think about a proper place of this test
1646 if ( ( !victim ) ||
1647 ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1648 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1649                                    // TODO: delay would be nice
1650                                    // The victim is not yet ready to participate in stealing
1651                                    // because it is still in __kmp_dispatch_init.
1652                                    continue;
1653 }
1654 if ( oldVictimIdx == victimIdx ) {
1655 break;
1656 }
1657 pr->u.p.parm4 = victimIdx;
1658
1659 while( 1 ) {
1660 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1661 vnew = vold;
1662
1663 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1664 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1665 break;
1666 }
1667 vnew.p.ub -= (remaining >> 2);
1668 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1669 #pragma warning( push )
1670 // disable warning on pointless comparison of unsigned with 0
1671 #pragma warning( disable: 186 )
1672 KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1673 #pragma warning( pop )
1674 // TODO: Should this be acquire or release?
1675 if ( KMP_COMPARE_AND_STORE_ACQ64(
1676 ( volatile kmp_int64 * )&victim->u.p.count,
1677 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1678 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1679 status = 1;
1680 while_index = 0;
1681 // now update own count and ub
1682 #if KMP_ARCH_X86
1683                                // Stealing is enabled on non-KMP_ARCH_X86 builds only,
1684                                // so this branch is not normally taken.  An atomic 64-bit
1685                                // write is unavailable on IA-32, so the update is done in
1686                                // steps.  This code is not tested.
1687 init = vold.p.count;
1688 pr->u.p.ub = 0;
1689 pr->u.p.count = init + 1;
1690 pr->u.p.ub = vnew.p.count;
1691 #else
1692 init = vnew.p.ub;
1693 vold.p.count = init + 1;
1694                                // TODO: is this safe and sufficient?
1695 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1696 #endif // KMP_ARCH_X86
1697 break;
1698 } // if
1699 KMP_CPU_PAUSE();
1700 } // while (1)
1701 } // while
1702 } // if
1703 } // if
1704 if ( !status ) {
1705 *p_lb = 0;
1706 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001707 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001708 } else {
1709 start = pr->u.p.parm2;
1710 init *= chunk;
1711 limit = chunk + init - 1;
1712 incr = pr->u.p.st;
1713
1714 KMP_DEBUG_ASSERT(init <= trip);
1715 if ( (last = (limit >= trip)) != 0 )
1716 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001717 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001718
1719 if ( incr == 1 ) {
1720 *p_lb = start + init;
1721 *p_ub = start + limit;
1722 } else {
1723 *p_lb = start + init * incr;
1724 *p_ub = start + limit * incr;
1725 }
1726
1727 if ( pr->ordered ) {
1728 pr->u.p.ordered_lower = init;
1729 pr->u.p.ordered_upper = limit;
1730 #ifdef KMP_DEBUG
1731 {
1732 const char * buff;
1733 // create format specifiers before the debug output
1734 buff = __kmp_str_format(
1735 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1736 traits_t< UT >::spec, traits_t< UT >::spec );
1737 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1738 __kmp_str_free( &buff );
1739 }
1740 #endif
1741 } // if
1742 } // if
1743 break;
1744 } // case
1745 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1746 case kmp_sch_static_balanced:
1747 {
1748 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1749 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1750 pr->u.p.count = 1;
1751 *p_lb = pr->u.p.lb;
1752 *p_ub = pr->u.p.ub;
1753 last = pr->u.p.parm1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001754 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001755 *p_st = pr->u.p.st;
1756 } else { /* no iterations to do */
1757 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1758 }
1759 if ( pr->ordered ) {
1760 #ifdef KMP_DEBUG
1761 {
1762 const char * buff;
1763 // create format specifiers before the debug output
1764 buff = __kmp_str_format(
1765 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1766 traits_t< UT >::spec, traits_t< UT >::spec );
1767 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1768 __kmp_str_free( &buff );
1769 }
1770 #endif
1771 } // if
1772 } // case
1773 break;
1774 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1775 case kmp_sch_static_chunked:
1776 {
1777 T parm1;
1778
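                    // Descriptive note (added): parm1 is the chunk size; init below selects
                    // chunk index (count + tid), and count is advanced by nproc after each
                    // chunk, giving a round-robin assignment of chunks to threads.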
1779 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1780 gtid ) );
1781 parm1 = pr->u.p.parm1;
1782
1783 trip = pr->u.p.tc - 1;
1784 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1785
1786 if ( (status = (init <= trip)) != 0 ) {
1787 start = pr->u.p.lb;
1788 incr = pr->u.p.st;
1789 limit = parm1 + init - 1;
1790
1791 if ( (last = (limit >= trip)) != 0 )
1792 limit = trip;
1793
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001794 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001795
1796 pr->u.p.count += team->t.t_nproc;
1797
1798 if ( incr == 1 ) {
1799 *p_lb = start + init;
1800 *p_ub = start + limit;
1801 }
1802 else {
1803 *p_lb = start + init * incr;
1804 *p_ub = start + limit * incr;
1805 }
1806
1807 if ( pr->ordered ) {
1808 pr->u.p.ordered_lower = init;
1809 pr->u.p.ordered_upper = limit;
1810 #ifdef KMP_DEBUG
1811 {
1812 const char * buff;
1813 // create format specifiers before the debug output
1814 buff = __kmp_str_format(
1815 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1816 traits_t< UT >::spec, traits_t< UT >::spec );
1817 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1818 __kmp_str_free( &buff );
1819 }
1820 #endif
1821 } // if
1822 } // if
1823 } // case
1824 break;
1825
1826 case kmp_sch_dynamic_chunked:
1827 {
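                    // Descriptive note (added): sh->u.s.iteration acts as a shared chunk
                    // counter; test_then_inc_acq atomically claims the next chunk index, and
                    // chunk * index is the first iteration of the claimed chunk.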
1828 T chunk = pr->u.p.parm1;
1829
1830 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1831 gtid ) );
1832
1833 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1834 trip = pr->u.p.tc - 1;
1835
1836 if ( (status = (init <= trip)) == 0 ) {
1837 *p_lb = 0;
1838 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001839 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001840 } else {
1841 start = pr->u.p.lb;
1842 limit = chunk + init - 1;
1843 incr = pr->u.p.st;
1844
1845 if ( (last = (limit >= trip)) != 0 )
1846 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001847
1848 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001849
1850 if ( incr == 1 ) {
1851 *p_lb = start + init;
1852 *p_ub = start + limit;
1853 } else {
1854 *p_lb = start + init * incr;
1855 *p_ub = start + limit * incr;
1856 }
1857
1858 if ( pr->ordered ) {
1859 pr->u.p.ordered_lower = init;
1860 pr->u.p.ordered_upper = limit;
1861 #ifdef KMP_DEBUG
1862 {
1863 const char * buff;
1864 // create format specifiers before the debug output
1865 buff = __kmp_str_format(
1866 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1867 traits_t< UT >::spec, traits_t< UT >::spec );
1868 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1869 __kmp_str_free( &buff );
1870 }
1871 #endif
1872 } // if
1873 } // if
1874 } // case
1875 break;
1876
1877 case kmp_sch_guided_iterative_chunked:
1878 {
1879 T chunkspec = pr->u.p.parm1;
1880 KD_TRACE(100,
1881 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1882 trip = pr->u.p.tc;
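                        // Descriptive note (added, inferred from the comments below): parm2 holds
                        // the threshold -- roughly K*nproc*(chunk+1), K=2 by default -- below which
                        // the schedule falls back to plain dynamic chunks of size chunkspec, and
                        // parm3 holds a precomputed double factor of about 1/(K*nproc) used to
                        // carve a guided-sized piece off the remaining iterations.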
1883 // Start atomic part of calculations
1884 while(1) {
1885 ST remaining; // signed, because can be < 0
1886 init = sh->u.s.iteration; // shared value
1887 remaining = trip - init;
1888 if ( remaining <= 0 ) { // AC: need to compare with 0 first
1889 // nothing to do, don't try atomic op
1890 status = 0;
1891 break;
1892 }
1893 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1894                            // use dynamic-style schedule
1895                            // atomically increment iterations, get old value
1896 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1897 remaining = trip - init;
1898 if (remaining <= 0) {
1899 status = 0; // all iterations got by other threads
1900 } else {
1901 // got some iterations to work on
1902 status = 1;
1903 if ( (T)remaining > chunkspec ) {
1904 limit = init + chunkspec - 1;
1905 } else {
1906 last = 1; // the last chunk
1907 limit = init + remaining - 1;
1908 } // if
1909 } // if
1910 break;
1911 } // if
1912 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1913 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1914 // CAS was successful, chunk obtained
1915 status = 1;
1916 --limit;
1917 break;
1918 } // if
1919 } // while
1920 if ( status != 0 ) {
1921 start = pr->u.p.lb;
1922 incr = pr->u.p.st;
1923 if ( p_st != NULL )
1924 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001925 *p_lb = start + init * incr;
1926 *p_ub = start + limit * incr;
1927 if ( pr->ordered ) {
1928 pr->u.p.ordered_lower = init;
1929 pr->u.p.ordered_upper = limit;
1930 #ifdef KMP_DEBUG
1931 {
1932 const char * buff;
1933 // create format specifiers before the debug output
1934 buff = __kmp_str_format(
1935 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1936 traits_t< UT >::spec, traits_t< UT >::spec );
1937 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1938 __kmp_str_free( &buff );
1939 }
1940 #endif
1941 } // if
1942 } else {
1943 *p_lb = 0;
1944 *p_ub = 0;
1945 if ( p_st != NULL )
1946 *p_st = 0;
1947 } // if
1948 } // case
1949 break;
1950
1951 case kmp_sch_guided_analytical_chunked:
1952 {
1953 T chunkspec = pr->u.p.parm1;
1954 UT chunkIdx;
1955 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1956 /* for storing original FPCW value for Windows* OS on
1957 IA-32 architecture 8-byte version */
1958 unsigned int oldFpcw;
Jim Cownie181b4bb2013-12-23 17:28:57 +00001959 unsigned int fpcwSet = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001960 #endif
1961 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1962 gtid ) );
1963
1964 trip = pr->u.p.tc;
1965
1966 KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1967 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1968
1969 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1970 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1971 if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1972 --trip;
1973 /* use dynamic-style scheduling */
1974 init = chunkIdx * chunkspec + pr->u.p.count;
1975 /* need to verify init > 0 in case of overflow in the above calculation */
1976 if ( (status = (init > 0 && init <= trip)) != 0 ) {
1977 limit = init + chunkspec -1;
1978
1979 if ( (last = (limit >= trip)) != 0 )
1980 limit = trip;
1981 }
1982 break;
1983 } else {
1984 /* use exponential-style scheduling */
1985                            /* The following check works around the lack of long double precision on
1986                               Windows* OS, which can have the effect that init != 0 for chunkIdx == 0.
1987                            */
1988 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1989 /* If we haven't already done so, save original
1990 FPCW and set precision to 64-bit, as Windows* OS
1991 on IA-32 architecture defaults to 53-bit */
1992 if ( !fpcwSet ) {
Jim Cownie181b4bb2013-12-23 17:28:57 +00001993 oldFpcw = _control87(0,0);
1994 _control87(_PC_64,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001995 fpcwSet = 0x30000;
1996 }
1997 #endif
1998 if ( chunkIdx ) {
1999 init = __kmp_dispatch_guided_remaining< T >(
2000 trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
2001 KMP_DEBUG_ASSERT(init);
2002 init = trip - init;
2003 } else
2004 init = 0;
2005 limit = trip - __kmp_dispatch_guided_remaining< T >(
2006 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
2007 KMP_ASSERT(init <= limit);
2008 if ( init < limit ) {
2009 KMP_DEBUG_ASSERT(limit <= trip);
2010 --limit;
2011 status = 1;
2012 break;
2013 } // if
2014 } // if
2015 } // while (1)
2016 #if KMP_OS_WINDOWS && KMP_ARCH_X86
Jim Cownie181b4bb2013-12-23 17:28:57 +00002017 /* restore FPCW if necessary
2018 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
2019 */
2020 if ( fpcwSet && ( oldFpcw & fpcwSet ) )
2021 _control87(oldFpcw,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002022 #endif
2023 if ( status != 0 ) {
2024 start = pr->u.p.lb;
2025 incr = pr->u.p.st;
2026 if ( p_st != NULL )
2027 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002028 *p_lb = start + init * incr;
2029 *p_ub = start + limit * incr;
2030 if ( pr->ordered ) {
2031 pr->u.p.ordered_lower = init;
2032 pr->u.p.ordered_upper = limit;
2033 #ifdef KMP_DEBUG
2034 {
2035 const char * buff;
2036 // create format specifiers before the debug output
2037 buff = __kmp_str_format(
2038 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2039 traits_t< UT >::spec, traits_t< UT >::spec );
2040 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2041 __kmp_str_free( &buff );
2042 }
2043 #endif
2044 }
2045 } else {
2046 *p_lb = 0;
2047 *p_ub = 0;
2048 if ( p_st != NULL )
2049 *p_st = 0;
2050 }
2051 } // case
2052 break;
2053
2054 case kmp_sch_trapezoidal:
2055 {
2056 UT index;
2057 T parm2 = pr->u.p.parm2;
2058 T parm3 = pr->u.p.parm3;
2059 T parm4 = pr->u.p.parm4;
2060 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2061 gtid ) );
2062
2063 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2064
2065 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2066 trip = pr->u.p.tc - 1;
2067
2068 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2069 *p_lb = 0;
2070 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002071 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002072 } else {
2073 start = pr->u.p.lb;
2074 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2075 incr = pr->u.p.st;
2076
2077 if ( (last = (limit >= trip)) != 0 )
2078 limit = trip;
2079
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002080 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002081
2082 if ( incr == 1 ) {
2083 *p_lb = start + init;
2084 *p_ub = start + limit;
2085 } else {
2086 *p_lb = start + init * incr;
2087 *p_ub = start + limit * incr;
2088 }
2089
2090 if ( pr->ordered ) {
2091 pr->u.p.ordered_lower = init;
2092 pr->u.p.ordered_upper = limit;
2093 #ifdef KMP_DEBUG
2094 {
2095 const char * buff;
2096 // create format specifiers before the debug output
2097 buff = __kmp_str_format(
2098 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2099 traits_t< UT >::spec, traits_t< UT >::spec );
2100 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2101 __kmp_str_free( &buff );
2102 }
2103 #endif
2104 } // if
2105 } // if
2106 } // case
2107 break;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002108 default:
2109 {
2110 status = 0; // to avoid complaints on uninitialized variable use
2111 __kmp_msg(
2112 kmp_ms_fatal, // Severity
2113 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2114 KMP_HNT( GetNewerLibrary ), // Hint
2115 __kmp_msg_null // Variadic argument list terminator
2116 );
2117 }
2118 break;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002119 } // switch
2120 } // if tc == 0;
2121
2122 if ( status == 0 ) {
2123 UT num_done;
2124
2125 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2126 #ifdef KMP_DEBUG
2127 {
2128 const char * buff;
2129 // create format specifiers before the debug output
2130 buff = __kmp_str_format(
2131 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2132 traits_t< UT >::spec );
2133 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2134 __kmp_str_free( &buff );
2135 }
2136 #endif
2137
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002138 if ( (ST)num_done == team->t.t_nproc-1 ) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00002139 /* NOTE: release this buffer to be reused */
2140
2141 KMP_MB(); /* Flush all pending memory write invalidates. */
2142
2143 sh->u.s.num_done = 0;
2144 sh->u.s.iteration = 0;
2145
2146 /* TODO replace with general release procedure? */
2147 if ( pr->ordered ) {
2148 sh->u.s.ordered_iteration = 0;
2149 }
2150
2151 KMP_MB(); /* Flush all pending memory write invalidates. */
2152
Jonathan Peyton067325f2016-05-31 19:01:15 +00002153 sh -> buffer_index += __kmp_dispatch_num_buffers;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002154 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2155 gtid, sh->buffer_index) );
2156
2157 KMP_MB(); /* Flush all pending memory write invalidates. */
2158
2159 } // if
2160 if ( __kmp_env_consistency_check ) {
2161 if ( pr->pushed_ws != ct_none ) {
2162 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2163 }
2164 }
2165
2166 th -> th.th_dispatch -> th_deo_fcn = NULL;
2167 th -> th.th_dispatch -> th_dxo_fcn = NULL;
2168 th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2169 th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2170 } // if (status == 0)
2171#if KMP_OS_WINDOWS
2172 else if ( last ) {
2173 pr->u.p.last_upper = pr->u.p.ub;
2174 }
2175#endif /* KMP_OS_WINDOWS */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002176 if ( p_last != NULL && status != 0 )
2177 *p_last = last;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002178 } // if
2179
2180 #ifdef KMP_DEBUG
2181 {
2182 const char * buff;
2183 // create format specifiers before the debug output
2184 buff = __kmp_str_format(
2185 "__kmp_dispatch_next: T#%%d normal case: " \
2186 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2187 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2188 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2189 __kmp_str_free( &buff );
2190 }
2191 #endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002192#if INCLUDE_SSC_MARKS
2193 SSC_MARK_DISPATCH_NEXT();
2194#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00002195 OMPT_LOOP_END;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002196 return status;
2197}
2198
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002199template< typename T >
2200static void
2201__kmp_dist_get_bounds(
2202 ident_t *loc,
2203 kmp_int32 gtid,
2204 kmp_int32 *plastiter,
2205 T *plower,
2206 T *pupper,
2207 typename traits_t< T >::signed_t incr
2208) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002209 typedef typename traits_t< T >::unsigned_t UT;
2210 typedef typename traits_t< T >::signed_t ST;
2211 register kmp_uint32 team_id;
2212 register kmp_uint32 nteams;
2213 register UT trip_count;
2214 register kmp_team_t *team;
2215 kmp_info_t * th;
2216
2217 KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2218 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2219 #ifdef KMP_DEBUG
2220 {
2221 const char * buff;
2222 // create format specifiers before the debug output
2223 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2224 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2225 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2226 traits_t< T >::spec );
2227 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2228 __kmp_str_free( &buff );
2229 }
2230 #endif
2231
2232 if( __kmp_env_consistency_check ) {
2233 if( incr == 0 ) {
2234 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2235 }
2236 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2237 // The loop is illegal.
2238            // Some zero-trip loops are maintained by the compiler, e.g.:
2239 // for(i=10;i<0;++i) // lower >= upper - run-time check
2240 // for(i=0;i>10;--i) // lower <= upper - run-time check
2241 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2242 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2243 // Compiler does not check the following illegal loops:
2244 // for(i=0;i<10;i+=incr) // where incr<0
2245 // for(i=10;i>0;i-=incr) // where incr<0
2246 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2247 }
2248 }
2249 th = __kmp_threads[gtid];
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002250 team = th->th.th_team;
2251 #if OMP_40_ENABLED
Jonathan Peyton441f3372015-09-21 17:24:46 +00002252 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002253 nteams = th->th.th_teams_size.nteams;
2254 #endif
2255 team_id = team->t.t_master_tid;
2256 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2257
2258 // compute global trip count
2259 if( incr == 1 ) {
2260 trip_count = *pupper - *plower + 1;
2261 } else if(incr == -1) {
2262 trip_count = *plower - *pupper + 1;
Jonathan Peyton5235a1b2016-04-18 21:38:29 +00002263 } else if ( incr > 0 ) {
2264 // upper-lower can exceed the limit of signed type
2265 trip_count = (UT)(*pupper - *plower) / incr + 1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002266 } else {
Jonathan Peyton5235a1b2016-04-18 21:38:29 +00002267 trip_count = (UT)(*plower - *pupper) / ( -incr ) + 1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002268 }
Jonathan Peyton45be4502015-08-11 21:36:41 +00002269
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002270 if( trip_count <= nteams ) {
2271 KMP_DEBUG_ASSERT(
2272 __kmp_static == kmp_sch_static_greedy || \
2273 __kmp_static == kmp_sch_static_balanced
2274 ); // Unknown static scheduling type.
2275 // only some teams get single iteration, others get nothing
2276 if( team_id < trip_count ) {
2277 *pupper = *plower = *plower + team_id * incr;
2278 } else {
2279 *plower = *pupper + incr; // zero-trip loop
2280 }
2281 if( plastiter != NULL )
2282 *plastiter = ( team_id == trip_count - 1 );
2283 } else {
2284 if( __kmp_static == kmp_sch_static_balanced ) {
2285 register UT chunk = trip_count / nteams;
2286 register UT extras = trip_count % nteams;
2287 *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2288 *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2289 if( plastiter != NULL )
2290 *plastiter = ( team_id == nteams - 1 );
2291 } else {
2292 register T chunk_inc_count =
2293 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2294 register T upper = *pupper;
2295 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2296 // Unknown static scheduling type.
2297 *plower += team_id * chunk_inc_count;
2298 *pupper = *plower + chunk_inc_count - incr;
2299 // Check/correct bounds if needed
2300 if( incr > 0 ) {
2301 if( *pupper < *plower )
2302 *pupper = i_maxmin< T >::mx;
2303 if( plastiter != NULL )
2304 *plastiter = *plower <= upper && *pupper > upper - incr;
2305 if( *pupper > upper )
2306 *pupper = upper; // tracker C73258
2307 } else {
2308 if( *pupper > *plower )
2309 *pupper = i_maxmin< T >::mn;
2310 if( plastiter != NULL )
2311 *plastiter = *plower >= upper && *pupper < upper - incr;
2312 if( *pupper < upper )
2313 *pupper = upper; // tracker C73258
2314 }
2315 }
2316 }
2317}
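// Worked example (illustrative only) of the static_balanced split above:
// trip_count = 10, nteams = 4, incr = 1, *plower = 0 on entry.
//   chunk = 10 / 4 = 2, extras = 10 % 4 = 2
//   team 0: lower = 0*2 + 0 = 0,  upper = 0 + 2 - 0 = 2   -> iterations 0..2
//   team 1: lower = 1*2 + 1 = 3,  upper = 3 + 2 - 0 = 5   -> iterations 3..5
//   team 2: lower = 2*2 + 2 = 6,  upper = 6 + 2 - 1 = 7   -> iterations 6..7
//   team 3: lower = 3*2 + 2 = 8,  upper = 8 + 2 - 1 = 9   -> iterations 8..9
// The first 'extras' teams receive one extra iteration, and only the last team
// (team_id == nteams - 1) sees *plastiter set.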
2318
Jim Cownie5e8470a2013-09-27 10:38:44 +00002319//-----------------------------------------------------------------------------------------
2320// Dispatch routines
2321// Transfer call to template< type T >
2322// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2323// T lb, T ub, ST st, ST chunk )
2324extern "C" {
2325
2326/*!
2327@ingroup WORK_SHARING
2328@{
2329@param loc Source location
2330@param gtid Global thread id
2331@param schedule Schedule type
2332@param lb Lower bound
2333@param ub Upper bound
2334@param st Step (or increment if you prefer)
2335@param chunk The chunk size to block with
2336
2337This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2338These functions are all identical apart from the types of the arguments.
2339*/
2340
2341void
2342__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2343 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2344{
2345 KMP_DEBUG_ASSERT( __kmp_init_serial );
2346 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2347}
2348/*!
2349See @ref __kmpc_dispatch_init_4
2350*/
2351void
2352__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2353 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2354{
2355 KMP_DEBUG_ASSERT( __kmp_init_serial );
2356 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2357}
2358
2359/*!
2360See @ref __kmpc_dispatch_init_4
2361*/
2362void
2363__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2364 kmp_int64 lb, kmp_int64 ub,
2365 kmp_int64 st, kmp_int64 chunk )
2366{
2367 KMP_DEBUG_ASSERT( __kmp_init_serial );
2368 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2369}
2370
2371/*!
2372See @ref __kmpc_dispatch_init_4
2373*/
2374void
2375__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2376 kmp_uint64 lb, kmp_uint64 ub,
2377 kmp_int64 st, kmp_int64 chunk )
2378{
2379 KMP_DEBUG_ASSERT( __kmp_init_serial );
2380 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2381}
2382
2383/*!
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002384See @ref __kmpc_dispatch_init_4
2385
2386These functions differ from the __kmpc_dispatch_init set in that they are called
2387for the composite distribute parallel for construct. Thus, before dispatching the
2388regular iterations, the per-team iteration space must be computed.
2389
2390These functions are all identical apart from the types of the arguments.
2391*/
2392void
2393__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2394 kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2395{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002396 KMP_DEBUG_ASSERT( __kmp_init_serial );
2397 __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2398 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2399}
2400
2401void
2402__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2403 kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2404{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002405 KMP_DEBUG_ASSERT( __kmp_init_serial );
2406 __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2407 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2408}
2409
2410void
2411__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2412 kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2413{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002414 KMP_DEBUG_ASSERT( __kmp_init_serial );
2415 __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2416 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2417}
2418
2419void
2420__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2421 kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2422{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002423 KMP_DEBUG_ASSERT( __kmp_init_serial );
2424 __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2425 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2426}
2427
2428/*!
Jim Cownie5e8470a2013-09-27 10:38:44 +00002429@param loc Source code location
2430@param gtid Global thread id
2431@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2432@param p_lb Pointer to the lower bound for the next chunk of work
2433@param p_ub Pointer to the upper bound for the next chunk of work
2434@param p_st Pointer to the stride for the next chunk of work
2435@return one if there is work to be done, zero otherwise
2436
2437Get the next dynamically allocated chunk of work for this thread.
2438If there is no more work, then lb, ub and stride need not be modified.
2439*/
2440int
2441__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2442 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2443{
2444 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2445}
2446
2447/*!
2448See @ref __kmpc_dispatch_next_4
2449*/
2450int
2451__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2452 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2453{
2454 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2455}
2456
2457/*!
2458See @ref __kmpc_dispatch_next_4
2459*/
2460int
2461__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2462 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2463{
2464 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2465}
2466
2467/*!
2468See @ref __kmpc_dispatch_next_4
2469*/
2470int
2471__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2472 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2473{
2474 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2475}
2476
2477/*!
2478@param loc Source code location
2479@param gtid Global thread id
2480
2481Mark the end of a dynamic loop.
2482*/
2483void
2484__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2485{
2486 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2487}
2488
2489/*!
2490See @ref __kmpc_dispatch_fini_4
2491*/
2492void
2493__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2494{
2495 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2496}
2497
2498/*!
2499See @ref __kmpc_dispatch_fini_4
2500*/
2501void
2502__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2503{
2504 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2505}
2506
2507/*!
2508See @ref __kmpc_dispatch_fini_4
2509*/
2510void
2511__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2512{
2513 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2514}
2515/*! @} */
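
/*
   Illustrative sketch only (not part of the library): the dispatch entry points
   documented above are intended to be driven by compiler-generated code.  For a
   loop such as
       #pragma omp for schedule(dynamic, 4)
       for ( i = lo; i <= hi; i++ ) body( i );
   the caller is expected to do roughly the following.  The names example_loc and
   example_body are hypothetical, and the exact bounds, stride and chunk arguments
   are the compiler's responsibility.
*/
#if 0
static void
example_dispatch_driver( ident_t *example_loc, kmp_int32 gtid,
                         kmp_int32 lo, kmp_int32 hi,
                         void (*example_body)( kmp_int32 ) )
{
    kmp_int32 lb, ub, st, last;
    __kmpc_dispatch_init_4( example_loc, gtid, kmp_sch_dynamic_chunked,
                            lo, hi, 1, 4 );
    while ( __kmpc_dispatch_next_4( example_loc, gtid, &last, &lb, &ub, &st ) ) {
        for ( kmp_int32 i = lb; i <= ub; i += st )
            example_body( i );
    }
}
#endif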
2516
2517//-----------------------------------------------------------------------------------------
2518// Non-template routines from kmp_dispatch.cpp used in other sources
2519
2520kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2521 return value == checker;
2522}
2523
2524kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2525 return value != checker;
2526}
2527
2528kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2529 return value < checker;
2530}
2531
2532kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2533 return value >= checker;
2534}
2535
2536kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2537 return value <= checker;
2538}
Jim Cownie5e8470a2013-09-27 10:38:44 +00002539
2540kmp_uint32
2541__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2542 kmp_uint32 checker,
2543 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2544 , void * obj // Higher-level synchronization object, or NULL.
2545 )
2546{
2547 // note: we may not belong to a team at this point
2548 register volatile kmp_uint32 * spin = spinner;
2549 register kmp_uint32 check = checker;
2550 register kmp_uint32 spins;
2551 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2552 register kmp_uint32 r;
2553
2554 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2555 KMP_INIT_YIELD( spins );
2556 // main wait spin loop
2557 while(!f(r = TCR_4(*spin), check)) {
2558 KMP_FSYNC_SPIN_PREPARE( obj );
2559 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2560 It causes problems with infinite recursion because of exit lock */
2561 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2562 __kmp_abort_thread(); */
2563
Jim Cownie5e8470a2013-09-27 10:38:44 +00002564 /* if we have waited a bit, or are oversubscribed, yield */
2565 /* pause is in the following code */
2566 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2567 KMP_YIELD_SPIN( spins );
2568 }
2569 KMP_FSYNC_SPIN_ACQUIRED( obj );
2570 return r;
2571}
2572
Paul Osmialowskif7cc6af2016-05-31 20:20:32 +00002573void
2574__kmp_wait_yield_4_ptr(void *spinner,
2575 kmp_uint32 checker,
2576 kmp_uint32 (*pred)( void *, kmp_uint32 ),
2577 void *obj // Higher-level synchronization object, or NULL.
2578 )
2579{
2580 // note: we may not belong to a team at this point
2581 register void *spin = spinner;
2582 register kmp_uint32 check = checker;
2583 register kmp_uint32 spins;
2584 register kmp_uint32 (*f) ( void *, kmp_uint32 ) = pred;
2585
2586 KMP_FSYNC_SPIN_INIT( obj, spin );
2587 KMP_INIT_YIELD( spins );
2588 // main wait spin loop
2589 while ( !f( spin, check ) ) {
2590 KMP_FSYNC_SPIN_PREPARE( obj );
2591 /* if we have waited a bit, or are oversubscribed, yield */
2592 /* pause is in the following code */
2593 KMP_YIELD( TCR_4( __kmp_nth ) > __kmp_avail_proc );
2594 KMP_YIELD_SPIN( spins );
2595 }
2596 KMP_FSYNC_SPIN_ACQUIRED( obj );
2597}
2598
Jim Cownie5e8470a2013-09-27 10:38:44 +00002599} // extern "C"
2600
2601#ifdef KMP_GOMP_COMPAT
2602
2603void
2604__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2605 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2606 kmp_int32 chunk, int push_ws )
2607{
2608 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2609 push_ws );
2610}
2611
2612void
2613__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2614 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2615 kmp_int32 chunk, int push_ws )
2616{
2617 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2618 push_ws );
2619}
2620
2621void
2622__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2623 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2624 kmp_int64 chunk, int push_ws )
2625{
2626 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2627 push_ws );
2628}
2629
2630void
2631__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2632 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2633 kmp_int64 chunk, int push_ws )
2634{
2635 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2636 push_ws );
2637}
2638
2639void
2640__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2641{
2642 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2643}
2644
2645void
2646__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2647{
2648 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2649}
2650
2651void
2652__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2653{
2654 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2655}
2656
2657void
2658__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2659{
2660 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2661}
2662
2663#endif /* KMP_GOMP_COMPAT */
2664
2665/* ------------------------------------------------------------------------ */
2666/* ------------------------------------------------------------------------ */
2667