/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */


//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

/*
 * Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside any dispatch loop; however, it may
 * change between parallel regions. __kmp_max_nth is the largest value
 * __kmp_nth may take, and 1 is the smallest.
 *
 */

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

// Need to raise Win version from XP to Vista here for support of InterlockedExchange64
#if defined(_WIN32_WINNT) && defined(_M_IX86)
#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0502
#endif

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_str.h"
#include "kmp_error.h"
#include "kmp_stats.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
    #include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */
51
Jim Cownie4cc4bb42014-10-07 16:25:50 +000052// template for type limits
53template< typename T >
54struct i_maxmin {
55 static const T mx;
56 static const T mn;
57};
58template<>
59struct i_maxmin< int > {
60 static const int mx = 0x7fffffff;
61 static const int mn = 0x80000000;
62};
63template<>
64struct i_maxmin< unsigned int > {
65 static const unsigned int mx = 0xffffffff;
66 static const unsigned int mn = 0x00000000;
67};
68template<>
69struct i_maxmin< long long > {
70 static const long long mx = 0x7fffffffffffffffLL;
71 static const long long mn = 0x8000000000000000LL;
72};
73template<>
74struct i_maxmin< unsigned long long > {
75 static const unsigned long long mx = 0xffffffffffffffffLL;
76 static const unsigned long long mn = 0x0000000000000000LL;
77};
78//-------------------------------------------------------------------------
79
Andrey Churbanov429dbc22016-07-11 10:44:57 +000080#if KMP_STATIC_STEAL_ENABLED
Jim Cownie5e8470a2013-09-27 10:38:44 +000081
82 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
83 template< typename T >
84 struct dispatch_private_infoXX_template {
85 typedef typename traits_t< T >::unsigned_t UT;
86 typedef typename traits_t< T >::signed_t ST;
87 UT count; // unsigned
88 T ub;
89 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
90 T lb;
91 ST st; // signed
92 UT tc; // unsigned
93 T static_steal_counter; // for static_steal only; maybe better to put after ub
94
95 /* parm[1-4] are used in different ways by different scheduling algorithms */
96
        // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
        // a) parm3 is properly aligned and
        // b) all parm1-4 are in the same cache line.
        // Because parm1-4 are used together, performance seems to be better
        // if they are in the same cache line (not measured, though).
102
103 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
104 T parm1;
105 T parm2;
106 T parm3;
107 T parm4;
108 };
109
110 UT ordered_lower; // unsigned
111 UT ordered_upper; // unsigned
112 #if KMP_OS_WINDOWS
113 T last_upper;
114 #endif /* KMP_OS_WINDOWS */
115 };
116
117#else /* KMP_STATIC_STEAL_ENABLED */
118
119 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
120 template< typename T >
121 struct dispatch_private_infoXX_template {
122 typedef typename traits_t< T >::unsigned_t UT;
123 typedef typename traits_t< T >::signed_t ST;
124 T lb;
125 T ub;
126 ST st; // signed
127 UT tc; // unsigned
128
129 T parm1;
130 T parm2;
131 T parm3;
132 T parm4;
133
134 UT count; // unsigned
135
136 UT ordered_lower; // unsigned
137 UT ordered_upper; // unsigned
138 #if KMP_OS_WINDOWS
139 T last_upper;
140 #endif /* KMP_OS_WINDOWS */
141 };
142
143#endif /* KMP_STATIC_STEAL_ENABLED */
144
145// replaces dispatch_private_info structure and dispatch_private_info_t type
146template< typename T >
147struct KMP_ALIGN_CACHE dispatch_private_info_template {
148 // duplicate alignment here, otherwise size of structure is not correct in our compiler
149 union KMP_ALIGN_CACHE private_info_tmpl {
150 dispatch_private_infoXX_template< T > p;
151 dispatch_private_info64_t p64;
152 } u;
153 enum sched_type schedule; /* scheduling algorithm */
154 kmp_uint32 ordered; /* ordered clause specified */
155 kmp_uint32 ordered_bumped;
    kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
157 dispatch_private_info * next; /* stack of buffers for nest of serial regions */
158 kmp_uint32 nomerge; /* don't merge iters if serialized */
159 kmp_uint32 type_size;
160 enum cons_type pushed_ws;
161};
162
163
164// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
165template< typename UT >
166struct dispatch_shared_infoXX_template {
167 /* chunk index under dynamic, number of idle threads under static-steal;
168 iteration index otherwise */
169 volatile UT iteration;
170 volatile UT num_done;
171 volatile UT ordered_iteration;
    UT ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
};
174
175// replaces dispatch_shared_info structure and dispatch_shared_info_t type
176template< typename UT >
177struct dispatch_shared_info_template {
178 // we need union here to keep the structure size
179 union shared_info_tmpl {
180 dispatch_shared_infoXX_template< UT > s;
181 dispatch_shared_info64_t s64;
182 } u;
183 volatile kmp_uint32 buffer_index;
#if OMP_45_ENABLED
    volatile kmp_int32 doacross_buf_idx;  // teamwise index
    kmp_uint32 *doacross_flags;           // array of iteration flags (0/1)
    kmp_int32 doacross_num_done;          // count finished threads
#endif
#if KMP_USE_HWLOC
    // When linking with libhwloc, the ORDERED EPCC test slows down on big
    // machines (> 48 cores). Performance analysis showed that a cache thrash
    // was occurring and this padding helps alleviate the problem.
    char padding[64];
#endif
};
196
197/* ------------------------------------------------------------------------ */
198/* ------------------------------------------------------------------------ */
199
Jim Cownie5e8470a2013-09-27 10:38:44 +0000200#undef USE_TEST_LOCKS
201
202// test_then_add template (general template should NOT be used)
203template< typename T >
204static __forceinline T
Jonathan Peytone1890e12016-06-13 21:33:30 +0000205test_then_add( volatile T *p, T d );
Jim Cownie5e8470a2013-09-27 10:38:44 +0000206
207template<>
208__forceinline kmp_int32
209test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
210{
211 kmp_int32 r;
212 r = KMP_TEST_THEN_ADD32( p, d );
213 return r;
214}
215
216template<>
217__forceinline kmp_int64
218test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
219{
220 kmp_int64 r;
221 r = KMP_TEST_THEN_ADD64( p, d );
222 return r;
223}
224
225// test_then_inc_acq template (general template should NOT be used)
226template< typename T >
227static __forceinline T
Jonathan Peytone1890e12016-06-13 21:33:30 +0000228test_then_inc_acq( volatile T *p );
Jim Cownie5e8470a2013-09-27 10:38:44 +0000229
230template<>
231__forceinline kmp_int32
232test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
233{
234 kmp_int32 r;
235 r = KMP_TEST_THEN_INC_ACQ32( p );
236 return r;
237}
238
239template<>
240__forceinline kmp_int64
241test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
242{
243 kmp_int64 r;
244 r = KMP_TEST_THEN_INC_ACQ64( p );
245 return r;
246}
247
248// test_then_inc template (general template should NOT be used)
249template< typename T >
250static __forceinline T
Jonathan Peytone1890e12016-06-13 21:33:30 +0000251test_then_inc( volatile T *p );
Jim Cownie5e8470a2013-09-27 10:38:44 +0000252
253template<>
254__forceinline kmp_int32
255test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
256{
257 kmp_int32 r;
258 r = KMP_TEST_THEN_INC32( p );
259 return r;
260}
261
262template<>
263__forceinline kmp_int64
264test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
265{
266 kmp_int64 r;
267 r = KMP_TEST_THEN_INC64( p );
268 return r;
269}
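// These typed wrappers just forward to the matching KMP_TEST_THEN_* atomics;
// e.g. the ordered epilogue later in this file advances the shared counter with
//     test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );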
270
271// compare_and_swap template (general template should NOT be used)
272template< typename T >
273static __forceinline kmp_int32
Jonathan Peytone1890e12016-06-13 21:33:30 +0000274compare_and_swap( volatile T *p, T c, T s );
Jim Cownie5e8470a2013-09-27 10:38:44 +0000275
276template<>
277__forceinline kmp_int32
278compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
279{
280 return KMP_COMPARE_AND_STORE_REL32( p, c, s );
281}
282
283template<>
284__forceinline kmp_int32
285compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
286{
287 return KMP_COMPARE_AND_STORE_REL64( p, c, s );
288}
289
/*
    Spin wait loop that first does pause, then yield.
    Waits until pred returns non-zero when called with *spinner and check.
    Does NOT put threads to sleep.
#if USE_ITT_BUILD
    Arguments:
        obj -- is a higher-level synchronization object to report to ittnotify. It is used to report
            locks consistently. For example, if a lock is acquired immediately, its address is
            reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
            immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
            address, not the address of the low-level spinner.
#endif // USE_ITT_BUILD
*/
303template< typename UT >
304// ToDo: make inline function (move to header file for icl)
305static UT // unsigned 4- or 8-byte type
306__kmp_wait_yield( volatile UT * spinner,
307 UT checker,
308 kmp_uint32 (* pred)( UT, UT )
309 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
310 )
311{
312 // note: we may not belong to a team at this point
313 register volatile UT * spin = spinner;
314 register UT check = checker;
315 register kmp_uint32 spins;
316 register kmp_uint32 (*f) ( UT, UT ) = pred;
317 register UT r;
318
319 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
320 KMP_INIT_YIELD( spins );
321 // main wait spin loop
322 while(!f(r = *spin, check))
323 {
324 KMP_FSYNC_SPIN_PREPARE( obj );
325 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
326 It causes problems with infinite recursion because of exit lock */
327 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
328 __kmp_abort_thread(); */
329
        // if we are oversubscribed,
        // or have waited a bit (and KMP_LIBRARY=throughput), then yield;
        // the pause is in the following code
333 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
334 KMP_YIELD_SPIN( spins );
335 }
336 KMP_FSYNC_SPIN_ACQUIRED( obj );
337 return r;
338}
339
340template< typename UT >
341static kmp_uint32 __kmp_eq( UT value, UT checker) {
342 return value == checker;
343}
344
345template< typename UT >
346static kmp_uint32 __kmp_neq( UT value, UT checker) {
347 return value != checker;
348}
349
350template< typename UT >
351static kmp_uint32 __kmp_lt( UT value, UT checker) {
352 return value < checker;
353}
354
355template< typename UT >
356static kmp_uint32 __kmp_ge( UT value, UT checker) {
357 return value >= checker;
358}
359
360template< typename UT >
361static kmp_uint32 __kmp_le( UT value, UT checker) {
362 return value <= checker;
363}
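// These predicates are passed to __kmp_wait_yield above; e.g. the ordered-section
// code below spins with
//     __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
//                             USE_ITT_BUILD_ARG( NULL ) );
// which returns once the shared ordered iteration counter reaches this thread's
// ordered_lower bound.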
364
365
366/* ------------------------------------------------------------------------ */
367/* ------------------------------------------------------------------------ */
368
369static void
370__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
371{
372 kmp_info_t *th;
373
374 KMP_DEBUG_ASSERT( gtid_ref );
375
376 if ( __kmp_env_consistency_check ) {
377 th = __kmp_threads[*gtid_ref];
378 if ( th -> th.th_root -> r.r_active
379 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
Andrey Churbanov5c56fb52015-02-20 18:05:17 +0000380#if KMP_USE_DYNAMIC_LOCK
381 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
382#else
Jim Cownie5e8470a2013-09-27 10:38:44 +0000383 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
Andrey Churbanov5c56fb52015-02-20 18:05:17 +0000384#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000385 }
386 }
387}
388
389template< typename UT >
390static void
391__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
392{
393 typedef typename traits_t< UT >::signed_t ST;
394 dispatch_private_info_template< UT > * pr;
395
396 int gtid = *gtid_ref;
397// int cid = *cid_ref;
398 kmp_info_t *th = __kmp_threads[ gtid ];
399 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
400
401 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
402 if ( __kmp_env_consistency_check ) {
403 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
404 ( th -> th.th_dispatch -> th_dispatch_pr_current );
405 if ( pr -> pushed_ws != ct_none ) {
Andrey Churbanov5c56fb52015-02-20 18:05:17 +0000406#if KMP_USE_DYNAMIC_LOCK
407 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
408#else
Jim Cownie5e8470a2013-09-27 10:38:44 +0000409 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
Andrey Churbanov5c56fb52015-02-20 18:05:17 +0000410#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000411 }
412 }
413
414 if ( ! th -> th.th_team -> t.t_serialized ) {
415 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
416 ( th -> th.th_dispatch -> th_dispatch_sh_current );
417 UT lower;
418
419 if ( ! __kmp_env_consistency_check ) {
420 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
421 ( th -> th.th_dispatch -> th_dispatch_pr_current );
422 }
423 lower = pr->u.p.ordered_lower;
424
425 #if ! defined( KMP_GOMP_COMPAT )
426 if ( __kmp_env_consistency_check ) {
427 if ( pr->ordered_bumped ) {
428 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
429 __kmp_error_construct2(
430 kmp_i18n_msg_CnsMultipleNesting,
431 ct_ordered_in_pdo, loc_ref,
432 & p->stack_data[ p->w_top ]
433 );
434 }
435 }
436 #endif /* !defined(KMP_GOMP_COMPAT) */
437
438 KMP_MB();
439 #ifdef KMP_DEBUG
440 {
441 const char * buff;
442 // create format specifiers before the debug output
443 buff = __kmp_str_format(
444 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
445 traits_t< UT >::spec, traits_t< UT >::spec );
446 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
447 __kmp_str_free( &buff );
448 }
449 #endif
450
451 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
452 USE_ITT_BUILD_ARG( NULL )
453 );
454 KMP_MB(); /* is this necessary? */
455 #ifdef KMP_DEBUG
456 {
457 const char * buff;
458 // create format specifiers before the debug output
459 buff = __kmp_str_format(
460 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
461 traits_t< UT >::spec, traits_t< UT >::spec );
462 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
463 __kmp_str_free( &buff );
464 }
465 #endif
466 }
467 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
468}
469
470static void
471__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
472{
473 kmp_info_t *th;
474
475 if ( __kmp_env_consistency_check ) {
476 th = __kmp_threads[*gtid_ref];
477 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
478 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
479 }
480 }
481}
482
483template< typename UT >
484static void
485__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
486{
487 typedef typename traits_t< UT >::signed_t ST;
488 dispatch_private_info_template< UT > * pr;
489
490 int gtid = *gtid_ref;
491// int cid = *cid_ref;
492 kmp_info_t *th = __kmp_threads[ gtid ];
493 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
494
495 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
496 if ( __kmp_env_consistency_check ) {
497 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
498 ( th -> th.th_dispatch -> th_dispatch_pr_current );
499 if ( pr -> pushed_ws != ct_none ) {
500 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
501 }
502 }
503
504 if ( ! th -> th.th_team -> t.t_serialized ) {
505 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
506 ( th -> th.th_dispatch -> th_dispatch_sh_current );
507
508 if ( ! __kmp_env_consistency_check ) {
509 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
510 ( th -> th.th_dispatch -> th_dispatch_pr_current );
511 }
512
513 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
514 #if ! defined( KMP_GOMP_COMPAT )
515 if ( __kmp_env_consistency_check ) {
516 if ( pr->ordered_bumped != 0 ) {
517 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
518 /* How to test it? - OM */
519 __kmp_error_construct2(
520 kmp_i18n_msg_CnsMultipleNesting,
521 ct_ordered_in_pdo, loc_ref,
522 & p->stack_data[ p->w_top ]
523 );
524 }
525 }
526 #endif /* !defined(KMP_GOMP_COMPAT) */
527
528 KMP_MB(); /* Flush all pending memory write invalidates. */
529
530 pr->ordered_bumped += 1;
531
532 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
533 gtid, pr->ordered_bumped ) );
534
535 KMP_MB(); /* Flush all pending memory write invalidates. */
536
537 /* TODO use general release procedure? */
538 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
539
540 KMP_MB(); /* Flush all pending memory write invalidates. */
541 }
542 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
543}
544
/* Computes and returns x to the power of y, where y must be a non-negative integer */
546template< typename UT >
547static __forceinline long double
548__kmp_pow(long double x, UT y) {
549 long double s=1.0L;
550
551 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
552 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
553 while(y) {
554 if ( y & 1 )
555 s *= x;
556 x *= x;
557 y >>= 1;
558 }
559 return s;
560}
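// Illustrative trace (not from the original source): __kmp_pow(0.875, 5)
// processes y = 5 = 0b101 by repeated squaring: bit 0 sets s = 0.875, x squares
// to 0.765625 and then to 0.58618..., and bit 2 multiplies s by that value,
// returning 0.875^5 = 0.51290...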
561
562/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
563 (the total number of unassigned iterations in chunks with index greater than or equal to idx).
564 __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
565 (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
566*/
567template< typename T >
568static __inline typename traits_t< T >::unsigned_t
569__kmp_dispatch_guided_remaining(
570 T tc,
571 typename traits_t< T >::floating_t base,
572 typename traits_t< T >::unsigned_t idx
573) {
574 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
575 least for ICL 8.1, long double arithmetic may not really have
576 long double precision, even with /Qlong_double. Currently, we
577 workaround that in the caller code, by manipulating the FPCW for
578 Windows* OS on IA-32 architecture. The lack of precision is not
579 expected to be a correctness issue, though.
580 */
581 typedef typename traits_t< T >::unsigned_t UT;
582
583 long double x = tc * __kmp_pow< UT >(base, idx);
584 UT r = (UT) x;
585 if ( x == r )
586 return r;
587 return r + 1;
588}
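// Illustrative example (not from the original source): for tc = 1000 and
// nproc = 4 the analytical scheme below uses base = 1 - 0.5/4 = 0.875, so after
// idx = 8 chunks have been assigned, 1000 * 0.875^8 = 343.6... iterations,
// rounded up to 344, remain unassigned.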
589
// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be flatter;
// with n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
static int guided_int_param = 2;
static double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
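// Illustrative arithmetic only (not part of the algorithm): with the default
// n = 2, nproc = 4 and chunk = 1, a loop switches to plain dynamic scheduling
// once fewer than p2 = 2 * 4 * (1 + 1) = 16 iterations remain, and before that
// each thread claims roughly p3 = 1 / (2 * 4) = 12.5% of the remaining
// iterations per chunk.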
597
598// UT - unsigned flavor of T, ST - signed flavor of T,
599// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
600template< typename T >
601static void
602__kmp_dispatch_init(
603 ident_t * loc,
604 int gtid,
605 enum sched_type schedule,
606 T lb,
607 T ub,
608 typename traits_t< T >::signed_t st,
609 typename traits_t< T >::signed_t chunk,
610 int push_ws
611) {
612 typedef typename traits_t< T >::unsigned_t UT;
613 typedef typename traits_t< T >::signed_t ST;
614 typedef typename traits_t< T >::floating_t DBL;
615 static const int ___kmp_size_type = sizeof( UT );
616
617 int active;
618 T tc;
619 kmp_info_t * th;
620 kmp_team_t * team;
621 kmp_uint32 my_buffer_index;
622 dispatch_private_info_template< T > * pr;
623 dispatch_shared_info_template< UT > volatile * sh;
624
625 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
626 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
627
628 if ( ! TCR_4( __kmp_init_parallel ) )
629 __kmp_parallel_initialize();
630
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000631#if INCLUDE_SSC_MARKS
632 SSC_MARK_DISPATCH_INIT();
633#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000634 #ifdef KMP_DEBUG
635 {
636 const char * buff;
637 // create format specifiers before the debug output
638 buff = __kmp_str_format(
639 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
640 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
641 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
642 __kmp_str_free( &buff );
643 }
644 #endif
645 /* setup data */
646 th = __kmp_threads[ gtid ];
647 team = th -> th.th_team;
648 active = ! team -> t.t_serialized;
649 th->th.th_ident = loc;
650
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000651#if USE_ITT_BUILD
652 kmp_uint64 cur_chunk = chunk;
Andrey Churbanov51aecb82015-05-06 19:22:36 +0000653 int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
654 KMP_MASTER_GTID(gtid) &&
655#if OMP_40_ENABLED
656 th->th.th_teams_microtask == NULL &&
657#endif
658 team->t.t_active_level == 1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000659#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000660 if ( ! active ) {
661 pr = reinterpret_cast< dispatch_private_info_template< T >* >
662 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
663 } else {
664 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
665 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
666
667 my_buffer_index = th->th.th_dispatch->th_disp_index ++;
668
669 /* What happens when number of threads changes, need to resize buffer? */
670 pr = reinterpret_cast< dispatch_private_info_template< T > * >
Jonathan Peyton067325f2016-05-31 19:01:15 +0000671 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] );
Jim Cownie5e8470a2013-09-27 10:38:44 +0000672 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
Jonathan Peyton067325f2016-05-31 19:01:15 +0000673 ( &team -> t.t_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] );
Jim Cownie5e8470a2013-09-27 10:38:44 +0000674 }
675
Andrey Churbanov429dbc22016-07-11 10:44:57 +0000676 #if ( KMP_STATIC_STEAL_ENABLED )
677 if ( SCHEDULE_HAS_NONMONOTONIC(schedule) )
678 // AC: we now have only one implementation of stealing, so use it
679 schedule = kmp_sch_static_steal;
680 else
681 #endif
682 schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);
Jonathan Peytonea0fe1d2016-02-25 17:55:50 +0000683
Jim Cownie5e8470a2013-09-27 10:38:44 +0000684 /* Pick up the nomerge/ordered bits from the scheduling type */
685 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
686 pr->nomerge = TRUE;
687 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
688 } else {
689 pr->nomerge = FALSE;
690 }
691 pr->type_size = ___kmp_size_type; // remember the size of variables
692 if ( kmp_ord_lower & schedule ) {
693 pr->ordered = TRUE;
694 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
695 } else {
696 pr->ordered = FALSE;
697 }
Jonathan Peyton45be4502015-08-11 21:36:41 +0000698
Jim Cownie5e8470a2013-09-27 10:38:44 +0000699 if ( schedule == kmp_sch_static ) {
700 schedule = __kmp_static;
701 } else {
702 if ( schedule == kmp_sch_runtime ) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000703 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
704 schedule = team -> t.t_sched.r_sched_type;
705 // Detail the schedule if needed (global controls are differentiated appropriately)
706 if ( schedule == kmp_sch_guided_chunked ) {
707 schedule = __kmp_guided;
708 } else if ( schedule == kmp_sch_static ) {
709 schedule = __kmp_static;
710 }
711 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
712 chunk = team -> t.t_sched.chunk;
Jonathan Peyton00afbd02015-11-12 21:26:22 +0000713#if USE_ITT_BUILD
714 cur_chunk = chunk;
715#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000716 #ifdef KMP_DEBUG
717 {
718 const char * buff;
719 // create format specifiers before the debug output
720 buff = __kmp_str_format(
721 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
722 traits_t< ST >::spec );
723 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
724 __kmp_str_free( &buff );
725 }
726 #endif
727 } else {
728 if ( schedule == kmp_sch_guided_chunked ) {
729 schedule = __kmp_guided;
730 }
731 if ( chunk <= 0 ) {
732 chunk = KMP_DEFAULT_CHUNK;
733 }
734 }
735
Jim Cownie5e8470a2013-09-27 10:38:44 +0000736 if ( schedule == kmp_sch_auto ) {
737 // mapping and differentiation: in the __kmp_do_serial_initialize()
738 schedule = __kmp_auto;
739 #ifdef KMP_DEBUG
740 {
741 const char * buff;
742 // create format specifiers before the debug output
743 buff = __kmp_str_format(
744 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
745 traits_t< ST >::spec );
746 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
747 __kmp_str_free( &buff );
748 }
749 #endif
750 }
Jim Cownie5e8470a2013-09-27 10:38:44 +0000751
752 /* guided analytical not safe for too many threads */
Jonathan Peytonff5ca8b2016-06-21 18:30:15 +0000753 if ( schedule == kmp_sch_guided_analytical_chunked && th->th.th_team_nproc > 1<<20 ) {
Jim Cownie5e8470a2013-09-27 10:38:44 +0000754 schedule = kmp_sch_guided_iterative_chunked;
755 KMP_WARNING( DispatchManyThreads );
756 }
757 pr->u.p.parm1 = chunk;
758 }
759 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
760 "unknown scheduling type" );
761
762 pr->u.p.count = 0;
763
764 if ( __kmp_env_consistency_check ) {
765 if ( st == 0 ) {
766 __kmp_error_construct(
767 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
768 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
769 );
770 }
771 }
    // compute trip count
    if ( st == 1 ) {   // most common case
        if ( ub >= lb ) {
            tc = ub - lb + 1;
        } else {   // ub < lb
            tc = 0;            // zero-trip
        }
    } else if ( st < 0 ) {
        if ( lb >= ub ) {
            // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
            // where the division needs to be unsigned regardless of the result type
            tc = (UT)(lb - ub) / (-st) + 1;
        } else {   // lb < ub
            tc = 0;            // zero-trip
        }
    } else {       // st > 0
        if ( ub >= lb ) {
            // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
            // where the division needs to be unsigned regardless of the result type
            tc = (UT)(ub - lb) / st + 1;
        } else {   // ub < lb
            tc = 0;            // zero-trip
        }
    }
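    // Worked example (illustrative only): for a loop with lb = 0, ub = 9 and
    // st = 3, this yields tc = (UT)(9 - 0) / 3 + 1 = 4 iterations {0, 3, 6, 9};
    // for lb = 9, ub = 0, st = -3 it likewise yields (UT)(9 - 0) / 3 + 1 = 4.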
796
Jonathan Peyton45be4502015-08-11 21:36:41 +0000797 // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing
798 // when statistics are disabled.
799 if (schedule == __kmp_static)
800 {
801 KMP_COUNT_BLOCK(OMP_FOR_static);
802 KMP_COUNT_VALUE(FOR_static_iterations, tc);
803 }
804 else
805 {
806 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
807 KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
808 }
809
Jim Cownie5e8470a2013-09-27 10:38:44 +0000810 pr->u.p.lb = lb;
811 pr->u.p.ub = ub;
812 pr->u.p.st = st;
813 pr->u.p.tc = tc;
814
815 #if KMP_OS_WINDOWS
816 pr->u.p.last_upper = ub + st;
817 #endif /* KMP_OS_WINDOWS */
818
    /* NOTE: only the active parallel region(s) have active ordered sections */
820
821 if ( active ) {
822 if ( pr->ordered == 0 ) {
823 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
824 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
825 } else {
826 pr->ordered_bumped = 0;
827
828 pr->u.p.ordered_lower = 1;
829 pr->u.p.ordered_upper = 0;
830
831 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
832 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
833 }
834 }
835
836 if ( __kmp_env_consistency_check ) {
837 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
838 if ( push_ws ) {
839 __kmp_push_workshare( gtid, ws, loc );
840 pr->pushed_ws = ws;
841 } else {
842 __kmp_check_workshare( gtid, ws, loc );
843 pr->pushed_ws = ct_none;
844 }
845 }
846
847 switch ( schedule ) {
Andrey Churbanov429dbc22016-07-11 10:44:57 +0000848 #if ( KMP_STATIC_STEAL_ENABLED )
Jim Cownie5e8470a2013-09-27 10:38:44 +0000849 case kmp_sch_static_steal:
850 {
Jonathan Peytonff5ca8b2016-06-21 18:30:15 +0000851 T nproc = th->th.th_team_nproc;
Jim Cownie5e8470a2013-09-27 10:38:44 +0000852 T ntc, init;
853
854 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
855
856 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
857 if ( nproc > 1 && ntc >= nproc ) {
858 T id = __kmp_tid_from_gtid(gtid);
859 T small_chunk, extras;
860
861 small_chunk = ntc / nproc;
862 extras = ntc % nproc;
863
864 init = id * small_chunk + ( id < extras ? id : extras );
865 pr->u.p.count = init;
866 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
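                // Illustrative example (not from the original source): with
                // ntc = 10 chunks and nproc = 4 threads, small_chunk = 2 and
                // extras = 2, so threads 0..3 start with chunk ranges
                // [0,3), [3,6), [6,8) and [8,10) respectively; count is the
                // next chunk index to execute and ub is the exclusive end.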
867
868 pr->u.p.parm2 = lb;
869 //pr->pfields.parm3 = 0; // it's not used in static_steal
Andrey Churbanov429dbc22016-07-11 10:44:57 +0000870 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
Jim Cownie5e8470a2013-09-27 10:38:44 +0000871 pr->u.p.st = st;
Andrey Churbanov429dbc22016-07-11 10:44:57 +0000872 if ( ___kmp_size_type > 4 ) {
873 // AC: TODO: check if 16-byte CAS available and use it to
874 // improve performance (probably wait for explicit request
875 // before spending time on this).
876 // For now use dynamically allocated per-thread lock,
877 // free memory in __kmp_dispatch_next when status==0.
878 KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
879 th->th.th_dispatch->th_steal_lock =
880 (kmp_lock_t*)__kmp_allocate(sizeof(kmp_lock_t));
881 __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
882 }
Jim Cownie5e8470a2013-09-27 10:38:44 +0000883 break;
884 } else {
885 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
886 gtid ) );
887 schedule = kmp_sch_static_balanced;
888 /* too few iterations: fall-through to kmp_sch_static_balanced */
889 } // if
890 /* FALL-THROUGH to static balanced */
891 } // case
892 #endif
893 case kmp_sch_static_balanced:
894 {
Jonathan Peytonff5ca8b2016-06-21 18:30:15 +0000895 T nproc = th->th.th_team_nproc;
Jim Cownie5e8470a2013-09-27 10:38:44 +0000896 T init, limit;
897
898 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
899 gtid ) );
900
901 if ( nproc > 1 ) {
902 T id = __kmp_tid_from_gtid(gtid);
903
904 if ( tc < nproc ) {
905 if ( id < tc ) {
906 init = id;
907 limit = id;
908 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
909 } else {
910 pr->u.p.count = 1; /* means no more chunks to execute */
911 pr->u.p.parm1 = FALSE;
912 break;
913 }
914 } else {
915 T small_chunk = tc / nproc;
916 T extras = tc % nproc;
917 init = id * small_chunk + (id < extras ? id : extras);
918 limit = init + small_chunk - (id < extras ? 0 : 1);
919 pr->u.p.parm1 = (id == nproc - 1);
920 }
921 } else {
922 if ( tc > 0 ) {
923 init = 0;
924 limit = tc - 1;
925 pr->u.p.parm1 = TRUE;
926 } else {
927 // zero trip count
928 pr->u.p.count = 1; /* means no more chunks to execute */
929 pr->u.p.parm1 = FALSE;
930 break;
931 }
932 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000933#if USE_ITT_BUILD
934 // Calculate chunk for metadata report
Andrey Churbanov51aecb82015-05-06 19:22:36 +0000935 if ( itt_need_metadata_reporting )
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000936 cur_chunk = limit - init + 1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000937#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000938 if ( st == 1 ) {
939 pr->u.p.lb = lb + init;
940 pr->u.p.ub = lb + limit;
941 } else {
942 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
943 pr->u.p.lb = lb + init * st;
944 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
945 if ( st > 0 ) {
946 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
947 } else {
948 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
949 }
950 }
951 if ( pr->ordered ) {
952 pr->u.p.ordered_lower = init;
953 pr->u.p.ordered_upper = limit;
954 }
955 break;
956 } // case
957 case kmp_sch_guided_iterative_chunked :
958 {
Jonathan Peytonff5ca8b2016-06-21 18:30:15 +0000959 T nproc = th->th.th_team_nproc;
Jim Cownie5e8470a2013-09-27 10:38:44 +0000960 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
961
962 if ( nproc > 1 ) {
963 if ( (2L * chunk + 1 ) * nproc >= tc ) {
964 /* chunk size too large, switch to dynamic */
965 schedule = kmp_sch_dynamic_chunked;
966 } else {
967 // when remaining iters become less than parm2 - switch to dynamic
968 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
969 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
970 }
971 } else {
972 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
973 schedule = kmp_sch_static_greedy;
974 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
975 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
976 pr->u.p.parm1 = tc;
977 } // if
978 } // case
979 break;
980 case kmp_sch_guided_analytical_chunked:
981 {
Jonathan Peytonff5ca8b2016-06-21 18:30:15 +0000982 T nproc = th->th.th_team_nproc;
Jim Cownie5e8470a2013-09-27 10:38:44 +0000983 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
984
985 if ( nproc > 1 ) {
986 if ( (2L * chunk + 1 ) * nproc >= tc ) {
987 /* chunk size too large, switch to dynamic */
988 schedule = kmp_sch_dynamic_chunked;
989 } else {
990 /* commonly used term: (2 nproc - 1)/(2 nproc) */
991 DBL x;
992
993 #if KMP_OS_WINDOWS && KMP_ARCH_X86
994 /* Linux* OS already has 64-bit computation by default for
995 long double, and on Windows* OS on Intel(R) 64,
996 /Qlong_double doesn't work. On Windows* OS
997 on IA-32 architecture, we need to set precision to
998 64-bit instead of the default 53-bit. Even though long
999 double doesn't work on Windows* OS on Intel(R) 64, the
1000 resulting lack of precision is not expected to impact
1001 the correctness of the algorithm, but this has not been
1002 mathematically proven.
1003 */
1004 // save original FPCW and set precision to 64-bit, as
1005 // Windows* OS on IA-32 architecture defaults to 53-bit
Jim Cownie181b4bb2013-12-23 17:28:57 +00001006 unsigned int oldFpcw = _control87(0,0);
1007 _control87(_PC_64,_MCW_PC); // 0,0x30000
Jim Cownie5e8470a2013-09-27 10:38:44 +00001008 #endif
1009 /* value used for comparison in solver for cross-over point */
1010 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
1011
1012 /* crossover point--chunk indexes equal to or greater than
1013 this point switch to dynamic-style scheduling */
1014 UT cross;
1015
1016 /* commonly used term: (2 nproc - 1)/(2 nproc) */
1017 x = (long double)1.0 - (long double)0.5 / nproc;
1018
1019 #ifdef KMP_DEBUG
1020 { // test natural alignment
1021 struct _test_a {
1022 char a;
1023 union {
1024 char b;
1025 DBL d;
1026 };
1027 } t;
1028 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
1029 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
1030 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
1031 }
1032 #endif // KMP_DEBUG
1033
1034 /* save the term in thread private dispatch structure */
1035 *(DBL*)&pr->u.p.parm3 = x;
1036
1037 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
1038 {
1039 UT left, right, mid;
1040 long double p;
1041
1042 /* estimate initial upper and lower bound */
1043
1044 /* doesn't matter what value right is as long as it is positive, but
1045 it affects performance of the solver
1046 */
1047 right = 229;
1048 p = __kmp_pow< UT >(x,right);
1049 if ( p > target ) {
1050 do{
1051 p *= p;
1052 right <<= 1;
1053 } while(p>target && right < (1<<27));
1054 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
1055 } else {
1056 left = 0;
1057 }
1058
1059 /* bisection root-finding method */
1060 while ( left + 1 < right ) {
1061 mid = (left + right) / 2;
1062 if ( __kmp_pow< UT >(x,mid) > target ) {
1063 left = mid;
1064 } else {
1065 right = mid;
1066 }
1067 } // while
1068 cross = right;
1069 }
1070 /* assert sanity of computed crossover point */
1071 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
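                    // Illustrative numbers (not from the original source): with
                    // nproc = 4 (so x = 1 - 0.5/4 = 0.875), chunk = 100 and
                    // tc = 100000, target = (2*100 + 1) * 4 / 100000 = 0.00804,
                    // and the bisection yields cross = 37, since
                    // 0.875^36 > 0.00804 >= 0.875^37.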
1072
1073 /* save the crossover point in thread private dispatch structure */
1074 pr->u.p.parm2 = cross;
1075
1076 // C75803
1077 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1078 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1079 #else
1080 #define GUIDED_ANALYTICAL_WORKAROUND (x)
1081 #endif
1082 /* dynamic-style scheduling offset */
1083 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1084 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1085 // restore FPCW
Jim Cownie181b4bb2013-12-23 17:28:57 +00001086 _control87(oldFpcw,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001087 #endif
1088 } // if
1089 } else {
1090 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1091 gtid ) );
1092 schedule = kmp_sch_static_greedy;
1093 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1094 pr->u.p.parm1 = tc;
1095 } // if
1096 } // case
1097 break;
1098 case kmp_sch_static_greedy:
1099 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
Jonathan Peytonff5ca8b2016-06-21 18:30:15 +00001100 pr->u.p.parm1 = ( th->th.th_team_nproc > 1 ) ?
1101 ( tc + th->th.th_team_nproc - 1 ) / th->th.th_team_nproc :
Jim Cownie5e8470a2013-09-27 10:38:44 +00001102 tc;
1103 break;
1104 case kmp_sch_static_chunked :
1105 case kmp_sch_dynamic_chunked :
Jonathan Peyton70bda912015-11-06 20:32:44 +00001106 if ( pr->u.p.parm1 <= 0 ) {
1107 pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
1108 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00001109 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1110 break;
1111 case kmp_sch_trapezoidal :
1112 {
1113 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1114
1115 T parm1, parm2, parm3, parm4;
1116 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1117
1118 parm1 = chunk;
1119
1120 /* F : size of the first cycle */
Jonathan Peytonff5ca8b2016-06-21 18:30:15 +00001121 parm2 = ( tc / (2 * th->th.th_team_nproc) );
Jim Cownie5e8470a2013-09-27 10:38:44 +00001122
1123 if ( parm2 < 1 ) {
1124 parm2 = 1;
1125 }
1126
1127 /* L : size of the last cycle. Make sure the last cycle
1128 * is not larger than the first cycle.
1129 */
1130 if ( parm1 < 1 ) {
1131 parm1 = 1;
1132 } else if ( parm1 > parm2 ) {
1133 parm1 = parm2;
1134 }
1135
1136 /* N : number of cycles */
1137 parm3 = ( parm2 + parm1 );
1138 parm3 = ( 2 * tc + parm3 - 1) / parm3;
1139
1140 if ( parm3 < 2 ) {
1141 parm3 = 2;
1142 }
1143
1144 /* sigma : decreasing incr of the trapezoid */
1145 parm4 = ( parm3 - 1 );
1146 parm4 = ( parm2 - parm1 ) / parm4;
1147
1148 // pointless check, because parm4 >= 0 always
1149 //if ( parm4 < 0 ) {
1150 // parm4 = 0;
1151 //}
1152
1153 pr->u.p.parm1 = parm1;
1154 pr->u.p.parm2 = parm2;
1155 pr->u.p.parm3 = parm3;
1156 pr->u.p.parm4 = parm4;
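            // Illustrative numbers (not from the original source): for tc = 1000,
            // nproc = 4 and chunk = 10, parm2 = 1000 / 8 = 125 (first cycle),
            // parm1 = 10 (minimum/last cycle), parm3 = (2000 + 134) / 135 = 15
            // cycles, and parm4 = (125 - 10) / 14 = 8, so successive chunks are
            // roughly 125, 117, 109, ... shrinking by 8 iterations per cycle.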
1157 } // case
1158 break;
1159
1160 default:
1161 {
1162 __kmp_msg(
1163 kmp_ms_fatal, // Severity
1164 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1165 KMP_HNT( GetNewerLibrary ), // Hint
1166 __kmp_msg_null // Variadic argument list terminator
1167 );
1168 }
1169 break;
1170 } // switch
1171 pr->schedule = schedule;
1172 if ( active ) {
1173 /* The name of this buffer should be my_buffer_index when it's free to use it */
1174
1175 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1176 gtid, my_buffer_index, sh->buffer_index) );
1177 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1178 USE_ITT_BUILD_ARG( NULL )
1179 );
1180 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1181 // *always* 32-bit integers.
1182 KMP_MB(); /* is this necessary? */
1183 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1184 gtid, my_buffer_index, sh->buffer_index) );
1185
1186 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1187 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1188#if USE_ITT_BUILD
1189 if ( pr->ordered ) {
1190 __kmp_itt_ordered_init( gtid );
1191 }; // if
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001192 // Report loop metadata
1193 if ( itt_need_metadata_reporting ) {
1194 // Only report metadata by master of active team at level 1
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001195 kmp_uint64 schedtype = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001196 switch ( schedule ) {
1197 case kmp_sch_static_chunked:
1198 case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1199 break;
1200 case kmp_sch_static_greedy:
1201 cur_chunk = pr->u.p.parm1;
1202 break;
1203 case kmp_sch_dynamic_chunked:
1204 schedtype = 1;
1205 break;
1206 case kmp_sch_guided_iterative_chunked:
1207 case kmp_sch_guided_analytical_chunked:
1208 schedtype = 2;
1209 break;
1210 default:
1211// Should we put this case under "static"?
1212// case kmp_sch_static_steal:
1213 schedtype = 3;
1214 break;
1215 }
1216 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1217 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001218#endif /* USE_ITT_BUILD */
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001219 }; // if
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001220
Jim Cownie5e8470a2013-09-27 10:38:44 +00001221 #ifdef KMP_DEBUG
1222 {
1223 const char * buff;
1224 // create format specifiers before the debug output
1225 buff = __kmp_str_format(
1226 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1227 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1228 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1229 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1230 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1231 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1232 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1233 KD_TRACE(10, ( buff,
1234 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1235 pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1236 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1237 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1238 __kmp_str_free( &buff );
1239 }
1240 #endif
1241 #if ( KMP_STATIC_STEAL_ENABLED )
    // It cannot be guaranteed that after execution of a loop with some other schedule kind
    // all the parm3 variables will contain the same value.
    // Even if all parm3 were the same, there would still be a bad case, such as using 0 and 1
    // rather than a program-lifetime increment.
    // So a dedicated variable is required; the 'static_steal_counter' is used.
1247 if( schedule == kmp_sch_static_steal ) {
1248 // Other threads will inspect this variable when searching for a victim.
1249 // This is a flag showing that other threads may steal from this thread since then.
1250 volatile T * p = &pr->u.p.static_steal_counter;
1251 *p = *p + 1;
1252 }
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001253 #endif // ( KMP_STATIC_STEAL_ENABLED )
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001254
1255#if OMPT_SUPPORT && OMPT_TRACE
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001256 if (ompt_enabled &&
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001257 ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1258 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1259 ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1260 ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1261 team_info->parallel_id, task_info->task_id, team_info->microtask);
1262 }
1263#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +00001264}
1265
1266/*
1267 * For ordered loops, either __kmp_dispatch_finish() should be called after
1268 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1269 * every chunk of iterations. If the ordered section(s) were not executed
1270 * for this iteration (or every iteration in this chunk), we need to set the
1271 * ordered iteration counters so that the next thread can proceed.
1272 */
1273template< typename UT >
1274static void
1275__kmp_dispatch_finish( int gtid, ident_t *loc )
1276{
1277 typedef typename traits_t< UT >::signed_t ST;
1278 kmp_info_t *th = __kmp_threads[ gtid ];
1279
1280 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1281 if ( ! th -> th.th_team -> t.t_serialized ) {
1282
1283 dispatch_private_info_template< UT > * pr =
1284 reinterpret_cast< dispatch_private_info_template< UT >* >
1285 ( th->th.th_dispatch->th_dispatch_pr_current );
1286 dispatch_shared_info_template< UT > volatile * sh =
1287 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1288 ( th->th.th_dispatch->th_dispatch_sh_current );
1289 KMP_DEBUG_ASSERT( pr );
1290 KMP_DEBUG_ASSERT( sh );
1291 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1292 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1293
1294 if ( pr->ordered_bumped ) {
1295 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1296 gtid ) );
1297 pr->ordered_bumped = 0;
1298 } else {
1299 UT lower = pr->u.p.ordered_lower;
1300
1301 #ifdef KMP_DEBUG
1302 {
1303 const char * buff;
1304 // create format specifiers before the debug output
1305 buff = __kmp_str_format(
1306 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1307 traits_t< UT >::spec, traits_t< UT >::spec );
1308 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1309 __kmp_str_free( &buff );
1310 }
1311 #endif
1312
1313 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1314 USE_ITT_BUILD_ARG(NULL)
1315 );
1316 KMP_MB(); /* is this necessary? */
1317 #ifdef KMP_DEBUG
1318 {
1319 const char * buff;
1320 // create format specifiers before the debug output
1321 buff = __kmp_str_format(
1322 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1323 traits_t< UT >::spec, traits_t< UT >::spec );
1324 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1325 __kmp_str_free( &buff );
1326 }
1327 #endif
1328
1329 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1330 } // if
1331 } // if
1332 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1333}
1334
1335#ifdef KMP_GOMP_COMPAT
1336
1337template< typename UT >
1338static void
1339__kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1340{
1341 typedef typename traits_t< UT >::signed_t ST;
1342 kmp_info_t *th = __kmp_threads[ gtid ];
1343
1344 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1345 if ( ! th -> th.th_team -> t.t_serialized ) {
1346// int cid;
1347 dispatch_private_info_template< UT > * pr =
1348 reinterpret_cast< dispatch_private_info_template< UT >* >
1349 ( th->th.th_dispatch->th_dispatch_pr_current );
1350 dispatch_shared_info_template< UT > volatile * sh =
1351 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1352 ( th->th.th_dispatch->th_dispatch_sh_current );
1353 KMP_DEBUG_ASSERT( pr );
1354 KMP_DEBUG_ASSERT( sh );
1355 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1356 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1357
1358// for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1359 UT lower = pr->u.p.ordered_lower;
1360 UT upper = pr->u.p.ordered_upper;
1361 UT inc = upper - lower + 1;
1362
1363 if ( pr->ordered_bumped == inc ) {
1364 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1365 gtid ) );
1366 pr->ordered_bumped = 0;
1367 } else {
1368 inc -= pr->ordered_bumped;
1369
1370 #ifdef KMP_DEBUG
1371 {
1372 const char * buff;
1373 // create format specifiers before the debug output
1374 buff = __kmp_str_format(
1375 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1376 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1377 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1378 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1379 __kmp_str_free( &buff );
1380 }
1381 #endif
1382
1383 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1384 USE_ITT_BUILD_ARG(NULL)
1385 );
1386
1387 KMP_MB(); /* is this necessary? */
1388 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1389 gtid ) );
1390 pr->ordered_bumped = 0;
1391//!!!!! TODO check if the inc should be unsigned, or signed???
1392 #ifdef KMP_DEBUG
1393 {
1394 const char * buff;
1395 // create format specifiers before the debug output
1396 buff = __kmp_str_format(
1397 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1398 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1399 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1400 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1401 __kmp_str_free( &buff );
1402 }
1403 #endif
1404
1405 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1406 }
1407// }
1408 }
1409 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1410}
1411
1412#endif /* KMP_GOMP_COMPAT */
1413
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001414/* Define a macro for exiting __kmp_dispatch_next(). If status is 0
1415 * (no more work), then tell OMPT the loop is over. In some cases
1416 * kmp_dispatch_fini() is not called. */
1417#if OMPT_SUPPORT && OMPT_TRACE
1418#define OMPT_LOOP_END \
1419 if (status == 0) { \
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001420 if (ompt_enabled && \
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001421 ompt_callbacks.ompt_callback(ompt_event_loop_end)) { \
1422 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1423 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); \
1424 ompt_callbacks.ompt_callback(ompt_event_loop_end)( \
1425 team_info->parallel_id, task_info->task_id); \
1426 } \
1427 }
1428#else
1429#define OMPT_LOOP_END // no-op
1430#endif
1431
Jim Cownie5e8470a2013-09-27 10:38:44 +00001432template< typename T >
1433static int
1434__kmp_dispatch_next(
1435 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1436) {
1437
1438 typedef typename traits_t< T >::unsigned_t UT;
1439 typedef typename traits_t< T >::signed_t ST;
1440 typedef typename traits_t< T >::floating_t DBL;
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001441#if ( KMP_STATIC_STEAL_ENABLED )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001442 static const int ___kmp_size_type = sizeof( UT );
Jonathan Peyton2321d572015-06-08 19:25:25 +00001443#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +00001444
    // This is potentially slightly misleading, schedule(runtime) will appear here even if the actual runtime schedule
    // is static. (Which points out a disadvantage of schedule(runtime): even when static scheduling is used it costs
    // more than a compile-time choice to use static scheduling would.)
Jonathan Peyton11dc82f2016-05-05 16:15:57 +00001448 KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
Jonathan Peyton45be4502015-08-11 21:36:41 +00001449
Jim Cownie5e8470a2013-09-27 10:38:44 +00001450 int status;
1451 dispatch_private_info_template< T > * pr;
1452 kmp_info_t * th = __kmp_threads[ gtid ];
1453 kmp_team_t * team = th -> th.th_team;
1454
Andrey Churbanov9ad5c3a2015-07-13 17:52:41 +00001455 KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
Jim Cownie5e8470a2013-09-27 10:38:44 +00001456 #ifdef KMP_DEBUG
1457 {
1458 const char * buff;
1459 // create format specifiers before the debug output
1460 buff = __kmp_str_format(
1461 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1462 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1463 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1464 __kmp_str_free( &buff );
1465 }
1466 #endif
1467
1468 if ( team -> t.t_serialized ) {
        /* NOTE: serialize this dispatch because we are not at the active level */
1470 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1471 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1472 KMP_DEBUG_ASSERT( pr );
1473
1474 if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1475 *p_lb = 0;
1476 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001477// if ( p_last != NULL )
1478// *p_last = 0;
1479 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001480 *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001481 if ( __kmp_env_consistency_check ) {
1482 if ( pr->pushed_ws != ct_none ) {
1483 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1484 }
1485 }
1486 } else if ( pr->nomerge ) {
1487 kmp_int32 last;
1488 T start;
1489 UT limit, trip, init;
1490 ST incr;
1491 T chunk = pr->u.p.parm1;
1492
1493 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1494
1495 init = chunk * pr->u.p.count++;
1496 trip = pr->u.p.tc - 1;
1497
1498 if ( (status = (init <= trip)) == 0 ) {
1499 *p_lb = 0;
1500 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001501// if ( p_last != NULL )
1502// *p_last = 0;
1503 if ( p_st != NULL )
1504 *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001505 if ( __kmp_env_consistency_check ) {
1506 if ( pr->pushed_ws != ct_none ) {
1507 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1508 }
1509 }
1510 } else {
1511 start = pr->u.p.lb;
1512 limit = chunk + init - 1;
1513 incr = pr->u.p.st;
1514
1515 if ( (last = (limit >= trip)) != 0 ) {
1516 limit = trip;
1517 #if KMP_OS_WINDOWS
1518 pr->u.p.last_upper = pr->u.p.ub;
1519 #endif /* KMP_OS_WINDOWS */
1520 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001521 if ( p_last != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001522 *p_last = last;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001523 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001524 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001525 if ( incr == 1 ) {
1526 *p_lb = start + init;
1527 *p_ub = start + limit;
1528 } else {
1529 *p_lb = start + init * incr;
1530 *p_ub = start + limit * incr;
1531 }
1532
1533 if ( pr->ordered ) {
1534 pr->u.p.ordered_lower = init;
1535 pr->u.p.ordered_upper = limit;
1536 #ifdef KMP_DEBUG
1537 {
1538 const char * buff;
1539 // create format specifiers before the debug output
1540 buff = __kmp_str_format(
1541 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1542 traits_t< UT >::spec, traits_t< UT >::spec );
1543 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1544 __kmp_str_free( &buff );
1545 }
1546 #endif
1547 } // if
1548 } // if
1549 } else {
1550 pr->u.p.tc = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001551 *p_lb = pr->u.p.lb;
1552 *p_ub = pr->u.p.ub;
1553 #if KMP_OS_WINDOWS
1554 pr->u.p.last_upper = *p_ub;
1555 #endif /* KMP_OS_WINDOWS */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001556 if ( p_last != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001557 *p_last = TRUE;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001558 if ( p_st != NULL )
1559 *p_st = pr->u.p.st;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001560 } // if
1561 #ifdef KMP_DEBUG
1562 {
1563 const char * buff;
1564 // create format specifiers before the debug output
1565 buff = __kmp_str_format(
1566 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001567 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
Jim Cownie5e8470a2013-09-27 10:38:44 +00001568 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001569 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
Jim Cownie5e8470a2013-09-27 10:38:44 +00001570 __kmp_str_free( &buff );
1571 }
1572 #endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001573#if INCLUDE_SSC_MARKS
1574 SSC_MARK_DISPATCH_NEXT();
1575#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001576 OMPT_LOOP_END;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001577 return status;
1578 } else {
1579 kmp_int32 last = 0;
1580 dispatch_shared_info_template< UT > *sh;
1581 T start;
1582 ST incr;
1583 UT limit, trip, init;
1584
1585 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1586 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1587
1588 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1589 ( th->th.th_dispatch->th_dispatch_pr_current );
1590 KMP_DEBUG_ASSERT( pr );
1591 sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1592 ( th->th.th_dispatch->th_dispatch_sh_current );
1593 KMP_DEBUG_ASSERT( sh );
1594
1595 if ( pr->u.p.tc == 0 ) {
1596 // zero trip count
1597 status = 0;
1598 } else {
1599 switch (pr->schedule) {
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001600 #if ( KMP_STATIC_STEAL_ENABLED )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001601 case kmp_sch_static_steal:
1602 {
1603 T chunk = pr->u.p.parm1;
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001604 int nproc = th->th.th_team_nproc;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001605
1606 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1607
1608 trip = pr->u.p.tc - 1;
1609
1610 if ( ___kmp_size_type > 4 ) {
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001611 // use lock for 8-byte and CAS for 4-byte induction
1612 // variable. TODO (optional): check and use 16-byte CAS
1613 kmp_lock_t * lck = th->th.th_dispatch->th_steal_lock;
1614 KMP_DEBUG_ASSERT(lck != NULL);
1615 if( pr->u.p.count < (UT)pr->u.p.ub ) {
1616 __kmp_acquire_lock(lck, gtid);
1617 // try to get own chunk of iterations
1618 init = ( pr->u.p.count )++;
1619 status = ( init < (UT)pr->u.p.ub );
1620 __kmp_release_lock(lck, gtid);
1621 } else {
1622 status = 0; // no own chunks
1623 }
1624 if( !status ) { // try to steal
1625 kmp_info_t **other_threads = team->t.t_threads;
1626 int while_limit = nproc; // nproc attempts to find a victim
1627 int while_index = 0;
1628 // TODO: algorithm of searching for a victim
1629 // should be cleaned up and measured
1630 while ( ( !status ) && ( while_limit != ++while_index ) ) {
1631 T remaining;
1632 T victimIdx = pr->u.p.parm4;
1633 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1634 dispatch_private_info_template< T > * victim =
1635 reinterpret_cast< dispatch_private_info_template< T >* >
1636 (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
1637 while( ( victim == NULL || victim == pr ||
1638 ( *(volatile T*)&victim->u.p.static_steal_counter !=
1639 *(volatile T*)&pr->u.p.static_steal_counter ) ) &&
1640 oldVictimIdx != victimIdx )
1641 {
1642 victimIdx = (victimIdx + 1) % nproc;
1643 victim = reinterpret_cast< dispatch_private_info_template< T >* >
1644 (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
1645 };
1646 if( !victim ||
1647 ( *(volatile T *)&victim->u.p.static_steal_counter !=
1648 *(volatile T *)&pr->u.p.static_steal_counter ) )
1649 {
1650 continue; // try once more (nproc attempts in total)
1651 // no victim is ready yet to participate in stealing
1652 // because all victims are still in kmp_init_dispatch
1653 }
1654 if( victim->u.p.count + 2 > (UT)victim->u.p.ub ) {
1655 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1656 continue; // not enough chunks to steal, goto next victim
1657 }
1658
1659 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1660 KMP_ASSERT(lck != NULL);
1661 __kmp_acquire_lock(lck, gtid);
1662 limit = victim->u.p.ub; // keep initial ub
1663 if( victim->u.p.count >= limit ||
1664 (remaining = limit - victim->u.p.count) < 2 )
1665 {
1666 __kmp_release_lock(lck, gtid);
1667 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1668 continue; // not enough chunks to steal
1669 }
 1670                             // stealing succeeded, reduce victim's ub by 1/4 of the undone chunks, or by 1
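                              // Worked example (illustration only): if the victim has count=10 and ub=50,
                              // remaining is 40 and we steal 40>>2 = 10 chunks: the victim's ub drops to 40,
                              // we execute chunk 40 right away and keep chunk indices 41..49 as our own range.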
1671 if( remaining > 3 ) {
1672 init = ( victim->u.p.ub -= (remaining>>2) ); // steal 1/4 of remaining
1673 } else {
1674 init = ( victim->u.p.ub -= 1 ); // steal 1 chunk of 2 or 3 remaining
1675 }
1676 __kmp_release_lock(lck, gtid);
1677
1678 KMP_DEBUG_ASSERT(init + 1 <= limit);
1679 pr->u.p.parm4 = victimIdx; // remember victim to steal from
1680 status = 1;
1681 while_index = 0;
 1682                             // now update own count and ub with the stolen range, excluding the init chunk just claimed
1683 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1684 pr->u.p.count = init + 1;
1685 pr->u.p.ub = limit;
1686 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1687 } // while (search for victim)
1688 } // if (try to find victim and steal)
Jim Cownie5e8470a2013-09-27 10:38:44 +00001689 } else {
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001690 // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
Jim Cownie5e8470a2013-09-27 10:38:44 +00001691 typedef union {
1692 struct {
1693 UT count;
1694 T ub;
1695 } p;
1696 kmp_int64 b;
1697 } union_i4;
 1698                         // All operations on 'count' or 'ub' must read/update the pair atomically, as one 64-bit value.
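                          // Packing 'count' and 'ub' into one 64-bit word lets a thread claim the next chunk
                          // (count++) and check it against the current upper bound with a single
                          // KMP_COMPARE_AND_STORE_ACQ64; the CAS fails and is retried if either field changed
                          // concurrently (e.g. a thief lowering 'ub').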
Jim Cownie5e8470a2013-09-27 10:38:44 +00001699 {
1700 union_i4 vold, vnew;
1701 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1702 vnew = vold;
1703 vnew.p.count++;
1704 while( ! KMP_COMPARE_AND_STORE_ACQ64(
1705 ( volatile kmp_int64* )&pr->u.p.count,
1706 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1707 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1708 KMP_CPU_PAUSE();
1709 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1710 vnew = vold;
1711 vnew.p.count++;
1712 }
1713 vnew = vold;
1714 init = vnew.p.count;
1715 status = ( init < (UT)vnew.p.ub ) ;
1716 }
1717
1718 if( !status ) {
1719 kmp_info_t **other_threads = team->t.t_threads;
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001720 int while_limit = nproc; // nproc attempts to find a victim
Jim Cownie5e8470a2013-09-27 10:38:44 +00001721 int while_index = 0;
1722
1723 // TODO: algorithm of searching for a victim
1724 // should be cleaned up and measured
1725 while ( ( !status ) && ( while_limit != ++while_index ) ) {
1726 union_i4 vold, vnew;
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001727 kmp_int32 remaining;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001728 T victimIdx = pr->u.p.parm4;
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001729 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1730 dispatch_private_info_template< T > * victim =
1731 reinterpret_cast< dispatch_private_info_template< T >* >
1732 (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
1733 while( (victim == NULL || victim == pr ||
1734 (*(volatile T*)&victim->u.p.static_steal_counter !=
1735 *(volatile T*)&pr->u.p.static_steal_counter)) &&
1736 oldVictimIdx != victimIdx )
1737 {
1738 victimIdx = (victimIdx + 1) % nproc;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001739 victim = reinterpret_cast< dispatch_private_info_template< T >* >
1740 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001741 };
1742 if( !victim ||
1743 ( *(volatile T *)&victim->u.p.static_steal_counter !=
1744 *(volatile T *)&pr->u.p.static_steal_counter ) )
1745 {
1746 continue; // try once more (nproc attempts in total)
1747 // no victim is ready yet to participate in stealing
1748 // because all victims are still in kmp_init_dispatch
Jim Cownie5e8470a2013-09-27 10:38:44 +00001749 }
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001750 pr->u.p.parm4 = victimIdx; // new victim found
1751 while( 1 ) { // CAS loop if victim has enough chunks to steal
Jim Cownie5e8470a2013-09-27 10:38:44 +00001752 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1753 vnew = vold;
1754
1755 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001756 if ( vnew.p.count >= (UT)vnew.p.ub ||
1757 (remaining = vnew.p.ub - vnew.p.count) < 2 )
1758 {
1759 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1760 break; // not enough chunks to steal, goto next victim
Jim Cownie5e8470a2013-09-27 10:38:44 +00001761 }
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001762 if( remaining > 3 ) {
1763 vnew.p.ub -= (remaining>>2); // try to steal 1/4 of remaining
1764 } else {
1765 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1766 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00001767 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001768 // TODO: Should this be acquire or release?
1769 if ( KMP_COMPARE_AND_STORE_ACQ64(
1770 ( volatile kmp_int64 * )&victim->u.p.count,
1771 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1772 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001773                                 // stealing succeeded
Jim Cownie5e8470a2013-09-27 10:38:44 +00001774 status = 1;
1775 while_index = 0;
1776 // now update own count and ub
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001777 init = vnew.p.ub;
1778 vold.p.count = init + 1;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001779 #if KMP_ARCH_X86
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001780 KMP_XCHG_FIXED64(( volatile kmp_int64 * )(&pr->u.p.count), vold.b);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001781 #else
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001782 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1783 #endif
Jim Cownie5e8470a2013-09-27 10:38:44 +00001784 break;
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001785 } // if (check CAS result)
 1786                                 KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
1787 } // while (try to steal from particular victim)
1788 } // while (search for victim)
1789 } // if (try to find victim and steal)
1790 } // if (4-byte induction variable)
Jim Cownie5e8470a2013-09-27 10:38:44 +00001791 if ( !status ) {
1792 *p_lb = 0;
1793 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001794 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001795 } else {
1796 start = pr->u.p.parm2;
1797 init *= chunk;
1798 limit = chunk + init - 1;
1799 incr = pr->u.p.st;
1800
1801 KMP_DEBUG_ASSERT(init <= trip);
1802 if ( (last = (limit >= trip)) != 0 )
1803 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001804 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001805
1806 if ( incr == 1 ) {
1807 *p_lb = start + init;
1808 *p_ub = start + limit;
1809 } else {
1810 *p_lb = start + init * incr;
1811 *p_ub = start + limit * incr;
1812 }
1813
1814 if ( pr->ordered ) {
1815 pr->u.p.ordered_lower = init;
1816 pr->u.p.ordered_upper = limit;
1817 #ifdef KMP_DEBUG
1818 {
1819 const char * buff;
1820 // create format specifiers before the debug output
1821 buff = __kmp_str_format(
1822 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1823 traits_t< UT >::spec, traits_t< UT >::spec );
1824 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1825 __kmp_str_free( &buff );
1826 }
1827 #endif
1828 } // if
1829 } // if
1830 break;
1831 } // case
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001832 #endif // ( KMP_STATIC_STEAL_ENABLED )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001833 case kmp_sch_static_balanced:
1834 {
1835 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1836 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1837 pr->u.p.count = 1;
1838 *p_lb = pr->u.p.lb;
1839 *p_ub = pr->u.p.ub;
1840 last = pr->u.p.parm1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001841 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001842 *p_st = pr->u.p.st;
1843 } else { /* no iterations to do */
1844 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1845 }
1846 if ( pr->ordered ) {
1847 #ifdef KMP_DEBUG
1848 {
1849 const char * buff;
1850 // create format specifiers before the debug output
1851 buff = __kmp_str_format(
1852 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1853 traits_t< UT >::spec, traits_t< UT >::spec );
1854 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1855 __kmp_str_free( &buff );
1856 }
1857 #endif
1858 } // if
1859 } // case
1860 break;
1861 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1862 case kmp_sch_static_chunked:
1863 {
1864 T parm1;
1865
1866 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1867 gtid ) );
1868 parm1 = pr->u.p.parm1;
1869
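                    // Chunks of size parm1 are handed out round-robin: thread 'tid' takes chunk indices
                    // tid, tid+nproc, tid+2*nproc, ... (pr->u.p.count is advanced by nproc below).
                    // For the merged static_greedy case parm1 is typically large enough that each thread
                    // executes at most one chunk.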
1870 trip = pr->u.p.tc - 1;
1871 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1872
1873 if ( (status = (init <= trip)) != 0 ) {
1874 start = pr->u.p.lb;
1875 incr = pr->u.p.st;
1876 limit = parm1 + init - 1;
1877
1878 if ( (last = (limit >= trip)) != 0 )
1879 limit = trip;
1880
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001881 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001882
Jonathan Peytonff5ca8b2016-06-21 18:30:15 +00001883 pr->u.p.count += th->th.th_team_nproc;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001884
1885 if ( incr == 1 ) {
1886 *p_lb = start + init;
1887 *p_ub = start + limit;
1888 }
1889 else {
1890 *p_lb = start + init * incr;
1891 *p_ub = start + limit * incr;
1892 }
1893
1894 if ( pr->ordered ) {
1895 pr->u.p.ordered_lower = init;
1896 pr->u.p.ordered_upper = limit;
1897 #ifdef KMP_DEBUG
1898 {
1899 const char * buff;
1900 // create format specifiers before the debug output
1901 buff = __kmp_str_format(
1902 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1903 traits_t< UT >::spec, traits_t< UT >::spec );
1904 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1905 __kmp_str_free( &buff );
1906 }
1907 #endif
1908 } // if
1909 } // if
1910 } // case
1911 break;
1912
1913 case kmp_sch_dynamic_chunked:
1914 {
1915 T chunk = pr->u.p.parm1;
1916
1917 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1918 gtid ) );
1919
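                    // Each call atomically bumps the shared iteration counter: call number i hands out
                    // chunk i, i.e. logical iterations [ i*chunk, min((i+1)*chunk, tc) - 1 ].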
1920 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1921 trip = pr->u.p.tc - 1;
1922
1923 if ( (status = (init <= trip)) == 0 ) {
1924 *p_lb = 0;
1925 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001926 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001927 } else {
1928 start = pr->u.p.lb;
1929 limit = chunk + init - 1;
1930 incr = pr->u.p.st;
1931
1932 if ( (last = (limit >= trip)) != 0 )
1933 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001934
1935 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001936
1937 if ( incr == 1 ) {
1938 *p_lb = start + init;
1939 *p_ub = start + limit;
1940 } else {
1941 *p_lb = start + init * incr;
1942 *p_ub = start + limit * incr;
1943 }
1944
1945 if ( pr->ordered ) {
1946 pr->u.p.ordered_lower = init;
1947 pr->u.p.ordered_upper = limit;
1948 #ifdef KMP_DEBUG
1949 {
1950 const char * buff;
1951 // create format specifiers before the debug output
1952 buff = __kmp_str_format(
1953 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1954 traits_t< UT >::spec, traits_t< UT >::spec );
1955 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1956 __kmp_str_free( &buff );
1957 }
1958 #endif
1959 } // if
1960 } // if
1961 } // case
1962 break;
1963
1964 case kmp_sch_guided_iterative_chunked:
1965 {
1966 T chunkspec = pr->u.p.parm1;
1967 KD_TRACE(100,
1968 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1969 trip = pr->u.p.tc;
1970 // Start atomic part of calculations
1971 while(1) {
1972 ST remaining; // signed, because can be < 0
1973 init = sh->u.s.iteration; // shared value
1974 remaining = trip - init;
1975 if ( remaining <= 0 ) { // AC: need to compare with 0 first
1976 // nothing to do, don't try atomic op
1977 status = 0;
1978 break;
1979 }
1980 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
 1981                         // use dynamic-style schedule
 1982                         // atomically increment iterations, get old value
1983 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1984 remaining = trip - init;
1985 if (remaining <= 0) {
1986 status = 0; // all iterations got by other threads
1987 } else {
1988 // got some iterations to work on
1989 status = 1;
1990 if ( (T)remaining > chunkspec ) {
1991 limit = init + chunkspec - 1;
1992 } else {
1993 last = 1; // the last chunk
1994 limit = init + remaining - 1;
1995 } // if
1996 } // if
1997 break;
1998 } // if
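                        // Guided step: parm3 holds a precomputed factor of roughly 1/(K*nproc) (K=2 by default),
                        // so each successful CAS grabs about remaining/(K*nproc) iterations and chunk sizes
                        // shrink geometrically. E.g. with nproc=4 and 10000 iterations left, the next chunk is
                        // about 1250 iterations, the following thread sees ~8750 remaining and takes about 1/8
                        // of that, and so on, until the dynamic-style fallback above takes over.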
1999 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
2000 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
2001 // CAS was successful, chunk obtained
2002 status = 1;
2003 --limit;
2004 break;
2005 } // if
2006 } // while
2007 if ( status != 0 ) {
2008 start = pr->u.p.lb;
2009 incr = pr->u.p.st;
2010 if ( p_st != NULL )
2011 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002012 *p_lb = start + init * incr;
2013 *p_ub = start + limit * incr;
2014 if ( pr->ordered ) {
2015 pr->u.p.ordered_lower = init;
2016 pr->u.p.ordered_upper = limit;
2017 #ifdef KMP_DEBUG
2018 {
2019 const char * buff;
2020 // create format specifiers before the debug output
2021 buff = __kmp_str_format(
2022 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2023 traits_t< UT >::spec, traits_t< UT >::spec );
2024 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2025 __kmp_str_free( &buff );
2026 }
2027 #endif
2028 } // if
2029 } else {
2030 *p_lb = 0;
2031 *p_ub = 0;
2032 if ( p_st != NULL )
2033 *p_st = 0;
2034 } // if
2035 } // case
2036 break;
2037
2038 case kmp_sch_guided_analytical_chunked:
2039 {
2040 T chunkspec = pr->u.p.parm1;
2041 UT chunkIdx;
2042 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2043 /* for storing original FPCW value for Windows* OS on
2044 IA-32 architecture 8-byte version */
2045 unsigned int oldFpcw;
Jim Cownie181b4bb2013-12-23 17:28:57 +00002046 unsigned int fpcwSet = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002047 #endif
2048 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
2049 gtid ) );
2050
2051 trip = pr->u.p.tc;
2052
Jonathan Peytonff5ca8b2016-06-21 18:30:15 +00002053 KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
2054 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc < trip);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002055
2056 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
2057 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
2058 if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
2059 --trip;
2060 /* use dynamic-style scheduling */
2061 init = chunkIdx * chunkspec + pr->u.p.count;
2062 /* need to verify init > 0 in case of overflow in the above calculation */
2063 if ( (status = (init > 0 && init <= trip)) != 0 ) {
2064 limit = init + chunkspec -1;
2065
2066 if ( (last = (limit >= trip)) != 0 )
2067 limit = trip;
2068 }
2069 break;
2070 } else {
2071 /* use exponential-style scheduling */
 2072                         /* The following check is to work around the lack of long double precision on Windows* OS.
2073 This check works around the possible effect that init != 0 for chunkIdx == 0.
2074 */
2075 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2076 /* If we haven't already done so, save original
2077 FPCW and set precision to 64-bit, as Windows* OS
2078 on IA-32 architecture defaults to 53-bit */
2079 if ( !fpcwSet ) {
Jim Cownie181b4bb2013-12-23 17:28:57 +00002080 oldFpcw = _control87(0,0);
2081 _control87(_PC_64,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002082 fpcwSet = 0x30000;
2083 }
2084 #endif
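                            // Under the analytical model, __kmp_dispatch_guided_remaining(trip, base, k) gives
                            // the number of iterations still unassigned when chunk k is handed out, so
                            // trip minus that value is the first iteration of chunk k (used for init/limit below).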
2085 if ( chunkIdx ) {
2086 init = __kmp_dispatch_guided_remaining< T >(
2087 trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
2088 KMP_DEBUG_ASSERT(init);
2089 init = trip - init;
2090 } else
2091 init = 0;
2092 limit = trip - __kmp_dispatch_guided_remaining< T >(
2093 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
2094 KMP_ASSERT(init <= limit);
2095 if ( init < limit ) {
2096 KMP_DEBUG_ASSERT(limit <= trip);
2097 --limit;
2098 status = 1;
2099 break;
2100 } // if
2101 } // if
2102 } // while (1)
2103 #if KMP_OS_WINDOWS && KMP_ARCH_X86
Jim Cownie181b4bb2013-12-23 17:28:57 +00002104 /* restore FPCW if necessary
2105 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
2106 */
2107 if ( fpcwSet && ( oldFpcw & fpcwSet ) )
2108 _control87(oldFpcw,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002109 #endif
2110 if ( status != 0 ) {
2111 start = pr->u.p.lb;
2112 incr = pr->u.p.st;
2113 if ( p_st != NULL )
2114 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002115 *p_lb = start + init * incr;
2116 *p_ub = start + limit * incr;
2117 if ( pr->ordered ) {
2118 pr->u.p.ordered_lower = init;
2119 pr->u.p.ordered_upper = limit;
2120 #ifdef KMP_DEBUG
2121 {
2122 const char * buff;
2123 // create format specifiers before the debug output
2124 buff = __kmp_str_format(
2125 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2126 traits_t< UT >::spec, traits_t< UT >::spec );
2127 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2128 __kmp_str_free( &buff );
2129 }
2130 #endif
2131 }
2132 } else {
2133 *p_lb = 0;
2134 *p_ub = 0;
2135 if ( p_st != NULL )
2136 *p_st = 0;
2137 }
2138 } // case
2139 break;
2140
2141 case kmp_sch_trapezoidal:
2142 {
2143 UT index;
2144 T parm2 = pr->u.p.parm2;
2145 T parm3 = pr->u.p.parm3;
2146 T parm4 = pr->u.p.parm4;
2147 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2148 gtid ) );
2149
2150 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2151
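                    // Chunk sizes form a decreasing arithmetic sequence: chunk k (0-based) has
                    // parm2 - k*parm4 iterations and there are parm3 chunks in total, so the first
                    // 'index' chunks cover index*parm2 - parm4*index*(index-1)/2
                    // = ( index * (2*parm2 - (index-1)*parm4) ) / 2 iterations -- the formula below.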
2152 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2153 trip = pr->u.p.tc - 1;
2154
2155 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2156 *p_lb = 0;
2157 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002158 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002159 } else {
2160 start = pr->u.p.lb;
2161 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2162 incr = pr->u.p.st;
2163
2164 if ( (last = (limit >= trip)) != 0 )
2165 limit = trip;
2166
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002167 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002168
2169 if ( incr == 1 ) {
2170 *p_lb = start + init;
2171 *p_ub = start + limit;
2172 } else {
2173 *p_lb = start + init * incr;
2174 *p_ub = start + limit * incr;
2175 }
2176
2177 if ( pr->ordered ) {
2178 pr->u.p.ordered_lower = init;
2179 pr->u.p.ordered_upper = limit;
2180 #ifdef KMP_DEBUG
2181 {
2182 const char * buff;
2183 // create format specifiers before the debug output
2184 buff = __kmp_str_format(
2185 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2186 traits_t< UT >::spec, traits_t< UT >::spec );
2187 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2188 __kmp_str_free( &buff );
2189 }
2190 #endif
2191 } // if
2192 } // if
2193 } // case
2194 break;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002195 default:
2196 {
2197 status = 0; // to avoid complaints on uninitialized variable use
2198 __kmp_msg(
2199 kmp_ms_fatal, // Severity
2200 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2201 KMP_HNT( GetNewerLibrary ), // Hint
2202 __kmp_msg_null // Variadic argument list terminator
2203 );
2204 }
2205 break;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002206 } // switch
2207 } // if tc == 0;
2208
2209 if ( status == 0 ) {
2210 UT num_done;
2211
2212 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2213 #ifdef KMP_DEBUG
2214 {
2215 const char * buff;
2216 // create format specifiers before the debug output
2217 buff = __kmp_str_format(
2218 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2219 traits_t< UT >::spec );
2220 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2221 __kmp_str_free( &buff );
2222 }
2223 #endif
2224
Jonathan Peytonff5ca8b2016-06-21 18:30:15 +00002225 if ( (ST)num_done == th->th.th_team_nproc - 1 ) {
Andrey Churbanov429dbc22016-07-11 10:44:57 +00002226 #if ( KMP_STATIC_STEAL_ENABLED )
2227 if( pr->schedule == kmp_sch_static_steal && ___kmp_size_type > 4 ) {
2228 int i;
2229 kmp_info_t **other_threads = team->t.t_threads;
2230 // loop complete, safe to destroy locks used for stealing
2231 for( i = 0; i < th->th.th_team_nproc; ++i ) {
2232 kmp_lock_t * lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2233 KMP_ASSERT(lck != NULL);
2234 __kmp_destroy_lock( lck );
2235 __kmp_free( lck );
2236 other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2237 }
2238 }
2239 #endif
Jim Cownie5e8470a2013-09-27 10:38:44 +00002240 /* NOTE: release this buffer to be reused */
2241
2242 KMP_MB(); /* Flush all pending memory write invalidates. */
2243
2244 sh->u.s.num_done = 0;
2245 sh->u.s.iteration = 0;
2246
2247 /* TODO replace with general release procedure? */
2248 if ( pr->ordered ) {
2249 sh->u.s.ordered_iteration = 0;
2250 }
2251
2252 KMP_MB(); /* Flush all pending memory write invalidates. */
2253
Jonathan Peyton067325f2016-05-31 19:01:15 +00002254 sh -> buffer_index += __kmp_dispatch_num_buffers;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002255 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2256 gtid, sh->buffer_index) );
2257
2258 KMP_MB(); /* Flush all pending memory write invalidates. */
2259
2260 } // if
2261 if ( __kmp_env_consistency_check ) {
2262 if ( pr->pushed_ws != ct_none ) {
2263 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2264 }
2265 }
2266
2267 th -> th.th_dispatch -> th_deo_fcn = NULL;
2268 th -> th.th_dispatch -> th_dxo_fcn = NULL;
2269 th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2270 th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2271 } // if (status == 0)
2272#if KMP_OS_WINDOWS
2273 else if ( last ) {
2274 pr->u.p.last_upper = pr->u.p.ub;
2275 }
2276#endif /* KMP_OS_WINDOWS */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002277 if ( p_last != NULL && status != 0 )
2278 *p_last = last;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002279 } // if
2280
2281 #ifdef KMP_DEBUG
2282 {
2283 const char * buff;
2284 // create format specifiers before the debug output
2285 buff = __kmp_str_format(
2286 "__kmp_dispatch_next: T#%%d normal case: " \
2287 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2288 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2289 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2290 __kmp_str_free( &buff );
2291 }
2292 #endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002293#if INCLUDE_SSC_MARKS
2294 SSC_MARK_DISPATCH_NEXT();
2295#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00002296 OMPT_LOOP_END;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002297 return status;
2298}
2299
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002300template< typename T >
2301static void
2302__kmp_dist_get_bounds(
2303 ident_t *loc,
2304 kmp_int32 gtid,
2305 kmp_int32 *plastiter,
2306 T *plower,
2307 T *pupper,
2308 typename traits_t< T >::signed_t incr
2309) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002310 typedef typename traits_t< T >::unsigned_t UT;
2311 typedef typename traits_t< T >::signed_t ST;
2312 register kmp_uint32 team_id;
2313 register kmp_uint32 nteams;
2314 register UT trip_count;
2315 register kmp_team_t *team;
2316 kmp_info_t * th;
2317
2318 KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2319 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2320 #ifdef KMP_DEBUG
2321 {
2322 const char * buff;
2323 // create format specifiers before the debug output
2324 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2325 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2326 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2327 traits_t< T >::spec );
2328 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2329 __kmp_str_free( &buff );
2330 }
2331 #endif
2332
2333 if( __kmp_env_consistency_check ) {
2334 if( incr == 0 ) {
2335 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2336 }
2337 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2338 // The loop is illegal.
2339 // Some zero-trip loops maintained by compiler, e.g.:
2340 // for(i=10;i<0;++i) // lower >= upper - run-time check
2341 // for(i=0;i>10;--i) // lower <= upper - run-time check
2342 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2343 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2344 // Compiler does not check the following illegal loops:
2345 // for(i=0;i<10;i+=incr) // where incr<0
2346 // for(i=10;i>0;i-=incr) // where incr<0
2347 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2348 }
2349 }
2350 th = __kmp_threads[gtid];
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002351 team = th->th.th_team;
2352 #if OMP_40_ENABLED
Jonathan Peyton441f3372015-09-21 17:24:46 +00002353 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002354 nteams = th->th.th_teams_size.nteams;
2355 #endif
2356 team_id = team->t.t_master_tid;
2357 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2358
2359 // compute global trip count
2360 if( incr == 1 ) {
2361 trip_count = *pupper - *plower + 1;
2362 } else if(incr == -1) {
2363 trip_count = *plower - *pupper + 1;
Jonathan Peyton5235a1b2016-04-18 21:38:29 +00002364 } else if ( incr > 0 ) {
2365 // upper-lower can exceed the limit of signed type
2366 trip_count = (UT)(*pupper - *plower) / incr + 1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002367 } else {
Jonathan Peyton5235a1b2016-04-18 21:38:29 +00002368 trip_count = (UT)(*plower - *pupper) / ( -incr ) + 1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002369 }
Jonathan Peyton45be4502015-08-11 21:36:41 +00002370
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002371 if( trip_count <= nteams ) {
2372 KMP_DEBUG_ASSERT(
2373 __kmp_static == kmp_sch_static_greedy || \
2374 __kmp_static == kmp_sch_static_balanced
2375 ); // Unknown static scheduling type.
 2376        // only some teams get a single iteration, the others get nothing
2377 if( team_id < trip_count ) {
2378 *pupper = *plower = *plower + team_id * incr;
2379 } else {
2380 *plower = *pupper + incr; // zero-trip loop
2381 }
2382 if( plastiter != NULL )
2383 *plastiter = ( team_id == trip_count - 1 );
2384 } else {
2385 if( __kmp_static == kmp_sch_static_balanced ) {
2386 register UT chunk = trip_count / nteams;
2387 register UT extras = trip_count % nteams;
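            // Worked example (illustration only): trip_count=10, nteams=4, incr=1 gives chunk=2 and
            // extras=2, so teams 0..3 get 3, 3, 2 and 2 iterations with lower bounds
            // *plower+0, *plower+3, *plower+6 and *plower+8 respectively.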
2388 *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2389 *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2390 if( plastiter != NULL )
2391 *plastiter = ( team_id == nteams - 1 );
2392 } else {
2393 register T chunk_inc_count =
2394 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2395 register T upper = *pupper;
2396 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2397 // Unknown static scheduling type.
2398 *plower += team_id * chunk_inc_count;
2399 *pupper = *plower + chunk_inc_count - incr;
2400 // Check/correct bounds if needed
2401 if( incr > 0 ) {
2402 if( *pupper < *plower )
2403 *pupper = i_maxmin< T >::mx;
2404 if( plastiter != NULL )
2405 *plastiter = *plower <= upper && *pupper > upper - incr;
2406 if( *pupper > upper )
2407 *pupper = upper; // tracker C73258
2408 } else {
2409 if( *pupper > *plower )
2410 *pupper = i_maxmin< T >::mn;
2411 if( plastiter != NULL )
2412 *plastiter = *plower >= upper && *pupper < upper - incr;
2413 if( *pupper < upper )
2414 *pupper = upper; // tracker C73258
2415 }
2416 }
2417 }
2418}
2419
Jim Cownie5e8470a2013-09-27 10:38:44 +00002420//-----------------------------------------------------------------------------------------
2421// Dispatch routines
2422// Transfer call to template< type T >
2423// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2424// T lb, T ub, ST st, ST chunk )
2425extern "C" {
2426
2427/*!
2428@ingroup WORK_SHARING
2429@{
2430@param loc Source location
2431@param gtid Global thread id
2432@param schedule Schedule type
2433@param lb Lower bound
2434@param ub Upper bound
2435@param st Step (or increment if you prefer)
2436@param chunk The chunk size to block with
2437
2438This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2439These functions are all identical apart from the types of the arguments.
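
A minimal usage sketch (illustrative only; 'n' and 'body' are placeholders, and real
compiler-generated code differs in detail):
@code
    __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, 4 );
    kmp_int32 last, lb, ub, st;
    while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
        for ( kmp_int32 i = lb; i <= ub; i += st )
            body( i );
    }
@endcode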
2440*/
2441
2442void
2443__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2444 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2445{
2446 KMP_DEBUG_ASSERT( __kmp_init_serial );
2447 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2448}
2449/*!
2450See @ref __kmpc_dispatch_init_4
2451*/
2452void
2453__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2454 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2455{
2456 KMP_DEBUG_ASSERT( __kmp_init_serial );
2457 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2458}
2459
2460/*!
2461See @ref __kmpc_dispatch_init_4
2462*/
2463void
2464__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2465 kmp_int64 lb, kmp_int64 ub,
2466 kmp_int64 st, kmp_int64 chunk )
2467{
2468 KMP_DEBUG_ASSERT( __kmp_init_serial );
2469 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2470}
2471
2472/*!
2473See @ref __kmpc_dispatch_init_4
2474*/
2475void
2476__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2477 kmp_uint64 lb, kmp_uint64 ub,
2478 kmp_int64 st, kmp_int64 chunk )
2479{
2480 KMP_DEBUG_ASSERT( __kmp_init_serial );
2481 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2482}
2483
2484/*!
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002485See @ref __kmpc_dispatch_init_4
2486
 2487 The difference from the __kmpc_dispatch_init set of functions is that these functions
 2488 are called for the composite distribute parallel for construct. Thus, before dispatching
 2489 the regular iterations, we need to calculate the per-team iteration space.
2490
2491These functions are all identical apart from the types of the arguments.
2492*/
2493void
2494__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2495 kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2496{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002497 KMP_DEBUG_ASSERT( __kmp_init_serial );
2498 __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2499 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2500}
2501
2502void
2503__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2504 kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2505{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002506 KMP_DEBUG_ASSERT( __kmp_init_serial );
2507 __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2508 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2509}
2510
2511void
2512__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2513 kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2514{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002515 KMP_DEBUG_ASSERT( __kmp_init_serial );
2516 __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2517 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2518}
2519
2520void
2521__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2522 kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2523{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002524 KMP_DEBUG_ASSERT( __kmp_init_serial );
2525 __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2526 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2527}
2528
2529/*!
Jim Cownie5e8470a2013-09-27 10:38:44 +00002530@param loc Source code location
2531@param gtid Global thread id
2532@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2533@param p_lb Pointer to the lower bound for the next chunk of work
2534@param p_ub Pointer to the upper bound for the next chunk of work
2535@param p_st Pointer to the stride for the next chunk of work
2536@return one if there is work to be done, zero otherwise
2537
2538Get the next dynamically allocated chunk of work for this thread.
2539If there is no more work, then the lb,ub and stride need not be modified.
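Typically called in a loop until it returns zero; see the usage sketch under @ref __kmpc_dispatch_init_4.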
2540*/
2541int
2542__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2543 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2544{
2545 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2546}
2547
2548/*!
2549See @ref __kmpc_dispatch_next_4
2550*/
2551int
2552__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2553 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2554{
2555 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2556}
2557
2558/*!
2559See @ref __kmpc_dispatch_next_4
2560*/
2561int
2562__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2563 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2564{
2565 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2566}
2567
2568/*!
2569See @ref __kmpc_dispatch_next_4
2570*/
2571int
2572__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2573 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2574{
2575 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2576}
2577
2578/*!
2579@param loc Source code location
2580@param gtid Global thread id
2581
2582Mark the end of a dynamic loop.
2583*/
2584void
2585__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2586{
2587 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2588}
2589
2590/*!
2591See @ref __kmpc_dispatch_fini_4
2592*/
2593void
2594__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2595{
2596 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2597}
2598
2599/*!
2600See @ref __kmpc_dispatch_fini_4
2601*/
2602void
2603__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2604{
2605 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2606}
2607
2608/*!
2609See @ref __kmpc_dispatch_fini_4
2610*/
2611void
2612__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2613{
2614 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2615}
2616/*! @} */
2617
2618//-----------------------------------------------------------------------------------------
Jonathan Peytonde4749b2016-12-14 23:01:24 +00002619 // Non-template routines from kmp_dispatch.cpp used in other sources
Jim Cownie5e8470a2013-09-27 10:38:44 +00002620
2621kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2622 return value == checker;
2623}
2624
2625kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2626 return value != checker;
2627}
2628
2629kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2630 return value < checker;
2631}
2632
2633kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2634 return value >= checker;
2635}
2636
2637kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2638 return value <= checker;
2639}
Jim Cownie5e8470a2013-09-27 10:38:44 +00002640
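// Spin until pred( *spinner, checker ) is satisfied, yielding after spinning for a while or when
// the machine is oversubscribed; returns the last value read from *spinner.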
2641kmp_uint32
2642__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2643 kmp_uint32 checker,
2644 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2645 , void * obj // Higher-level synchronization object, or NULL.
2646 )
2647{
2648 // note: we may not belong to a team at this point
2649 register volatile kmp_uint32 * spin = spinner;
2650 register kmp_uint32 check = checker;
2651 register kmp_uint32 spins;
2652 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2653 register kmp_uint32 r;
2654
2655 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2656 KMP_INIT_YIELD( spins );
2657 // main wait spin loop
2658 while(!f(r = TCR_4(*spin), check)) {
2659 KMP_FSYNC_SPIN_PREPARE( obj );
2660 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2661 It causes problems with infinite recursion because of exit lock */
2662 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2663 __kmp_abort_thread(); */
2664
Jim Cownie5e8470a2013-09-27 10:38:44 +00002665 /* if we have waited a bit, or are oversubscribed, yield */
2666 /* pause is in the following code */
2667 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2668 KMP_YIELD_SPIN( spins );
2669 }
2670 KMP_FSYNC_SPIN_ACQUIRED( obj );
2671 return r;
2672}
2673
Paul Osmialowskif7cc6af2016-05-31 20:20:32 +00002674void
2675__kmp_wait_yield_4_ptr(void *spinner,
2676 kmp_uint32 checker,
2677 kmp_uint32 (*pred)( void *, kmp_uint32 ),
2678 void *obj // Higher-level synchronization object, or NULL.
2679 )
2680{
2681 // note: we may not belong to a team at this point
2682 register void *spin = spinner;
2683 register kmp_uint32 check = checker;
2684 register kmp_uint32 spins;
2685 register kmp_uint32 (*f) ( void *, kmp_uint32 ) = pred;
2686
2687 KMP_FSYNC_SPIN_INIT( obj, spin );
2688 KMP_INIT_YIELD( spins );
2689 // main wait spin loop
2690 while ( !f( spin, check ) ) {
2691 KMP_FSYNC_SPIN_PREPARE( obj );
2692 /* if we have waited a bit, or are oversubscribed, yield */
2693 /* pause is in the following code */
2694 KMP_YIELD( TCR_4( __kmp_nth ) > __kmp_avail_proc );
2695 KMP_YIELD_SPIN( spins );
2696 }
2697 KMP_FSYNC_SPIN_ACQUIRED( obj );
2698}
2699
Jim Cownie5e8470a2013-09-27 10:38:44 +00002700} // extern "C"
2701
2702#ifdef KMP_GOMP_COMPAT
2703
2704void
2705__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2706 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2707 kmp_int32 chunk, int push_ws )
2708{
2709 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2710 push_ws );
2711}
2712
2713void
2714__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2715 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2716 kmp_int32 chunk, int push_ws )
2717{
2718 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2719 push_ws );
2720}
2721
2722void
2723__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2724 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2725 kmp_int64 chunk, int push_ws )
2726{
2727 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2728 push_ws );
2729}
2730
2731void
2732__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2733 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2734 kmp_int64 chunk, int push_ws )
2735{
2736 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2737 push_ws );
2738}
2739
2740void
2741__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2742{
2743 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2744}
2745
2746void
2747__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2748{
2749 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2750}
2751
2752void
2753__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2754{
2755 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2756}
2757
2758void
2759__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2760{
2761 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2762}
2763
2764#endif /* KMP_GOMP_COMPAT */
2765
2766/* ------------------------------------------------------------------------ */
2767/* ------------------------------------------------------------------------ */
2768