/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


/*
 * Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 *
 */

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

// Need to raise Win version from XP to Vista here for support of InterlockedExchange64
#if defined(_WIN32_WINNT) && defined(_M_IX86)
#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0502
#endif

#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_str.h"
#include "kmp_error.h"
#include "kmp_stats.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
  #include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#if KMP_STATIC_STEAL_ENABLED

    // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
    template< typename T >
    struct dispatch_private_infoXX_template {
        typedef typename traits_t< T >::unsigned_t  UT;
        typedef typename traits_t< T >::signed_t    ST;
        UT count;   // unsigned
        T  ub;
        /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
        T  lb;
        ST st;      // signed
        UT tc;      // unsigned
        T  static_steal_counter;   // for static_steal only; maybe better to put after ub

        /* parm[1-4] are used in different ways by different scheduling algorithms */

        // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
        //    a) parm3 is properly aligned and
        //    b) all parm1-4 are in the same cache line.
        // Because parm1-4 are used together, performance seems to be better
        // if they are in the same cache line (not measured though).

        struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
            T  parm1;
            T  parm2;
            T  parm3;
            T  parm4;
        };

        UT ordered_lower;   // unsigned
        UT ordered_upper;   // unsigned
        #if KMP_OS_WINDOWS
        T  last_upper;
        #endif /* KMP_OS_WINDOWS */
    };

#else /* KMP_STATIC_STEAL_ENABLED */

    // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
    template< typename T >
    struct dispatch_private_infoXX_template {
        typedef typename traits_t< T >::unsigned_t  UT;
        typedef typename traits_t< T >::signed_t    ST;
        T  lb;
        T  ub;
        ST st;      // signed
        UT tc;      // unsigned

        T  parm1;
        T  parm2;
        T  parm3;
        T  parm4;

        UT count;   // unsigned

        UT ordered_lower;   // unsigned
        UT ordered_upper;   // unsigned
        #if KMP_OS_WINDOWS
        T  last_upper;
        #endif /* KMP_OS_WINDOWS */
    };

#endif /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info structure and dispatch_private_info_t type
template< typename T >
struct KMP_ALIGN_CACHE dispatch_private_info_template {
    // duplicate alignment here, otherwise size of structure is not correct in our compiler
    union KMP_ALIGN_CACHE private_info_tmpl {
        dispatch_private_infoXX_template< T > p;
        dispatch_private_info64_t             p64;
    } u;
    enum sched_type schedule;  /* scheduling algorithm */
    kmp_uint32      ordered;   /* ordered clause specified */
    kmp_uint32      ordered_bumped;
    kmp_int32       ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
    dispatch_private_info * next; /* stack of buffers for nest of serial regions */
    kmp_uint32      nomerge;   /* don't merge iters if serialized */
    kmp_uint32      type_size;
    enum cons_type  pushed_ws;
};


// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
template< typename UT >
struct dispatch_shared_infoXX_template {
    /* chunk index under dynamic, number of idle threads under static-steal;
       iteration index otherwise */
    volatile UT     iteration;
    volatile UT     num_done;
    volatile UT     ordered_iteration;
    UT   ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size making ordered_iteration scalar
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template< typename UT >
struct dispatch_shared_info_template {
    // we need union here to keep the structure size
    union shared_info_tmpl {
        dispatch_shared_infoXX_template< UT > s;
        dispatch_shared_info64_t              s64;
    } u;
    volatile kmp_uint32     buffer_index;
#if OMP_45_ENABLED
    volatile kmp_int32      doacross_buf_idx;  // teamwise index
    kmp_uint32             *doacross_flags;    // array of iteration flags (0/1)
    kmp_int32               doacross_num_done; // count finished threads
#endif
#if KMP_USE_HWLOC
    // When linking with libhwloc, the ORDERED EPCC test slows down on big
    // machines (> 48 cores). Performance analysis showed that a cache thrash
    // was occurring and this padding helps alleviate the problem.
    char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_add( volatile T *p, T d );

template<>
__forceinline kmp_int32
test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_ADD32( p, d );
    return r;
}

template<>
__forceinline kmp_int64
test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_ADD64( p, d );
    return r;
}

// test_then_inc_acq template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_inc_acq( volatile T *p );

template<>
__forceinline kmp_int32
test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC_ACQ32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC_ACQ64( p );
    return r;
}

// test_then_inc template (general template should NOT be used)
template< typename T >
static __forceinline T
test_then_inc( volatile T *p );

template<>
__forceinline kmp_int32
test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
{
    kmp_int32 r;
    r = KMP_TEST_THEN_INC32( p );
    return r;
}

template<>
__forceinline kmp_int64
test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
{
    kmp_int64 r;
    r = KMP_TEST_THEN_INC64( p );
    return r;
}

// compare_and_swap template (general template should NOT be used)
template< typename T >
static __forceinline kmp_int32
compare_and_swap( volatile T *p, T c, T s );

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
{
    return KMP_COMPARE_AND_STORE_REL32( p, c, s );
}

template<>
__forceinline kmp_int32
compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
{
    return KMP_COMPARE_AND_STORE_REL64( p, c, s );
}

/*
    Spin wait loop that first does pause, then yield.
    Waits until function returns non-zero when called with *spinner and check.
    Does NOT put threads to sleep.
#if USE_ITT_BUILD
    Arguments:
        obj -- is a higher-level synchronization object to report to ittnotify. It is used to report
            locks consistently. For example, if the lock is acquired immediately, its address is
            reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
            immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the
            same address, not the address of the low-level spinner.
#endif // USE_ITT_BUILD
*/
template< typename UT >
// ToDo: make inline function (move to header file for icl)
static UT  // unsigned 4- or 8-byte type
__kmp_wait_yield( volatile UT * spinner,
                  UT            checker,
                  kmp_uint32 (* pred)( UT, UT )
                  USE_ITT_BUILD_ARG(void * obj)   // Higher-level synchronization object, or NULL.
                  )
{
    // note: we may not belong to a team at this point
    register volatile UT         * spin  = spinner;
    register          UT           check = checker;
    register          kmp_uint32   spins;
    register          kmp_uint32 (*f) ( UT, UT ) = pred;
    register          UT           r;

    KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
    KMP_INIT_YIELD( spins );
    // main wait spin loop
    while(!f(r = *spin, check))
    {
        KMP_FSYNC_SPIN_PREPARE( obj );
        /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
           It causes problems with infinite recursion because of exit lock */
        /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
            __kmp_abort_thread(); */

        // if we are oversubscribed,
        // or have waited a bit (and KMP_LIBRARY=throughput), then yield
        // pause is in the following code
        KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
        KMP_YIELD_SPIN( spins );
    }
    KMP_FSYNC_SPIN_ACQUIRED( obj );
    return r;
}
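// Usage note (illustrative, mirroring a call made later in this file): callers pass one of the
// comparison predicates defined below, e.g.
//     __kmp_wait_yield< kmp_uint32 >( &sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 > );
// which spins (pause/yield, never sleeping) until the shared buffer index reaches the caller's value.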

template< typename UT >
static kmp_uint32 __kmp_eq( UT value, UT checker) {
    return value == checker;
}

template< typename UT >
static kmp_uint32 __kmp_neq( UT value, UT checker) {
    return value != checker;
}

template< typename UT >
static kmp_uint32 __kmp_lt( UT value, UT checker) {
    return value < checker;
}

template< typename UT >
static kmp_uint32 __kmp_ge( UT value, UT checker) {
    return value >= checker;
}

template< typename UT >
static kmp_uint32 __kmp_le( UT value, UT checker) {
    return value <= checker;
}


/* ------------------------------------------------------------------------ */
/* ------------------------------------------------------------------------ */

static void
__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    kmp_info_t *th;

    KMP_DEBUG_ASSERT( gtid_ref );

    if ( __kmp_env_consistency_check ) {
        th = __kmp_threads[*gtid_ref];
        if ( th -> th.th_root -> r.r_active
          && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
#if KMP_USE_DYNAMIC_LOCK
            __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
#else
            __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
#endif
        }
    }
}

template< typename UT >
static void
__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    typedef typename traits_t< UT >::signed_t    ST;
    dispatch_private_info_template< UT > * pr;

    int gtid = *gtid_ref;
//  int  cid = *cid_ref;
    kmp_info_t *th = __kmp_threads[ gtid ];
    KMP_DEBUG_ASSERT( th -> th.th_dispatch );

    KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
    if ( __kmp_env_consistency_check ) {
        pr = reinterpret_cast< dispatch_private_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_pr_current );
        if ( pr -> pushed_ws != ct_none ) {
#if KMP_USE_DYNAMIC_LOCK
            __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
#else
            __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
#endif
        }
    }

    if ( ! th -> th.th_team -> t.t_serialized ) {
        dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_sh_current );
        UT  lower;

        if ( ! __kmp_env_consistency_check ) {
            pr = reinterpret_cast< dispatch_private_info_template< UT >* >
                ( th -> th.th_dispatch -> th_dispatch_pr_current );
        }
        lower = pr->u.p.ordered_lower;

        #if ! defined( KMP_GOMP_COMPAT )
            if ( __kmp_env_consistency_check ) {
                if ( pr->ordered_bumped ) {
                    struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
                    __kmp_error_construct2(
                        kmp_i18n_msg_CnsMultipleNesting,
                        ct_ordered_in_pdo, loc_ref,
                        & p->stack_data[ p->w_top ]
                    );
                }
            }
        #endif /* !defined(KMP_GOMP_COMPAT) */

        KMP_MB();
        #ifdef KMP_DEBUG
        {
            const char * buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format(
                "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
                traits_t< UT >::spec, traits_t< UT >::spec );
            KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
            __kmp_str_free( &buff );
        }
        #endif

        __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
                                USE_ITT_BUILD_ARG( NULL )
                                );
        KMP_MB();  /* is this necessary? */
        #ifdef KMP_DEBUG
        {
            const char * buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format(
                "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
                traits_t< UT >::spec, traits_t< UT >::spec );
            KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
            __kmp_str_free( &buff );
        }
        #endif
    }
    KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
}

static void
__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    kmp_info_t *th;

    if ( __kmp_env_consistency_check ) {
        th = __kmp_threads[*gtid_ref];
        if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
            __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
        }
    }
}

template< typename UT >
static void
__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
{
    typedef typename traits_t< UT >::signed_t    ST;
    dispatch_private_info_template< UT > * pr;

    int gtid = *gtid_ref;
//  int  cid = *cid_ref;
    kmp_info_t *th = __kmp_threads[ gtid ];
    KMP_DEBUG_ASSERT( th -> th.th_dispatch );

    KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
    if ( __kmp_env_consistency_check ) {
        pr = reinterpret_cast< dispatch_private_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_pr_current );
        if ( pr -> pushed_ws != ct_none ) {
            __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
        }
    }

    if ( ! th -> th.th_team -> t.t_serialized ) {
        dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
            ( th -> th.th_dispatch -> th_dispatch_sh_current );

        if ( ! __kmp_env_consistency_check ) {
            pr = reinterpret_cast< dispatch_private_info_template< UT >* >
                ( th -> th.th_dispatch -> th_dispatch_pr_current );
        }

        KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
        #if ! defined( KMP_GOMP_COMPAT )
            if ( __kmp_env_consistency_check ) {
                if ( pr->ordered_bumped != 0 ) {
                    struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
                    /* How to test it? - OM */
                    __kmp_error_construct2(
                        kmp_i18n_msg_CnsMultipleNesting,
                        ct_ordered_in_pdo, loc_ref,
                        & p->stack_data[ p->w_top ]
                    );
                }
            }
        #endif /* !defined(KMP_GOMP_COMPAT) */

        KMP_MB();       /* Flush all pending memory write invalidates.  */

        pr->ordered_bumped += 1;

        KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
                        gtid, pr->ordered_bumped ) );

        KMP_MB();       /* Flush all pending memory write invalidates.  */

        /* TODO use general release procedure? */
        test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );

        KMP_MB();       /* Flush all pending memory write invalidates.  */
    }
    KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
}

/* Computes and returns x to the power of y, where y must be a non-negative integer */
template< typename UT >
static __forceinline long double
__kmp_pow(long double x, UT y) {
    long double s=1.0L;

    KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
    //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
    while(y) {
        if ( y & 1 )
            s *= x;
        x *= x;
        y >>= 1;
    }
    return s;
}
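// Illustrative example (values chosen for exposition only): the loop is exponentiation by
// squaring, so for x = 0.5 and y = 5 (binary 101) it multiplies s by x and by x^4, giving
// s = 0.5 * 0.0625 = 0.03125 == 0.5^5 in O(log y) multiplications.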

/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
   (the total number of unassigned iterations in chunks with index greater than or equal to idx).
   __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
*/
template< typename T >
static __inline typename traits_t< T >::unsigned_t
__kmp_dispatch_guided_remaining(
    T                                  tc,
    typename traits_t< T >::floating_t base,
    typename traits_t< T >::unsigned_t idx
) {
    /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
       least for ICL 8.1, long double arithmetic may not really have
       long double precision, even with /Qlong_double.  Currently, we
       work around that in the caller code, by manipulating the FPCW for
       Windows* OS on IA-32 architecture.  The lack of precision is not
       expected to be a correctness issue, though.
    */
    typedef typename traits_t< T >::unsigned_t  UT;

    long double x = tc * __kmp_pow< UT >(base, idx);
    UT r = (UT) x;
    if ( x == r )
        return r;
    return r + 1;
}
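// Worked example (illustrative numbers only): the remaining work after idx chunks is
// ceil( tc * base^idx ).  With tc = 1000 and base = 0.5, idx = 3 gives exactly
// 1000 * 0.125 = 125, while idx = 4 gives 62.5, which is rounded up to 63 unassigned iterations.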

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// by default n = 2. For example, with n = 3 the chunk distribution will be flatter.
// With n = 1 the first chunk is the same as for a static schedule, e.g. trip / nproc.
static int guided_int_param = 2;
static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
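// Example (illustrative numbers only): with the default n = 2, nproc = 4 and chunk = 7,
// the switch-over point is p2 = 2 * 4 * (7 + 1) = 64 remaining iterations and the per-step
// multiplier is p3 = 1 / (2 * 4) = 0.125; these values land in parm2 and parm3 below.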

// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
template< typename T >
static void
__kmp_dispatch_init(
    ident_t                        * loc,
    int                              gtid,
    enum sched_type                  schedule,
    T                                lb,
    T                                ub,
    typename traits_t< T >::signed_t st,
    typename traits_t< T >::signed_t chunk,
    int                              push_ws
) {
    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    typedef typename traits_t< T >::floating_t  DBL;

    int active;
    T tc;
    kmp_info_t *   th;
    kmp_team_t *   team;
    kmp_uint32     my_buffer_index;
    dispatch_private_info_template< T > * pr;
    dispatch_shared_info_template< UT > volatile * sh;

    KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
    KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );

    if ( ! TCR_4( __kmp_init_parallel ) )
        __kmp_parallel_initialize();

#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_INIT();
#endif
    #ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
            traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
        KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
        __kmp_str_free( &buff );
    }
    #endif
    /* setup data */
    th     = __kmp_threads[ gtid ];
    team   = th -> th.th_team;
    active = ! team -> t.t_serialized;
    th->th.th_ident = loc;

#if USE_ITT_BUILD
    kmp_uint64 cur_chunk = chunk;
    int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
        KMP_MASTER_GTID(gtid) &&
#if OMP_40_ENABLED
        th->th.th_teams_microtask == NULL &&
#endif
        team->t.t_active_level == 1;
#endif
    if ( ! active ) {
        pr = reinterpret_cast< dispatch_private_info_template< T >* >
            ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
    } else {
        KMP_DEBUG_ASSERT( th->th.th_dispatch ==
                &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );

        my_buffer_index = th->th.th_dispatch->th_disp_index ++;

        /* What happens when number of threads changes, need to resize buffer? */
        pr = reinterpret_cast< dispatch_private_info_template< T >  * >
            ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] );
        sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
            ( &team -> t.t_disp_buffer[ my_buffer_index % __kmp_dispatch_num_buffers ] );
    }

    #if ( KMP_STATIC_STEAL_ENABLED )
    if ( SCHEDULE_HAS_NONMONOTONIC(schedule) )
        // AC: we now have only one implementation of stealing, so use it
        schedule = kmp_sch_static_steal;
    else
    #endif
        schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

    /* Pick up the nomerge/ordered bits from the scheduling type */
    if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
        pr->nomerge = TRUE;
        schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
    } else {
        pr->nomerge = FALSE;
    }
    pr->type_size = traits_t<T>::type_size; // remember the size of variables
    if ( kmp_ord_lower & schedule ) {
        pr->ordered = TRUE;
        schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
    } else {
        pr->ordered = FALSE;
    }

    if ( schedule == kmp_sch_static ) {
        schedule = __kmp_static;
    } else {
        if ( schedule == kmp_sch_runtime ) {
            // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
            schedule = team -> t.t_sched.r_sched_type;
            // Detail the schedule if needed (global controls are differentiated appropriately)
            if ( schedule == kmp_sch_guided_chunked ) {
                schedule = __kmp_guided;
            } else if ( schedule == kmp_sch_static ) {
                schedule = __kmp_static;
            }
            // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
            chunk = team -> t.t_sched.chunk;
#if USE_ITT_BUILD
            cur_chunk = chunk;
#endif
            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
                    traits_t< ST >::spec );
                KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
                __kmp_str_free( &buff );
            }
            #endif
        } else {
            if ( schedule == kmp_sch_guided_chunked ) {
                schedule = __kmp_guided;
            }
            if ( chunk <= 0 ) {
                chunk = KMP_DEFAULT_CHUNK;
            }
        }

        if ( schedule == kmp_sch_auto ) {
            // mapping and differentiation: in the __kmp_do_serial_initialize()
            schedule = __kmp_auto;
            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
                    traits_t< ST >::spec );
                KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
                __kmp_str_free( &buff );
            }
            #endif
        }

        /* guided analytical not safe for too many threads */
        if ( schedule == kmp_sch_guided_analytical_chunked && th->th.th_team_nproc > 1<<20 ) {
            schedule = kmp_sch_guided_iterative_chunked;
            KMP_WARNING( DispatchManyThreads );
        }
        pr->u.p.parm1 = chunk;
    }
    KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
                 "unknown scheduling type" );

    pr->u.p.count = 0;

    if ( __kmp_env_consistency_check ) {
        if ( st == 0 ) {
            __kmp_error_construct(
                kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
            );
        }
    }
    // compute trip count
    if ( st == 1 ) {   // most common case
        if ( ub >= lb ) {
            tc = ub - lb + 1;
        } else {   // ub < lb
            tc = 0;            // zero-trip
        }
    } else if ( st < 0 ) {
        if ( lb >= ub ) {
            // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
            //     where the division needs to be unsigned regardless of the result type
            tc = (UT)(lb - ub) / (-st) + 1;
        } else {   // lb < ub
            tc = 0;            // zero-trip
        }
    } else {       // st > 0
        if ( ub >= lb ) {
            // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
            //     where the division needs to be unsigned regardless of the result type
            tc = (UT)(ub - lb) / st + 1;
        } else {   // ub < lb
            tc = 0;            // zero-trip
        }
    }
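    // Worked example (illustrative values): for lb = 0, ub = 9, st = 3 the branch above computes
    // tc = (9 - 0) / 3 + 1 == 4, i.e. the iterations {0, 3, 6, 9}; a loop with lb > ub and st > 0
    // falls into the zero-trip case and gets tc == 0.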

    // Any half-decent optimizer will remove this test when the blocks are empty since the macros expand to nothing
    // when statistics are disabled.
    if (schedule == __kmp_static)
    {
        KMP_COUNT_BLOCK(OMP_FOR_static);
        KMP_COUNT_VALUE(FOR_static_iterations, tc);
    }
    else
    {
        KMP_COUNT_BLOCK(OMP_FOR_dynamic);
        KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
    }

    pr->u.p.lb = lb;
    pr->u.p.ub = ub;
    pr->u.p.st = st;
    pr->u.p.tc = tc;

    #if KMP_OS_WINDOWS
    pr->u.p.last_upper = ub + st;
    #endif /* KMP_OS_WINDOWS */

    /* NOTE: only the active parallel region(s) have active ordered sections */

    if ( active ) {
        if ( pr->ordered == 0 ) {
            th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
            th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
        } else {
            pr->ordered_bumped = 0;

            pr->u.p.ordered_lower = 1;
            pr->u.p.ordered_upper = 0;

            th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
            th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
        }
    }

    if ( __kmp_env_consistency_check ) {
        enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
        if ( push_ws ) {
            __kmp_push_workshare( gtid, ws, loc );
            pr->pushed_ws = ws;
        } else {
            __kmp_check_workshare( gtid, ws, loc );
            pr->pushed_ws = ct_none;
        }
    }

    switch ( schedule ) {
    #if ( KMP_STATIC_STEAL_ENABLED )
    case kmp_sch_static_steal:
        {
            T nproc = th->th.th_team_nproc;
            T ntc, init;

            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );

            ntc = (tc % chunk ? 1 : 0) + tc / chunk;
            if ( nproc > 1 && ntc >= nproc ) {
                KMP_COUNT_BLOCK(OMP_FOR_static_steal);
                T id = __kmp_tid_from_gtid(gtid);
                T small_chunk, extras;

                small_chunk = ntc / nproc;
                extras = ntc % nproc;

                init = id * small_chunk + ( id < extras ? id : extras );
                pr->u.p.count = init;
                pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
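                // Worked example (illustrative values): with ntc = 10 chunks and nproc = 4,
                // small_chunk = 2 and extras = 2, so threads 0..3 start at chunk indexes
                // 0, 3, 6, 8 and own 3, 3, 2, 2 chunks respectively (count..ub-1), covering
                // all 10 chunks before any stealing begins.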

                pr->u.p.parm2 = lb;
                //pr->pfields.parm3 = 0; // it's not used in static_steal
                pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
                pr->u.p.st = st;
                if ( traits_t<T>::type_size > 4 ) {
                    // AC: TODO: check if 16-byte CAS available and use it to
                    // improve performance (probably wait for explicit request
                    // before spending time on this).
                    // For now use dynamically allocated per-thread lock,
                    // free memory in __kmp_dispatch_next when status==0.
                    KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
                    th->th.th_dispatch->th_steal_lock =
                        (kmp_lock_t*)__kmp_allocate(sizeof(kmp_lock_t));
                    __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
                }
                break;
            } else {
                KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
                               gtid ) );
                schedule = kmp_sch_static_balanced;
                /* too few iterations: fall-through to kmp_sch_static_balanced */
            } // if
            /* FALL-THROUGH to static balanced */
        } // case
    #endif
    case kmp_sch_static_balanced:
        {
            T nproc = th->th.th_team_nproc;
            T init, limit;

            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
                           gtid ) );

            if ( nproc > 1 ) {
                T id = __kmp_tid_from_gtid(gtid);

                if ( tc < nproc ) {
                    if ( id < tc ) {
                        init = id;
                        limit = id;
                        pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
                    } else {
                        pr->u.p.count = 1;  /* means no more chunks to execute */
                        pr->u.p.parm1 = FALSE;
                        break;
                    }
                } else {
                    T small_chunk = tc / nproc;
                    T extras = tc % nproc;
                    init = id * small_chunk + (id < extras ? id : extras);
                    limit = init + small_chunk - (id < extras ? 0 : 1);
                    pr->u.p.parm1 = (id == nproc - 1);
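                    // Worked example (illustrative values): tc = 10 iterations on nproc = 4
                    // gives small_chunk = 2 and extras = 2, so threads 0..3 receive the closed
                    // ranges [0,2], [3,5], [6,7] and [8,9], i.e. 3, 3, 2, 2 iterations.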
                }
            } else {
                if ( tc > 0 ) {
                    init = 0;
                    limit = tc - 1;
                    pr->u.p.parm1 = TRUE;
                } else {
                    // zero trip count
                    pr->u.p.count = 1;  /* means no more chunks to execute */
                    pr->u.p.parm1 = FALSE;
                    break;
                }
            }
#if USE_ITT_BUILD
            // Calculate chunk for metadata report
            if ( itt_need_metadata_reporting )
                cur_chunk = limit - init + 1;
#endif
            if ( st == 1 ) {
                pr->u.p.lb = lb + init;
                pr->u.p.ub = lb + limit;
            } else {
                T ub_tmp = lb + limit * st;   // calculated upper bound, "ub" is user-defined upper bound
                pr->u.p.lb = lb + init * st;
                // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
                if ( st > 0 ) {
                    pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
                } else {
                    pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
                }
            }
            if ( pr->ordered ) {
                pr->u.p.ordered_lower = init;
                pr->u.p.ordered_upper = limit;
            }
            break;
        } // case
    case kmp_sch_guided_iterative_chunked :
        {
            T nproc = th->th.th_team_nproc;
            KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));

            if ( nproc > 1 ) {
                if ( (2L * chunk + 1 ) * nproc >= tc ) {
                    /* chunk size too large, switch to dynamic */
                    schedule = kmp_sch_dynamic_chunked;
                } else {
                    // when remaining iters become less than parm2 - switch to dynamic
                    pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
                    *(double*)&pr->u.p.parm3 = guided_flt_param / nproc;   // may occupy parm3 and parm4
                }
            } else {
                KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
                schedule = kmp_sch_static_greedy;
                /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
                KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
                pr->u.p.parm1 = tc;
            } // if
        } // case
        break;
    case kmp_sch_guided_analytical_chunked:
        {
            T nproc = th->th.th_team_nproc;
            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));

            if ( nproc > 1 ) {
                if ( (2L * chunk + 1 ) * nproc >= tc ) {
                    /* chunk size too large, switch to dynamic */
                    schedule = kmp_sch_dynamic_chunked;
                } else {
                    /* commonly used term: (2 nproc - 1)/(2 nproc) */
                    DBL x;

                    #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    /* Linux* OS already has 64-bit computation by default for
                       long double, and on Windows* OS on Intel(R) 64,
                       /Qlong_double doesn't work.  On Windows* OS
                       on IA-32 architecture, we need to set precision to
                       64-bit instead of the default 53-bit. Even though long
                       double doesn't work on Windows* OS on Intel(R) 64, the
                       resulting lack of precision is not expected to impact
                       the correctness of the algorithm, but this has not been
                       mathematically proven.
                    */
                    // save original FPCW and set precision to 64-bit, as
                    // Windows* OS on IA-32 architecture defaults to 53-bit
                    unsigned int oldFpcw = _control87(0,0);
                    _control87(_PC_64,_MCW_PC); // 0,0x30000
                    #endif
                    /* value used for comparison in solver for cross-over point */
                    long double target = ((long double)chunk * 2 + 1) * nproc / tc;

                    /* crossover point--chunk indexes equal to or greater than
                       this point switch to dynamic-style scheduling */
                    UT   cross;

                    /* commonly used term: (2 nproc - 1)/(2 nproc) */
                    x = (long double)1.0 - (long double)0.5 / nproc;

                    #ifdef KMP_DEBUG
                    { // test natural alignment
                        struct _test_a {
                            char a;
                            union {
                                char b;
                                DBL  d;
                            };
                        } t;
                        ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
                        //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
                        KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
                    }
                    #endif // KMP_DEBUG

                    /* save the term in thread private dispatch structure */
                    *(DBL*)&pr->u.p.parm3 = x;

                    /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
                    {
                        UT          left, right, mid;
                        long double p;

                        /* estimate initial upper and lower bound */

                        /* doesn't matter what value right is as long as it is positive, but
                           it affects performance of the solver
                        */
                        right = 229;
                        p = __kmp_pow< UT >(x,right);
                        if ( p > target ) {
                            do{
                                p *= p;
                                right <<= 1;
                            } while(p>target && right < (1<<27));
                            left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
                        } else {
                            left = 0;
                        }

                        /* bisection root-finding method */
                        while ( left + 1 < right ) {
                            mid = (left + right) / 2;
                            if ( __kmp_pow< UT >(x,mid) > target ) {
                                left = mid;
                            } else {
                                right = mid;
                            }
                        } // while
                        cross = right;
                    }
                    /* assert sanity of computed crossover point */
                    KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
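                    // Worked example (illustrative values): chunk = 1, nproc = 2, tc = 100 give
                    // target = 3 * 2 / 100 = 0.06 and x = 1 - 0.5/2 = 0.75; since 0.75^9 ~= 0.075 > 0.06
                    // and 0.75^10 ~= 0.056 <= 0.06, the bisection above settles on cross = 10.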

                    /* save the crossover point in thread private dispatch structure */
                    pr->u.p.parm2 = cross;

                    // C75803
                    #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
                        #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
                    #else
                        #define GUIDED_ANALYTICAL_WORKAROUND (x)
                    #endif
                    /* dynamic-style scheduling offset */
                    pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
                    #if KMP_OS_WINDOWS && KMP_ARCH_X86
                    // restore FPCW
                    _control87(oldFpcw,_MCW_PC);
                    #endif
                } // if
            } else {
                KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
                               gtid ) );
                schedule = kmp_sch_static_greedy;
                /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
                pr->u.p.parm1 = tc;
            } // if
        } // case
        break;
    case kmp_sch_static_greedy:
        KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
        pr->u.p.parm1 = ( th->th.th_team_nproc > 1 ) ?
            ( tc + th->th.th_team_nproc - 1 ) / th->th.th_team_nproc :
            tc;
        break;
    case kmp_sch_static_chunked :
    case kmp_sch_dynamic_chunked :
        if ( pr->u.p.parm1 <= 0 ) {
            pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
        }
        KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
        break;
    case kmp_sch_trapezoidal :
        {
            /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */

            T parm1, parm2, parm3, parm4;
            KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );

            parm1 = chunk;

            /* F : size of the first cycle */
            parm2 = ( tc / (2 * th->th.th_team_nproc) );

            if ( parm2 < 1 ) {
                parm2 = 1;
            }

            /* L : size of the last cycle.  Make sure the last cycle
             *     is not larger than the first cycle.
             */
            if ( parm1 < 1 ) {
                parm1 = 1;
            } else if ( parm1 > parm2 ) {
                parm1 = parm2;
            }

            /* N : number of cycles */
            parm3 = ( parm2 + parm1 );
            parm3 = ( 2 * tc + parm3 - 1) / parm3;

            if ( parm3 < 2 ) {
                parm3 = 2;
            }

            /* sigma : decreasing incr of the trapezoid */
            parm4 = ( parm3 - 1 );
            parm4 = ( parm2 - parm1 ) / parm4;
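
            // Worked example (illustrative values): tc = 1000, nproc = 4 and chunk = 1 give
            // parm2 = 1000/8 = 125 (first chunk), parm3 = (2*1000 + 126 - 1)/126 = 16 cycles and
            // parm4 = (125 - 1)/15 = 8, i.e. chunk sizes 125, 117, 109, ... never below parm1 = 1.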

            // pointless check, because parm4 >= 0 always
            //if ( parm4 < 0 ) {
            //    parm4 = 0;
            //}

            pr->u.p.parm1 = parm1;
            pr->u.p.parm2 = parm2;
            pr->u.p.parm3 = parm3;
            pr->u.p.parm4 = parm4;
        } // case
        break;

    default:
        {
            __kmp_msg(
                kmp_ms_fatal,                        // Severity
                KMP_MSG( UnknownSchedTypeDetected ), // Primary message
                KMP_HNT( GetNewerLibrary ),          // Hint
                __kmp_msg_null                       // Variadic argument list terminator
            );
        }
        break;
    } // switch
    pr->schedule = schedule;
    if ( active ) {
        /* The name of this buffer should be my_buffer_index when it's free to use it */

        KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
                        gtid, my_buffer_index, sh->buffer_index) );
        __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
                                        USE_ITT_BUILD_ARG( NULL )
                                        );
        // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
        // *always* 32-bit integers.
        KMP_MB();  /* is this necessary? */
        KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
                        gtid, my_buffer_index, sh->buffer_index) );

        th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
        th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
#if USE_ITT_BUILD
        if ( pr->ordered ) {
            __kmp_itt_ordered_init( gtid );
        }; // if
        // Report loop metadata
        if ( itt_need_metadata_reporting ) {
            // Only report metadata by master of active team at level 1
            kmp_uint64 schedtype = 0;
            switch ( schedule ) {
            case kmp_sch_static_chunked:
            case kmp_sch_static_balanced:// Chunk is calculated in the switch above
                break;
            case kmp_sch_static_greedy:
                cur_chunk = pr->u.p.parm1;
                break;
            case kmp_sch_dynamic_chunked:
                schedtype = 1;
                break;
            case kmp_sch_guided_iterative_chunked:
            case kmp_sch_guided_analytical_chunked:
                schedtype = 2;
                break;
            default:
// Should we put this case under "static"?
// case kmp_sch_static_steal:
                schedtype = 3;
                break;
            }
            __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
        }
#endif /* USE_ITT_BUILD */
    }; // if

    #ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
            " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
            " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
            traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
            traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
            traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
            traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
        KD_TRACE(10, ( buff,
            gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
            pr->u.p.st, pr->u.p.tc, pr->u.p.count,
            pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
            pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
        __kmp_str_free( &buff );
    }
    #endif
    #if ( KMP_STATIC_STEAL_ENABLED )
        // It cannot be guaranteed that after execution of a loop with some other schedule kind
        // all the parm3 variables will contain the same value.
        // Even if all parm3 values were the same, there would still be a bad case, like using
        // 0 and 1 rather than a program life-time increment.
        // So the dedicated variable is required. The 'static_steal_counter' is used.
        if( schedule == kmp_sch_static_steal ) {
            // Other threads will inspect this variable when searching for a victim.
            // This is a flag showing that other threads may steal from this thread since then.
            volatile T * p = &pr->u.p.static_steal_counter;
            *p = *p + 1;
        }
    #endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_TRACE
    if (ompt_enabled &&
        ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
        ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
        ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
        ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
            team_info->parallel_id, task_info->task_id, team_info->microtask);
    }
#endif
}

/*
 * For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations.  If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed.
 */
template< typename UT >
static void
__kmp_dispatch_finish( int gtid, ident_t *loc )
{
    typedef typename traits_t< UT >::signed_t ST;
    kmp_info_t *th = __kmp_threads[ gtid ];

    KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
    if ( ! th -> th.th_team -> t.t_serialized ) {

        dispatch_private_info_template< UT > * pr =
            reinterpret_cast< dispatch_private_info_template< UT >* >
            ( th->th.th_dispatch->th_dispatch_pr_current );
        dispatch_shared_info_template< UT > volatile * sh =
            reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
            ( th->th.th_dispatch->th_dispatch_sh_current );
        KMP_DEBUG_ASSERT( pr );
        KMP_DEBUG_ASSERT( sh );
        KMP_DEBUG_ASSERT( th->th.th_dispatch ==
                &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );

        if ( pr->ordered_bumped ) {
            KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
                            gtid ) );
            pr->ordered_bumped = 0;
        } else {
            UT lower = pr->u.p.ordered_lower;

            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
                    traits_t< UT >::spec, traits_t< UT >::spec );
                KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
                __kmp_str_free( &buff );
            }
            #endif

            __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
                                   USE_ITT_BUILD_ARG(NULL)
                                   );
            KMP_MB();  /* is this necessary? */
            #ifdef KMP_DEBUG
            {
                const char * buff;
                // create format specifiers before the debug output
                buff = __kmp_str_format(
                    "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
                    traits_t< UT >::spec, traits_t< UT >::spec );
                KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
                __kmp_str_free( &buff );
            }
            #endif

            test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
        } // if
    } // if
    KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
}

#ifdef KMP_GOMP_COMPAT

template< typename UT >
static void
__kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
{
    typedef typename traits_t< UT >::signed_t ST;
    kmp_info_t *th = __kmp_threads[ gtid ];

    KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
    if ( ! th -> th.th_team -> t.t_serialized ) {
//        int cid;
        dispatch_private_info_template< UT > * pr =
            reinterpret_cast< dispatch_private_info_template< UT >* >
            ( th->th.th_dispatch->th_dispatch_pr_current );
        dispatch_shared_info_template< UT > volatile * sh =
            reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
            ( th->th.th_dispatch->th_dispatch_sh_current );
        KMP_DEBUG_ASSERT( pr );
        KMP_DEBUG_ASSERT( sh );
        KMP_DEBUG_ASSERT( th->th.th_dispatch ==
                &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );

//        for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
            UT lower = pr->u.p.ordered_lower;
            UT upper = pr->u.p.ordered_upper;
            UT inc = upper - lower + 1;

            if ( pr->ordered_bumped == inc ) {
                KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
                                gtid ) );
                pr->ordered_bumped = 0;
            } else {
                inc -= pr->ordered_bumped;

                #ifdef KMP_DEBUG
                {
                    const char * buff;
                    // create format specifiers before the debug output
                    buff = __kmp_str_format(
                        "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
                        "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
                        traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
                    KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
                    __kmp_str_free( &buff );
                }
                #endif

                __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
                                       USE_ITT_BUILD_ARG(NULL)
                                       );

                KMP_MB();  /* is this necessary? */
                KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
                                gtid ) );
                pr->ordered_bumped = 0;
//!!!!! TODO check if the inc should be unsigned, or signed???
                #ifdef KMP_DEBUG
                {
                    const char * buff;
                    // create format specifiers before the debug output
                    buff = __kmp_str_format(
                        "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
                        "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
                        traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
                    KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
                    __kmp_str_free( &buff );
                }
                #endif

                test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
            }
//        }
    }
    KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
}

#endif /* KMP_GOMP_COMPAT */

/* Define a macro for exiting __kmp_dispatch_next(). If status is 0
 * (no more work), then tell OMPT the loop is over. In some cases
 * kmp_dispatch_fini() is not called. */
#if OMPT_SUPPORT && OMPT_TRACE
#define OMPT_LOOP_END                                                       \
    if (status == 0) {                                                      \
        if (ompt_enabled &&                                                 \
            ompt_callbacks.ompt_callback(ompt_event_loop_end)) {            \
            ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);     \
            ompt_task_info_t *task_info = __ompt_get_taskinfo(0);           \
            ompt_callbacks.ompt_callback(ompt_event_loop_end)(              \
                team_info->parallel_id, task_info->task_id);                \
        }                                                                   \
    }
#else
#define OMPT_LOOP_END // no-op
#endif

template< typename T >
static int
__kmp_dispatch_next(
    ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
) {

    typedef typename traits_t< T >::unsigned_t  UT;
    typedef typename traits_t< T >::signed_t    ST;
    typedef typename traits_t< T >::floating_t  DBL;

    // This is potentially slightly misleading, schedule(runtime) will appear here even if the actual runtime schedule
    // is static. (Which points out a disadvantage of schedule(runtime): even when static scheduling is used it costs
    // more than a compile time choice to use static scheduling would.)
    KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);

    int status;
    dispatch_private_info_template< T > * pr;
    kmp_info_t * th   = __kmp_threads[ gtid ];
    kmp_team_t * team = th -> th.th_team;

    KMP_DEBUG_ASSERT( p_lb && p_ub && p_st ); // AC: these cannot be NULL
    #ifdef KMP_DEBUG
    {
        const char * buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
            traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
        KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
        __kmp_str_free( &buff );
    }
    #endif

    if ( team -> t.t_serialized ) {
        /* NOTE: serialize this dispatch because we are not at the active level */
        pr = reinterpret_cast< dispatch_private_info_template< T >* >
            ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
        KMP_DEBUG_ASSERT( pr );

        if ( (status = (pr->u.p.tc != 0)) == 0 ) {
            *p_lb = 0;
            *p_ub = 0;
//            if ( p_last != NULL )
//                *p_last = 0;
            if ( p_st != NULL )
                *p_st = 0;
            if ( __kmp_env_consistency_check ) {
                if ( pr->pushed_ws != ct_none ) {
                    pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
                }
            }
        } else if ( pr->nomerge ) {
            kmp_int32 last;
            T         start;
            UT        limit, trip, init;
            ST        incr;
            T         chunk = pr->u.p.parm1;

            KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );

            init = chunk * pr->u.p.count++;
            trip = pr->u.p.tc - 1;

            if ( (status = (init <= trip)) == 0 ) {
                *p_lb = 0;
                *p_ub = 0;
//                if ( p_last != NULL )
//                    *p_last = 0;
                if ( p_st != NULL )
                    *p_st = 0;
                if ( __kmp_env_consistency_check ) {
                    if ( pr->pushed_ws != ct_none ) {
                        pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
                    }
                }
            } else {
                start = pr->u.p.lb;
                limit = chunk + init - 1;
                incr  = pr->u.p.st;

                if ( (last = (limit >= trip)) != 0 ) {
                    limit = trip;
                    #if KMP_OS_WINDOWS
                    pr->u.p.last_upper = pr->u.p.ub;
                    #endif /* KMP_OS_WINDOWS */
                }
                if ( p_last != NULL )
                    *p_last = last;
                if ( p_st != NULL )
                    *p_st = incr;
                if ( incr == 1 ) {
                    *p_lb = start + init;
                    *p_ub = start + limit;
                } else {
                    *p_lb = start + init * incr;
                    *p_ub = start + limit * incr;
                }
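                // Worked example (illustrative values): with lb = 0, st = 2 and chunk = 5, the
                // second call (count was 1) gets init = 5 and limit = 9, so this branch returns
                // *p_lb = 0 + 5*2 = 10 and *p_ub = 0 + 9*2 = 18, i.e. iterations 10, 12, ..., 18.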
1501
1502 if ( pr->ordered ) {
1503 pr->u.p.ordered_lower = init;
1504 pr->u.p.ordered_upper = limit;
1505 #ifdef KMP_DEBUG
1506 {
1507 const char * buff;
1508 // create format specifiers before the debug output
1509 buff = __kmp_str_format(
1510 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1511 traits_t< UT >::spec, traits_t< UT >::spec );
1512 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1513 __kmp_str_free( &buff );
1514 }
1515 #endif
1516 } // if
1517 } // if
1518 } else {
1519 pr->u.p.tc = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001520 *p_lb = pr->u.p.lb;
1521 *p_ub = pr->u.p.ub;
1522 #if KMP_OS_WINDOWS
1523 pr->u.p.last_upper = *p_ub;
1524 #endif /* KMP_OS_WINDOWS */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001525 if ( p_last != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001526 *p_last = TRUE;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001527 if ( p_st != NULL )
1528 *p_st = pr->u.p.st;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001529 } // if
1530 #ifdef KMP_DEBUG
1531 {
1532 const char * buff;
1533 // create format specifiers before the debug output
1534 buff = __kmp_str_format(
1535 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001536 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
Jim Cownie5e8470a2013-09-27 10:38:44 +00001537 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001538 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
Jim Cownie5e8470a2013-09-27 10:38:44 +00001539 __kmp_str_free( &buff );
1540 }
1541 #endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001542#if INCLUDE_SSC_MARKS
1543 SSC_MARK_DISPATCH_NEXT();
1544#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001545 OMPT_LOOP_END;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001546 return status;
1547 } else {
1548 kmp_int32 last = 0;
1549 dispatch_shared_info_template< UT > *sh;
1550 T start;
1551 ST incr;
1552 UT limit, trip, init;
1553
1554 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1555 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1556
1557 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1558 ( th->th.th_dispatch->th_dispatch_pr_current );
1559 KMP_DEBUG_ASSERT( pr );
1560 sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1561 ( th->th.th_dispatch->th_dispatch_sh_current );
1562 KMP_DEBUG_ASSERT( sh );
1563
1564 if ( pr->u.p.tc == 0 ) {
1565 // zero trip count
1566 status = 0;
1567 } else {
1568 switch (pr->schedule) {
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001569 #if ( KMP_STATIC_STEAL_ENABLED )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001570 case kmp_sch_static_steal:
1571 {
1572 T chunk = pr->u.p.parm1;
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001573 int nproc = th->th.th_team_nproc;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001574
1575 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1576
1577 trip = pr->u.p.tc - 1;
1578
Jonathan Peyton12313d42017-01-27 18:09:22 +00001579 if ( traits_t<T>::type_size > 4 ) {
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001580 // use lock for 8-byte and CAS for 4-byte induction
1581 // variable. TODO (optional): check and use 16-byte CAS
1582 kmp_lock_t * lck = th->th.th_dispatch->th_steal_lock;
1583 KMP_DEBUG_ASSERT(lck != NULL);
1584 if( pr->u.p.count < (UT)pr->u.p.ub ) {
1585 __kmp_acquire_lock(lck, gtid);
1586 // try to get own chunk of iterations
1587 init = ( pr->u.p.count )++;
1588 status = ( init < (UT)pr->u.p.ub );
1589 __kmp_release_lock(lck, gtid);
1590 } else {
1591 status = 0; // no own chunks
1592 }
1593 if( !status ) { // try to steal
1594 kmp_info_t **other_threads = team->t.t_threads;
1595 int while_limit = nproc; // nproc attempts to find a victim
1596 int while_index = 0;
1597                        // TODO: the algorithm for searching for a victim
1598 // should be cleaned up and measured
1599 while ( ( !status ) && ( while_limit != ++while_index ) ) {
1600 T remaining;
1601 T victimIdx = pr->u.p.parm4;
1602 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1603 dispatch_private_info_template< T > * victim =
1604 reinterpret_cast< dispatch_private_info_template< T >* >
1605 (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
1606 while( ( victim == NULL || victim == pr ||
1607 ( *(volatile T*)&victim->u.p.static_steal_counter !=
1608 *(volatile T*)&pr->u.p.static_steal_counter ) ) &&
1609 oldVictimIdx != victimIdx )
1610 {
1611 victimIdx = (victimIdx + 1) % nproc;
1612 victim = reinterpret_cast< dispatch_private_info_template< T >* >
1613 (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
1614 };
1615 if( !victim ||
1616 ( *(volatile T *)&victim->u.p.static_steal_counter !=
1617 *(volatile T *)&pr->u.p.static_steal_counter ) )
1618 {
1619 continue; // try once more (nproc attempts in total)
1620 // no victim is ready yet to participate in stealing
1621 // because all victims are still in kmp_init_dispatch
1622 }
1623 if( victim->u.p.count + 2 > (UT)victim->u.p.ub ) {
1624 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1625 continue; // not enough chunks to steal, goto next victim
1626 }
1627
1628 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1629 KMP_ASSERT(lck != NULL);
1630 __kmp_acquire_lock(lck, gtid);
1631 limit = victim->u.p.ub; // keep initial ub
1632 if( victim->u.p.count >= limit ||
1633 (remaining = limit - victim->u.p.count) < 2 )
1634 {
1635 __kmp_release_lock(lck, gtid);
1636 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1637 continue; // not enough chunks to steal
1638 }
1639 // stealing succeded, reduce victim's ub by 1/4 of undone chunks or by 1
1640 if( remaining > 3 ) {
Jonathan Peyton12ecbb32017-02-17 17:06:16 +00001641 KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining>>2);
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001642 init = ( victim->u.p.ub -= (remaining>>2) ); // steal 1/4 of remaining
1643 } else {
Jonathan Peyton12ecbb32017-02-17 17:06:16 +00001644 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001645 init = ( victim->u.p.ub -= 1 ); // steal 1 chunk of 2 or 3 remaining
1646 }
1647 __kmp_release_lock(lck, gtid);
1648
1649 KMP_DEBUG_ASSERT(init + 1 <= limit);
1650 pr->u.p.parm4 = victimIdx; // remember victim to steal from
1651 status = 1;
1652 while_index = 0;
1653                            // now update own count and ub with the stolen range, excluding the init chunk taken above
1654 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1655 pr->u.p.count = init + 1;
1656 pr->u.p.ub = limit;
1657 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1658 } // while (search for victim)
1659 } // if (try to find victim and steal)
Jim Cownie5e8470a2013-09-27 10:38:44 +00001660 } else {
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001661 // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
Jim Cownie5e8470a2013-09-27 10:38:44 +00001662 typedef union {
1663 struct {
1664 UT count;
1665 T ub;
1666 } p;
1667 kmp_int64 b;
1668 } union_i4;
1669                        // All updates of 'count' and 'ub' must be combined into a single atomic operation.
Jim Cownie5e8470a2013-09-27 10:38:44 +00001670 {
1671 union_i4 vold, vnew;
1672 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1673 vnew = vold;
1674 vnew.p.count++;
1675 while( ! KMP_COMPARE_AND_STORE_ACQ64(
1676 ( volatile kmp_int64* )&pr->u.p.count,
1677 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1678 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1679 KMP_CPU_PAUSE();
1680 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1681 vnew = vold;
1682 vnew.p.count++;
1683 }
1684 vnew = vold;
1685 init = vnew.p.count;
1686 status = ( init < (UT)vnew.p.ub ) ;
1687 }
1688
1689 if( !status ) {
1690 kmp_info_t **other_threads = team->t.t_threads;
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001691 int while_limit = nproc; // nproc attempts to find a victim
Jim Cownie5e8470a2013-09-27 10:38:44 +00001692 int while_index = 0;
1693
1694                            // TODO: the algorithm for searching for a victim
1695 // should be cleaned up and measured
1696 while ( ( !status ) && ( while_limit != ++while_index ) ) {
1697 union_i4 vold, vnew;
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001698 kmp_int32 remaining;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001699 T victimIdx = pr->u.p.parm4;
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001700 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1701 dispatch_private_info_template< T > * victim =
1702 reinterpret_cast< dispatch_private_info_template< T >* >
1703 (other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current);
1704 while( (victim == NULL || victim == pr ||
1705 (*(volatile T*)&victim->u.p.static_steal_counter !=
1706 *(volatile T*)&pr->u.p.static_steal_counter)) &&
1707 oldVictimIdx != victimIdx )
1708 {
1709 victimIdx = (victimIdx + 1) % nproc;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001710 victim = reinterpret_cast< dispatch_private_info_template< T >* >
1711 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001712 };
1713 if( !victim ||
1714 ( *(volatile T *)&victim->u.p.static_steal_counter !=
1715 *(volatile T *)&pr->u.p.static_steal_counter ) )
1716 {
1717 continue; // try once more (nproc attempts in total)
1718 // no victim is ready yet to participate in stealing
1719 // because all victims are still in kmp_init_dispatch
Jim Cownie5e8470a2013-09-27 10:38:44 +00001720 }
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001721 pr->u.p.parm4 = victimIdx; // new victim found
1722 while( 1 ) { // CAS loop if victim has enough chunks to steal
Jim Cownie5e8470a2013-09-27 10:38:44 +00001723 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1724 vnew = vold;
1725
1726 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001727 if ( vnew.p.count >= (UT)vnew.p.ub ||
1728 (remaining = vnew.p.ub - vnew.p.count) < 2 )
1729 {
1730 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id
1731 break; // not enough chunks to steal, goto next victim
Jim Cownie5e8470a2013-09-27 10:38:44 +00001732 }
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001733 if( remaining > 3 ) {
1734 vnew.p.ub -= (remaining>>2); // try to steal 1/4 of remaining
1735 } else {
1736 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1737 }
Jim Cownie5e8470a2013-09-27 10:38:44 +00001738 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001739 // TODO: Should this be acquire or release?
1740 if ( KMP_COMPARE_AND_STORE_ACQ64(
1741 ( volatile kmp_int64 * )&victim->u.p.count,
1742 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1743 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001744                                // stealing succeeded
Jonathan Peyton12ecbb32017-02-17 17:06:16 +00001745 KMP_COUNT_VALUE(FOR_static_steal_stolen, vold.p.ub-vnew.p.ub);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001746 status = 1;
1747 while_index = 0;
1748 // now update own count and ub
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001749 init = vnew.p.ub;
1750 vold.p.count = init + 1;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001751 #if KMP_ARCH_X86
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001752 KMP_XCHG_FIXED64(( volatile kmp_int64 * )(&pr->u.p.count), vold.b);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001753 #else
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001754 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1755 #endif
Jim Cownie5e8470a2013-09-27 10:38:44 +00001756 break;
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001757 } // if (check CAS result)
1758                            KMP_CPU_PAUSE(); // CAS failed, repeat the attempt
1759 } // while (try to steal from particular victim)
1760 } // while (search for victim)
1761 } // if (try to find victim and steal)
1762 } // if (4-byte induction variable)
Jim Cownie5e8470a2013-09-27 10:38:44 +00001763 if ( !status ) {
1764 *p_lb = 0;
1765 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001766 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001767 } else {
1768 start = pr->u.p.parm2;
1769 init *= chunk;
1770 limit = chunk + init - 1;
1771 incr = pr->u.p.st;
Jonathan Peyton12ecbb32017-02-17 17:06:16 +00001772 KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001773
1774 KMP_DEBUG_ASSERT(init <= trip);
1775 if ( (last = (limit >= trip)) != 0 )
1776 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001777 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001778
1779 if ( incr == 1 ) {
1780 *p_lb = start + init;
1781 *p_ub = start + limit;
1782 } else {
1783 *p_lb = start + init * incr;
1784 *p_ub = start + limit * incr;
1785 }
1786
1787 if ( pr->ordered ) {
1788 pr->u.p.ordered_lower = init;
1789 pr->u.p.ordered_upper = limit;
1790 #ifdef KMP_DEBUG
1791 {
1792 const char * buff;
1793 // create format specifiers before the debug output
1794 buff = __kmp_str_format(
1795 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1796 traits_t< UT >::spec, traits_t< UT >::spec );
1797 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1798 __kmp_str_free( &buff );
1799 }
1800 #endif
1801 } // if
1802 } // if
1803 break;
1804 } // case
Andrey Churbanov429dbc22016-07-11 10:44:57 +00001805 #endif // ( KMP_STATIC_STEAL_ENABLED )
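        // Worked example (illustrative only, numbers not taken from any real run):
        // the heuristic above steals the top quarter of a victim's remaining chunks.
        // If the victim still owns chunk indices 4..19 (count == 4, ub == 20), then
        // remaining == 16, the victim's ub drops to 16, and the thief keeps chunks
        // 16..19: it returns chunk 16 from this call (init == 16) and records
        // count = 17, ub = 20 for its subsequent calls. When only 2 or 3 chunks
        // remain, a single chunk is stolen instead.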
Jim Cownie5e8470a2013-09-27 10:38:44 +00001806 case kmp_sch_static_balanced:
1807 {
1808 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1809 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1810 pr->u.p.count = 1;
1811 *p_lb = pr->u.p.lb;
1812 *p_ub = pr->u.p.ub;
1813 last = pr->u.p.parm1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001814 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001815 *p_st = pr->u.p.st;
1816 } else { /* no iterations to do */
1817 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1818 }
1819 if ( pr->ordered ) {
1820 #ifdef KMP_DEBUG
1821 {
1822 const char * buff;
1823 // create format specifiers before the debug output
1824 buff = __kmp_str_format(
1825 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1826 traits_t< UT >::spec, traits_t< UT >::spec );
1827 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1828 __kmp_str_free( &buff );
1829 }
1830 #endif
1831 } // if
1832 } // case
1833 break;
1834 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1835 case kmp_sch_static_chunked:
1836 {
1837 T parm1;
1838
1839                KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[greedy|chunked] case\n",
1840 gtid ) );
1841 parm1 = pr->u.p.parm1;
1842
1843 trip = pr->u.p.tc - 1;
1844 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1845
1846 if ( (status = (init <= trip)) != 0 ) {
1847 start = pr->u.p.lb;
1848 incr = pr->u.p.st;
1849 limit = parm1 + init - 1;
1850
1851 if ( (last = (limit >= trip)) != 0 )
1852 limit = trip;
1853
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001854 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001855
Jonathan Peytonff5ca8b2016-06-21 18:30:15 +00001856 pr->u.p.count += th->th.th_team_nproc;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001857
1858 if ( incr == 1 ) {
1859 *p_lb = start + init;
1860 *p_ub = start + limit;
1861 }
1862 else {
1863 *p_lb = start + init * incr;
1864 *p_ub = start + limit * incr;
1865 }
1866
1867 if ( pr->ordered ) {
1868 pr->u.p.ordered_lower = init;
1869 pr->u.p.ordered_upper = limit;
1870 #ifdef KMP_DEBUG
1871 {
1872 const char * buff;
1873 // create format specifiers before the debug output
1874 buff = __kmp_str_format(
1875 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1876 traits_t< UT >::spec, traits_t< UT >::spec );
1877 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1878 __kmp_str_free( &buff );
1879 }
1880 #endif
1881 } // if
1882 } // if
1883 } // case
1884 break;
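            // Worked example (illustrative only): kmp_sch_static_chunked assigns
            // chunks in a block-cyclic fashion. With parm1 (chunk) == 5 and 4
            // threads, the thread with tid 1 first computes init = 5*(0+1) == 5,
            // i.e. offsets 5..9; after count += nproc its next call computes
            // init = 5*(4+1) == 25, i.e. offsets 25..29, and so on until init
            // exceeds trip.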
1885
1886 case kmp_sch_dynamic_chunked:
1887 {
1888 T chunk = pr->u.p.parm1;
1889
1890 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1891 gtid ) );
1892
1893 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1894 trip = pr->u.p.tc - 1;
1895
1896 if ( (status = (init <= trip)) == 0 ) {
1897 *p_lb = 0;
1898 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001899 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001900 } else {
1901 start = pr->u.p.lb;
1902 limit = chunk + init - 1;
1903 incr = pr->u.p.st;
1904
1905 if ( (last = (limit >= trip)) != 0 )
1906 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001907
1908 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001909
1910 if ( incr == 1 ) {
1911 *p_lb = start + init;
1912 *p_ub = start + limit;
1913 } else {
1914 *p_lb = start + init * incr;
1915 *p_ub = start + limit * incr;
1916 }
1917
1918 if ( pr->ordered ) {
1919 pr->u.p.ordered_lower = init;
1920 pr->u.p.ordered_upper = limit;
1921 #ifdef KMP_DEBUG
1922 {
1923 const char * buff;
1924 // create format specifiers before the debug output
1925 buff = __kmp_str_format(
1926 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1927 traits_t< UT >::spec, traits_t< UT >::spec );
1928 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1929 __kmp_str_free( &buff );
1930 }
1931 #endif
1932 } // if
1933 } // if
1934 } // case
1935 break;
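            // Worked example (illustrative only): for kmp_sch_dynamic_chunked every
            // thread atomically increments the shared iteration counter and owns the
            // chunk whose old value it fetched. With chunk == 8, the thread that
            // obtains counter value 2 gets offsets 16..23 (clamped to the trip
            // count), independent of which threads took counters 0 and 1.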
1936
1937 case kmp_sch_guided_iterative_chunked:
1938 {
1939 T chunkspec = pr->u.p.parm1;
1940 KD_TRACE(100,
1941 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1942 trip = pr->u.p.tc;
1943 // Start atomic part of calculations
1944 while(1) {
1945 ST remaining; // signed, because can be < 0
1946 init = sh->u.s.iteration; // shared value
1947 remaining = trip - init;
1948 if ( remaining <= 0 ) { // AC: need to compare with 0 first
1949 // nothing to do, don't try atomic op
1950 status = 0;
1951 break;
1952 }
1953 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1954                            // use dynamic-style schedule
1955                            // atomically increment iterations, get old value
1956 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1957 remaining = trip - init;
1958 if (remaining <= 0) {
1959 status = 0; // all iterations got by other threads
1960 } else {
1961 // got some iterations to work on
1962 status = 1;
1963 if ( (T)remaining > chunkspec ) {
1964 limit = init + chunkspec - 1;
1965 } else {
1966 last = 1; // the last chunk
1967 limit = init + remaining - 1;
1968 } // if
1969 } // if
1970 break;
1971 } // if
1972 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1973 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1974 // CAS was successful, chunk obtained
1975 status = 1;
1976 --limit;
1977 break;
1978 } // if
1979 } // while
1980 if ( status != 0 ) {
1981 start = pr->u.p.lb;
1982 incr = pr->u.p.st;
1983 if ( p_st != NULL )
1984 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001985 *p_lb = start + init * incr;
1986 *p_ub = start + limit * incr;
1987 if ( pr->ordered ) {
1988 pr->u.p.ordered_lower = init;
1989 pr->u.p.ordered_upper = limit;
1990 #ifdef KMP_DEBUG
1991 {
1992 const char * buff;
1993 // create format specifiers before the debug output
1994 buff = __kmp_str_format(
1995 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1996 traits_t< UT >::spec, traits_t< UT >::spec );
1997 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1998 __kmp_str_free( &buff );
1999 }
2000 #endif
2001 } // if
2002 } else {
2003 *p_lb = 0;
2004 *p_ub = 0;
2005 if ( p_st != NULL )
2006 *p_st = 0;
2007 } // if
2008 } // case
2009 break;
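            // Worked example (illustrative sketch, assuming the default guided
            // parameters, where parm3 is roughly 0.5/nproc): each successful CAS
            // above grabs about remaining/(2*nproc) iterations. With nproc == 4 and
            // 800 iterations left, one thread takes ~100 of them, the next ~87 of
            // the remaining ~700, and so on; once fewer than parm2 iterations
            // remain, the code falls back to plain dynamic chunks of size chunkspec.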
2010
2011 case kmp_sch_guided_analytical_chunked:
2012 {
2013 T chunkspec = pr->u.p.parm1;
2014 UT chunkIdx;
2015 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2016 /* for storing original FPCW value for Windows* OS on
2017 IA-32 architecture 8-byte version */
2018 unsigned int oldFpcw;
Jim Cownie181b4bb2013-12-23 17:28:57 +00002019 unsigned int fpcwSet = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002020 #endif
2021 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
2022 gtid ) );
2023
2024 trip = pr->u.p.tc;
2025
Jonathan Peytonff5ca8b2016-06-21 18:30:15 +00002026 KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
2027 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc < trip);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002028
2029 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
2030 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
2031 if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
2032 --trip;
2033 /* use dynamic-style scheduling */
2034 init = chunkIdx * chunkspec + pr->u.p.count;
2035 /* need to verify init > 0 in case of overflow in the above calculation */
2036 if ( (status = (init > 0 && init <= trip)) != 0 ) {
2037 limit = init + chunkspec -1;
2038
2039 if ( (last = (limit >= trip)) != 0 )
2040 limit = trip;
2041 }
2042 break;
2043 } else {
2044 /* use exponential-style scheduling */
2045                        /* The following check is to work around the lack of long double precision on Windows* OS.
2046 This check works around the possible effect that init != 0 for chunkIdx == 0.
2047 */
2048 #if KMP_OS_WINDOWS && KMP_ARCH_X86
2049 /* If we haven't already done so, save original
2050 FPCW and set precision to 64-bit, as Windows* OS
2051 on IA-32 architecture defaults to 53-bit */
2052 if ( !fpcwSet ) {
Jim Cownie181b4bb2013-12-23 17:28:57 +00002053 oldFpcw = _control87(0,0);
2054 _control87(_PC_64,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002055 fpcwSet = 0x30000;
2056 }
2057 #endif
2058 if ( chunkIdx ) {
2059 init = __kmp_dispatch_guided_remaining< T >(
2060 trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
2061 KMP_DEBUG_ASSERT(init);
2062 init = trip - init;
2063 } else
2064 init = 0;
2065 limit = trip - __kmp_dispatch_guided_remaining< T >(
2066 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
2067 KMP_ASSERT(init <= limit);
2068 if ( init < limit ) {
2069 KMP_DEBUG_ASSERT(limit <= trip);
2070 --limit;
2071 status = 1;
2072 break;
2073 } // if
2074 } // if
2075 } // while (1)
2076 #if KMP_OS_WINDOWS && KMP_ARCH_X86
Jim Cownie181b4bb2013-12-23 17:28:57 +00002077 /* restore FPCW if necessary
2078 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
2079 */
2080 if ( fpcwSet && ( oldFpcw & fpcwSet ) )
2081 _control87(oldFpcw,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002082 #endif
2083 if ( status != 0 ) {
2084 start = pr->u.p.lb;
2085 incr = pr->u.p.st;
2086 if ( p_st != NULL )
2087 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002088 *p_lb = start + init * incr;
2089 *p_ub = start + limit * incr;
2090 if ( pr->ordered ) {
2091 pr->u.p.ordered_lower = init;
2092 pr->u.p.ordered_upper = limit;
2093 #ifdef KMP_DEBUG
2094 {
2095 const char * buff;
2096 // create format specifiers before the debug output
2097 buff = __kmp_str_format(
2098 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2099 traits_t< UT >::spec, traits_t< UT >::spec );
2100 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2101 __kmp_str_free( &buff );
2102 }
2103 #endif
2104 }
2105 } else {
2106 *p_lb = 0;
2107 *p_ub = 0;
2108 if ( p_st != NULL )
2109 *p_st = 0;
2110 }
2111 } // case
2112 break;
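            // Sketch (illustrative only) of the x87 precision save/restore pattern
            // used above for Windows* OS on IA-32 architecture:
            //
            //     unsigned int oldFpcw = _control87(0, 0); // mask 0: read only
            //     _control87(_PC_64, _MCW_PC);             // 64-bit significand
            //     /* ... long double chunk computations ... */
            //     _control87(oldFpcw, _MCW_PC);            // restore precision
            //
            // The code above restores conditionally via the fpcwSet flag because
            // oldFpcw stays uninitialized when no chunk computation was done.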
2113
2114 case kmp_sch_trapezoidal:
2115 {
2116 UT index;
2117 T parm2 = pr->u.p.parm2;
2118 T parm3 = pr->u.p.parm3;
2119 T parm4 = pr->u.p.parm4;
2120 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2121 gtid ) );
2122
2123 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2124
2125 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2126 trip = pr->u.p.tc - 1;
2127
2128 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2129 *p_lb = 0;
2130 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002131 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002132 } else {
2133 start = pr->u.p.lb;
2134 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2135 incr = pr->u.p.st;
2136
2137 if ( (last = (limit >= trip)) != 0 )
2138 limit = trip;
2139
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002140 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002141
2142 if ( incr == 1 ) {
2143 *p_lb = start + init;
2144 *p_ub = start + limit;
2145 } else {
2146 *p_lb = start + init * incr;
2147 *p_ub = start + limit * incr;
2148 }
2149
2150 if ( pr->ordered ) {
2151 pr->u.p.ordered_lower = init;
2152 pr->u.p.ordered_upper = limit;
2153 #ifdef KMP_DEBUG
2154 {
2155 const char * buff;
2156 // create format specifiers before the debug output
2157 buff = __kmp_str_format(
2158 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2159 traits_t< UT >::spec, traits_t< UT >::spec );
2160 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2161 __kmp_str_free( &buff );
2162 }
2163 #endif
2164 } // if
2165 } // if
2166 } // case
2167 break;
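            // Worked example (illustrative only): kmp_sch_trapezoidal hands out
            // chunks whose sizes decrease linearly; parm2 is the first chunk size,
            // parm4 the per-chunk decrement, parm3 the number of chunks. With
            // parm2 == 10 and parm4 == 2 the chunk sizes are 10, 8, 6, ..., and the
            // init formula above is the arithmetic-series sum: for index == 2,
            // init = 2*(2*10 - 1*2)/2 == 18 == 10 + 8, so the third chunk starts at
            // offset 18 and covers 6 iterations.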
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002168 default:
2169 {
2170 status = 0; // to avoid complaints on uninitialized variable use
2171 __kmp_msg(
2172 kmp_ms_fatal, // Severity
2173 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2174 KMP_HNT( GetNewerLibrary ), // Hint
2175 __kmp_msg_null // Variadic argument list terminator
2176 );
2177 }
2178 break;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002179 } // switch
2180 } // if tc == 0;
2181
2182 if ( status == 0 ) {
2183 UT num_done;
2184
2185 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2186 #ifdef KMP_DEBUG
2187 {
2188 const char * buff;
2189 // create format specifiers before the debug output
2190 buff = __kmp_str_format(
2191 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2192 traits_t< UT >::spec );
2193 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2194 __kmp_str_free( &buff );
2195 }
2196 #endif
2197
Jonathan Peytonff5ca8b2016-06-21 18:30:15 +00002198 if ( (ST)num_done == th->th.th_team_nproc - 1 ) {
Andrey Churbanov429dbc22016-07-11 10:44:57 +00002199 #if ( KMP_STATIC_STEAL_ENABLED )
Jonathan Peyton12313d42017-01-27 18:09:22 +00002200 if( pr->schedule == kmp_sch_static_steal && traits_t<T>::type_size > 4 ) {
Andrey Churbanov429dbc22016-07-11 10:44:57 +00002201 int i;
2202 kmp_info_t **other_threads = team->t.t_threads;
2203 // loop complete, safe to destroy locks used for stealing
2204 for( i = 0; i < th->th.th_team_nproc; ++i ) {
2205 kmp_lock_t * lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2206 KMP_ASSERT(lck != NULL);
2207 __kmp_destroy_lock( lck );
2208 __kmp_free( lck );
2209 other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2210 }
2211 }
2212 #endif
Jim Cownie5e8470a2013-09-27 10:38:44 +00002213 /* NOTE: release this buffer to be reused */
2214
2215 KMP_MB(); /* Flush all pending memory write invalidates. */
2216
2217 sh->u.s.num_done = 0;
2218 sh->u.s.iteration = 0;
2219
2220 /* TODO replace with general release procedure? */
2221 if ( pr->ordered ) {
2222 sh->u.s.ordered_iteration = 0;
2223 }
2224
2225 KMP_MB(); /* Flush all pending memory write invalidates. */
2226
Jonathan Peyton067325f2016-05-31 19:01:15 +00002227 sh -> buffer_index += __kmp_dispatch_num_buffers;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002228 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2229 gtid, sh->buffer_index) );
2230
2231 KMP_MB(); /* Flush all pending memory write invalidates. */
2232
2233 } // if
2234 if ( __kmp_env_consistency_check ) {
2235 if ( pr->pushed_ws != ct_none ) {
2236 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2237 }
2238 }
2239
2240 th -> th.th_dispatch -> th_deo_fcn = NULL;
2241 th -> th.th_dispatch -> th_dxo_fcn = NULL;
2242 th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2243 th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2244 } // if (status == 0)
2245#if KMP_OS_WINDOWS
2246 else if ( last ) {
2247 pr->u.p.last_upper = pr->u.p.ub;
2248 }
2249#endif /* KMP_OS_WINDOWS */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002250 if ( p_last != NULL && status != 0 )
2251 *p_last = last;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002252 } // if
2253
2254 #ifdef KMP_DEBUG
2255 {
2256 const char * buff;
2257 // create format specifiers before the debug output
2258 buff = __kmp_str_format(
2259 "__kmp_dispatch_next: T#%%d normal case: " \
2260 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2261 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2262 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2263 __kmp_str_free( &buff );
2264 }
2265 #endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002266#if INCLUDE_SSC_MARKS
2267 SSC_MARK_DISPATCH_NEXT();
2268#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00002269 OMPT_LOOP_END;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002270 return status;
2271}
2272
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002273template< typename T >
2274static void
2275__kmp_dist_get_bounds(
2276 ident_t *loc,
2277 kmp_int32 gtid,
2278 kmp_int32 *plastiter,
2279 T *plower,
2280 T *pupper,
2281 typename traits_t< T >::signed_t incr
2282) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002283 typedef typename traits_t< T >::unsigned_t UT;
2284 typedef typename traits_t< T >::signed_t ST;
2285 register kmp_uint32 team_id;
2286 register kmp_uint32 nteams;
2287 register UT trip_count;
2288 register kmp_team_t *team;
2289 kmp_info_t * th;
2290
2291 KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2292 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2293 #ifdef KMP_DEBUG
2294 {
2295 const char * buff;
2296 // create format specifiers before the debug output
2297 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2298 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2299 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2300 traits_t< T >::spec );
2301 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2302 __kmp_str_free( &buff );
2303 }
2304 #endif
2305
2306 if( __kmp_env_consistency_check ) {
2307 if( incr == 0 ) {
2308 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2309 }
2310 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2311 // The loop is illegal.
2312            // Some zero-trip loops are maintained by the compiler, e.g.:
2313 // for(i=10;i<0;++i) // lower >= upper - run-time check
2314 // for(i=0;i>10;--i) // lower <= upper - run-time check
2315 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2316 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2317 // Compiler does not check the following illegal loops:
2318 // for(i=0;i<10;i+=incr) // where incr<0
2319 // for(i=10;i>0;i-=incr) // where incr<0
2320 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2321 }
2322 }
2323 th = __kmp_threads[gtid];
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002324 team = th->th.th_team;
2325 #if OMP_40_ENABLED
Jonathan Peyton441f3372015-09-21 17:24:46 +00002326 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002327 nteams = th->th.th_teams_size.nteams;
2328 #endif
2329 team_id = team->t.t_master_tid;
2330 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2331
2332 // compute global trip count
2333 if( incr == 1 ) {
2334 trip_count = *pupper - *plower + 1;
2335 } else if(incr == -1) {
2336 trip_count = *plower - *pupper + 1;
Jonathan Peyton5235a1b2016-04-18 21:38:29 +00002337 } else if ( incr > 0 ) {
2338 // upper-lower can exceed the limit of signed type
2339 trip_count = (UT)(*pupper - *plower) / incr + 1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002340 } else {
Jonathan Peyton5235a1b2016-04-18 21:38:29 +00002341 trip_count = (UT)(*plower - *pupper) / ( -incr ) + 1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002342 }
Jonathan Peyton45be4502015-08-11 21:36:41 +00002343
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002344 if( trip_count <= nteams ) {
2345 KMP_DEBUG_ASSERT(
2346 __kmp_static == kmp_sch_static_greedy || \
2347 __kmp_static == kmp_sch_static_balanced
2348 ); // Unknown static scheduling type.
2349 // only some teams get single iteration, others get nothing
2350 if( team_id < trip_count ) {
2351 *pupper = *plower = *plower + team_id * incr;
2352 } else {
2353 *plower = *pupper + incr; // zero-trip loop
2354 }
2355 if( plastiter != NULL )
2356 *plastiter = ( team_id == trip_count - 1 );
2357 } else {
2358 if( __kmp_static == kmp_sch_static_balanced ) {
2359 register UT chunk = trip_count / nteams;
2360 register UT extras = trip_count % nteams;
2361 *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2362 *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2363 if( plastiter != NULL )
2364 *plastiter = ( team_id == nteams - 1 );
2365 } else {
2366 register T chunk_inc_count =
2367 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2368 register T upper = *pupper;
2369 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2370 // Unknown static scheduling type.
2371 *plower += team_id * chunk_inc_count;
2372 *pupper = *plower + chunk_inc_count - incr;
2373 // Check/correct bounds if needed
2374 if( incr > 0 ) {
2375 if( *pupper < *plower )
Jonathan Peyton12313d42017-01-27 18:09:22 +00002376 *pupper = traits_t<T>::max_value;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002377 if( plastiter != NULL )
2378 *plastiter = *plower <= upper && *pupper > upper - incr;
2379 if( *pupper > upper )
2380 *pupper = upper; // tracker C73258
2381 } else {
2382 if( *pupper > *plower )
Jonathan Peyton12313d42017-01-27 18:09:22 +00002383 *pupper = traits_t<T>::min_value;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002384 if( plastiter != NULL )
2385 *plastiter = *plower >= upper && *pupper < upper - incr;
2386 if( *pupper < upper )
2387 *pupper = upper; // tracker C73258
2388 }
2389 }
2390 }
2391}
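// Worked example (illustrative only): with __kmp_static == kmp_sch_static_balanced,
// 4 teams and a 10-iteration loop (incr == 1), chunk == 2 and extras == 2, so the
// per-team bounds computed above are [0,2], [3,5], [6,7] and [8,9]; the first two
// teams receive one extra iteration and only the last team gets *plastiter set.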
2392
Jim Cownie5e8470a2013-09-27 10:38:44 +00002393//-----------------------------------------------------------------------------------------
2394// Dispatch routines
2395// Transfer call to template< type T >
2396// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2397// T lb, T ub, ST st, ST chunk )
2398extern "C" {
2399
2400/*!
2401@ingroup WORK_SHARING
2402@{
2403@param loc Source location
2404@param gtid Global thread id
2405@param schedule Schedule type
2406@param lb Lower bound
2407@param ub Upper bound
2408@param st Step (or increment if you prefer)
2409@param chunk The chunk size to block with
2410
2411This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2412These functions are all identical apart from the types of the arguments.
2413*/
2414
2415void
2416__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2417 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2418{
2419 KMP_DEBUG_ASSERT( __kmp_init_serial );
2420 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2421}
2422/*!
2423See @ref __kmpc_dispatch_init_4
2424*/
2425void
2426__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2427 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2428{
2429 KMP_DEBUG_ASSERT( __kmp_init_serial );
2430 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2431}
2432
2433/*!
2434See @ref __kmpc_dispatch_init_4
2435*/
2436void
2437__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2438 kmp_int64 lb, kmp_int64 ub,
2439 kmp_int64 st, kmp_int64 chunk )
2440{
2441 KMP_DEBUG_ASSERT( __kmp_init_serial );
2442 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2443}
2444
2445/*!
2446See @ref __kmpc_dispatch_init_4
2447*/
2448void
2449__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2450 kmp_uint64 lb, kmp_uint64 ub,
2451 kmp_int64 st, kmp_int64 chunk )
2452{
2453 KMP_DEBUG_ASSERT( __kmp_init_serial );
2454 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2455}
2456
2457/*!
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002458See @ref __kmpc_dispatch_init_4
2459
2460These functions differ from the __kmpc_dispatch_init set in that they are
2461called for the composite distribute parallel for construct, so the per-team
2462iteration space must be computed before the regular iterations are dispatched.
2463
2464These functions are all identical apart from the types of the arguments.
2465*/
2466void
2467__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2468 kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2469{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002470 KMP_DEBUG_ASSERT( __kmp_init_serial );
2471 __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2472 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2473}
2474
2475void
2476__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2477 kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2478{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002479 KMP_DEBUG_ASSERT( __kmp_init_serial );
2480 __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2481 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2482}
2483
2484void
2485__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2486 kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2487{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002488 KMP_DEBUG_ASSERT( __kmp_init_serial );
2489 __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2490 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2491}
2492
2493void
2494__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2495 kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2496{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002497 KMP_DEBUG_ASSERT( __kmp_init_serial );
2498 __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2499 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2500}
2501
2502/*!
Jim Cownie5e8470a2013-09-27 10:38:44 +00002503@param loc Source code location
2504@param gtid Global thread id
2505@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2506@param p_lb Pointer to the lower bound for the next chunk of work
2507@param p_ub Pointer to the upper bound for the next chunk of work
2508@param p_st Pointer to the stride for the next chunk of work
2509@return one if there is work to be done, zero otherwise
2510
2511Get the next dynamically allocated chunk of work for this thread.
2512If there is no more work, then lb, ub and stride need not be modified.
2513*/
2514int
2515__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2516 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2517{
2518 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2519}
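/*
Usage sketch (illustrative only; the exact lowering is compiler dependent): for a
loop such as "#pragma omp for schedule(dynamic,4)" over i = 0..99, a compiler may
emit a call sequence along these lines, where "loc", "gtid" and "body" stand for
the compiler-provided source location, global thread id and loop body:

    kmp_int32 lb, ub, st, last;
    __kmpc_dispatch_init_4( &loc, gtid, kmp_sch_dynamic_chunked, 0, 99, 1, 4 );
    while ( __kmpc_dispatch_next_4( &loc, gtid, &last, &lb, &ub, &st ) ) {
        for ( kmp_int32 i = lb; i <= ub; i += st )
            body( i );
    }
*/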
2520
2521/*!
2522See @ref __kmpc_dispatch_next_4
2523*/
2524int
2525__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2526 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2527{
2528 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2529}
2530
2531/*!
2532See @ref __kmpc_dispatch_next_4
2533*/
2534int
2535__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2536 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2537{
2538 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2539}
2540
2541/*!
2542See @ref __kmpc_dispatch_next_4
2543*/
2544int
2545__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2546 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2547{
2548 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2549}
2550
2551/*!
2552@param loc Source code location
2553@param gtid Global thread id
2554
2555Mark the end of a dynamic loop.
2556*/
2557void
2558__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2559{
2560 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2561}
2562
2563/*!
2564See @ref __kmpc_dispatch_fini_4
2565*/
2566void
2567__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2568{
2569 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2570}
2571
2572/*!
2573See @ref __kmpc_dispatch_fini_4
2574*/
2575void
2576__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2577{
2578 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2579}
2580
2581/*!
2582See @ref __kmpc_dispatch_fini_4
2583*/
2584void
2585__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2586{
2587 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2588}
2589/*! @} */
2590
2591//-----------------------------------------------------------------------------------------
Jonathan Peytonde4749b2016-12-14 23:01:24 +00002592//Non-template routines from kmp_dispatch.cpp used in other sources
Jim Cownie5e8470a2013-09-27 10:38:44 +00002593
2594kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2595 return value == checker;
2596}
2597
2598kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2599 return value != checker;
2600}
2601
2602kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2603 return value < checker;
2604}
2605
2606kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2607 return value >= checker;
2608}
2609
2610kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2611 return value <= checker;
2612}
Jim Cownie5e8470a2013-09-27 10:38:44 +00002613
2614kmp_uint32
2615__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2616 kmp_uint32 checker,
2617 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2618 , void * obj // Higher-level synchronization object, or NULL.
2619 )
2620{
2621 // note: we may not belong to a team at this point
2622 register volatile kmp_uint32 * spin = spinner;
2623 register kmp_uint32 check = checker;
2624 register kmp_uint32 spins;
2625 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2626 register kmp_uint32 r;
2627
2628 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2629 KMP_INIT_YIELD( spins );
2630 // main wait spin loop
2631 while(!f(r = TCR_4(*spin), check)) {
2632 KMP_FSYNC_SPIN_PREPARE( obj );
2633 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2634 It causes problems with infinite recursion because of exit lock */
2635 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2636 __kmp_abort_thread(); */
2637
Jim Cownie5e8470a2013-09-27 10:38:44 +00002638 /* if we have waited a bit, or are oversubscribed, yield */
2639 /* pause is in the following code */
2640 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2641 KMP_YIELD_SPIN( spins );
2642 }
2643 KMP_FSYNC_SPIN_ACQUIRED( obj );
2644 return r;
2645}
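/*
Usage sketch (illustrative only): the predicate argument is typically one of the
small comparators above, e.g. spinning until a 4-byte flag (set elsewhere by
another thread) becomes equal to 1:

    volatile kmp_uint32 flag = 0;
    __kmp_wait_yield_4( &flag, 1, __kmp_eq_4, NULL );
*/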
2646
Paul Osmialowskif7cc6af2016-05-31 20:20:32 +00002647void
2648__kmp_wait_yield_4_ptr(void *spinner,
2649 kmp_uint32 checker,
2650 kmp_uint32 (*pred)( void *, kmp_uint32 ),
2651 void *obj // Higher-level synchronization object, or NULL.
2652 )
2653{
2654 // note: we may not belong to a team at this point
2655 register void *spin = spinner;
2656 register kmp_uint32 check = checker;
2657 register kmp_uint32 spins;
2658 register kmp_uint32 (*f) ( void *, kmp_uint32 ) = pred;
2659
2660 KMP_FSYNC_SPIN_INIT( obj, spin );
2661 KMP_INIT_YIELD( spins );
2662 // main wait spin loop
2663 while ( !f( spin, check ) ) {
2664 KMP_FSYNC_SPIN_PREPARE( obj );
2665 /* if we have waited a bit, or are oversubscribed, yield */
2666 /* pause is in the following code */
2667 KMP_YIELD( TCR_4( __kmp_nth ) > __kmp_avail_proc );
2668 KMP_YIELD_SPIN( spins );
2669 }
2670 KMP_FSYNC_SPIN_ACQUIRED( obj );
2671}
2672
Jim Cownie5e8470a2013-09-27 10:38:44 +00002673} // extern "C"
2674
2675#ifdef KMP_GOMP_COMPAT
2676
2677void
2678__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2679 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2680 kmp_int32 chunk, int push_ws )
2681{
2682 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2683 push_ws );
2684}
2685
2686void
2687__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2688 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2689 kmp_int32 chunk, int push_ws )
2690{
2691 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2692 push_ws );
2693}
2694
2695void
2696__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2697 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2698 kmp_int64 chunk, int push_ws )
2699{
2700 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2701 push_ws );
2702}
2703
2704void
2705__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2706 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2707 kmp_int64 chunk, int push_ws )
2708{
2709 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2710 push_ws );
2711}
2712
2713void
2714__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2715{
2716 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2717}
2718
2719void
2720__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2721{
2722 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2723}
2724
2725void
2726__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2727{
2728 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2729}
2730
2731void
2732__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2733{
2734 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2735}
2736
2737#endif /* KMP_GOMP_COMPAT */
2738
2739/* ------------------------------------------------------------------------ */
2740/* ------------------------------------------------------------------------ */
2741