blob: cc58f493a69a3f31bd752d195ed9f8ef9781a12c [file] [log] [blame]
Jim Cownie5e8470a2013-09-27 10:38:44 +00001/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003 * $Revision: 43457 $
4 * $Date: 2014-09-17 03:57:22 -0500 (Wed, 17 Sep 2014) $
Jim Cownie5e8470a2013-09-27 10:38:44 +00005 */
6
7
8//===----------------------------------------------------------------------===//
9//
10// The LLVM Compiler Infrastructure
11//
12// This file is dual licensed under the MIT and the University of Illinois Open
13// Source Licenses. See LICENSE.txt for details.
14//
15//===----------------------------------------------------------------------===//
16
17
18/*
19 * Dynamic scheduling initialization and dispatch.
20 *
21 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
22 * it may change values between parallel regions. __kmp_max_nth
23 * is the largest value __kmp_nth may take, 1 is the smallest.
24 *
25 */
26
27/* ------------------------------------------------------------------------ */
28/* ------------------------------------------------------------------------ */
29
30#include "kmp.h"
31#include "kmp_i18n.h"
32#include "kmp_itt.h"
33#include "kmp_str.h"
34#include "kmp_error.h"
Jim Cownie4cc4bb42014-10-07 16:25:50 +000035#include "kmp_stats.h"
Jim Cownie5e8470a2013-09-27 10:38:44 +000036#if KMP_OS_WINDOWS && KMP_ARCH_X86
37 #include <float.h>
38#endif
39
40/* ------------------------------------------------------------------------ */
41/* ------------------------------------------------------------------------ */
42
Jim Cownie4cc4bb42014-10-07 16:25:50 +000043// template for type limits
44template< typename T >
45struct i_maxmin {
46 static const T mx;
47 static const T mn;
48};
49template<>
50struct i_maxmin< int > {
51 static const int mx = 0x7fffffff;
52 static const int mn = 0x80000000;
53};
54template<>
55struct i_maxmin< unsigned int > {
56 static const unsigned int mx = 0xffffffff;
57 static const unsigned int mn = 0x00000000;
58};
59template<>
60struct i_maxmin< long long > {
61 static const long long mx = 0x7fffffffffffffffLL;
62 static const long long mn = 0x8000000000000000LL;
63};
64template<>
65struct i_maxmin< unsigned long long > {
66 static const unsigned long long mx = 0xffffffffffffffffLL;
67 static const unsigned long long mn = 0x0000000000000000LL;
68};
69//-------------------------------------------------------------------------
70
Jim Cownie5e8470a2013-09-27 10:38:44 +000071#ifdef KMP_STATIC_STEAL_ENABLED
72
73 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
74 template< typename T >
75 struct dispatch_private_infoXX_template {
76 typedef typename traits_t< T >::unsigned_t UT;
77 typedef typename traits_t< T >::signed_t ST;
78 UT count; // unsigned
79 T ub;
80 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
81 T lb;
82 ST st; // signed
83 UT tc; // unsigned
84 T static_steal_counter; // for static_steal only; maybe better to put after ub
85
86 /* parm[1-4] are used in different ways by different scheduling algorithms */
87
88 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
89 // a) parm3 is properly aligned and
90 // b) all parm1-4 are in the same cache line.
91 // Because of parm1-4 are used together, performance seems to be better
92 // if they are in the same line (not measured though).
93
94 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
95 T parm1;
96 T parm2;
97 T parm3;
98 T parm4;
99 };
100
101 UT ordered_lower; // unsigned
102 UT ordered_upper; // unsigned
103 #if KMP_OS_WINDOWS
104 T last_upper;
105 #endif /* KMP_OS_WINDOWS */
106 };
107
108#else /* KMP_STATIC_STEAL_ENABLED */
109
110 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
111 template< typename T >
112 struct dispatch_private_infoXX_template {
113 typedef typename traits_t< T >::unsigned_t UT;
114 typedef typename traits_t< T >::signed_t ST;
115 T lb;
116 T ub;
117 ST st; // signed
118 UT tc; // unsigned
119
120 T parm1;
121 T parm2;
122 T parm3;
123 T parm4;
124
125 UT count; // unsigned
126
127 UT ordered_lower; // unsigned
128 UT ordered_upper; // unsigned
129 #if KMP_OS_WINDOWS
130 T last_upper;
131 #endif /* KMP_OS_WINDOWS */
132 };
133
134#endif /* KMP_STATIC_STEAL_ENABLED */
135
136// replaces dispatch_private_info structure and dispatch_private_info_t type
137template< typename T >
138struct KMP_ALIGN_CACHE dispatch_private_info_template {
139 // duplicate alignment here, otherwise size of structure is not correct in our compiler
140 union KMP_ALIGN_CACHE private_info_tmpl {
141 dispatch_private_infoXX_template< T > p;
142 dispatch_private_info64_t p64;
143 } u;
144 enum sched_type schedule; /* scheduling algorithm */
145 kmp_uint32 ordered; /* ordered clause specified */
146 kmp_uint32 ordered_bumped;
147 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making order
148 dispatch_private_info * next; /* stack of buffers for nest of serial regions */
149 kmp_uint32 nomerge; /* don't merge iters if serialized */
150 kmp_uint32 type_size;
151 enum cons_type pushed_ws;
152};
153
154
155// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
156template< typename UT >
157struct dispatch_shared_infoXX_template {
158 /* chunk index under dynamic, number of idle threads under static-steal;
159 iteration index otherwise */
160 volatile UT iteration;
161 volatile UT num_done;
162 volatile UT ordered_iteration;
163 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
164};
165
166// replaces dispatch_shared_info structure and dispatch_shared_info_t type
167template< typename UT >
168struct dispatch_shared_info_template {
169 // we need union here to keep the structure size
170 union shared_info_tmpl {
171 dispatch_shared_infoXX_template< UT > s;
172 dispatch_shared_info64_t s64;
173 } u;
174 volatile kmp_uint32 buffer_index;
175};
176
177/* ------------------------------------------------------------------------ */
178/* ------------------------------------------------------------------------ */
179
Jim Cownie5e8470a2013-09-27 10:38:44 +0000180#undef USE_TEST_LOCKS
181
182// test_then_add template (general template should NOT be used)
183template< typename T >
184static __forceinline T
185test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
186
187template<>
188__forceinline kmp_int32
189test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
190{
191 kmp_int32 r;
192 r = KMP_TEST_THEN_ADD32( p, d );
193 return r;
194}
195
196template<>
197__forceinline kmp_int64
198test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
199{
200 kmp_int64 r;
201 r = KMP_TEST_THEN_ADD64( p, d );
202 return r;
203}
204
205// test_then_inc_acq template (general template should NOT be used)
206template< typename T >
207static __forceinline T
208test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
209
210template<>
211__forceinline kmp_int32
212test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
213{
214 kmp_int32 r;
215 r = KMP_TEST_THEN_INC_ACQ32( p );
216 return r;
217}
218
219template<>
220__forceinline kmp_int64
221test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
222{
223 kmp_int64 r;
224 r = KMP_TEST_THEN_INC_ACQ64( p );
225 return r;
226}
227
228// test_then_inc template (general template should NOT be used)
229template< typename T >
230static __forceinline T
231test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
232
233template<>
234__forceinline kmp_int32
235test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
236{
237 kmp_int32 r;
238 r = KMP_TEST_THEN_INC32( p );
239 return r;
240}
241
242template<>
243__forceinline kmp_int64
244test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
245{
246 kmp_int64 r;
247 r = KMP_TEST_THEN_INC64( p );
248 return r;
249}
250
251// compare_and_swap template (general template should NOT be used)
252template< typename T >
253static __forceinline kmp_int32
254compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
255
256template<>
257__forceinline kmp_int32
258compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
259{
260 return KMP_COMPARE_AND_STORE_REL32( p, c, s );
261}
262
263template<>
264__forceinline kmp_int32
265compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
266{
267 return KMP_COMPARE_AND_STORE_REL64( p, c, s );
268}
269
270/*
271 Spin wait loop that first does pause, then yield.
272 Waits until function returns non-zero when called with *spinner and check.
273 Does NOT put threads to sleep.
274#if USE_ITT_BUILD
275 Arguments:
Alp Toker8f2d3f02014-02-24 10:40:15 +0000276 obj -- is higher-level synchronization object to report to ittnotify. It is used to report
Jim Cownie5e8470a2013-09-27 10:38:44 +0000277 locks consistently. For example, if lock is acquired immediately, its address is
278 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired
279 immediately and lock routine calls to KMP_WAIT_YIELD(), the later should report the same
280 address, not an address of low-level spinner.
281#endif // USE_ITT_BUILD
282*/
283template< typename UT >
284// ToDo: make inline function (move to header file for icl)
285static UT // unsigned 4- or 8-byte type
286__kmp_wait_yield( volatile UT * spinner,
287 UT checker,
288 kmp_uint32 (* pred)( UT, UT )
289 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
290 )
291{
292 // note: we may not belong to a team at this point
293 register volatile UT * spin = spinner;
294 register UT check = checker;
295 register kmp_uint32 spins;
296 register kmp_uint32 (*f) ( UT, UT ) = pred;
297 register UT r;
298
299 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
300 KMP_INIT_YIELD( spins );
301 // main wait spin loop
302 while(!f(r = *spin, check))
303 {
304 KMP_FSYNC_SPIN_PREPARE( obj );
305 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
306 It causes problems with infinite recursion because of exit lock */
307 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
308 __kmp_abort_thread(); */
309
Jim Cownie5e8470a2013-09-27 10:38:44 +0000310 // if we are oversubscribed,
311 // or have waited a bit (and KMP_LIBRARY=throughput, then yield
312 // pause is in the following code
313 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
314 KMP_YIELD_SPIN( spins );
315 }
316 KMP_FSYNC_SPIN_ACQUIRED( obj );
317 return r;
318}
319
320template< typename UT >
321static kmp_uint32 __kmp_eq( UT value, UT checker) {
322 return value == checker;
323}
324
325template< typename UT >
326static kmp_uint32 __kmp_neq( UT value, UT checker) {
327 return value != checker;
328}
329
330template< typename UT >
331static kmp_uint32 __kmp_lt( UT value, UT checker) {
332 return value < checker;
333}
334
335template< typename UT >
336static kmp_uint32 __kmp_ge( UT value, UT checker) {
337 return value >= checker;
338}
339
340template< typename UT >
341static kmp_uint32 __kmp_le( UT value, UT checker) {
342 return value <= checker;
343}
344
345
346/* ------------------------------------------------------------------------ */
347/* ------------------------------------------------------------------------ */
348
349static void
350__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
351{
352 kmp_info_t *th;
353
354 KMP_DEBUG_ASSERT( gtid_ref );
355
356 if ( __kmp_env_consistency_check ) {
357 th = __kmp_threads[*gtid_ref];
358 if ( th -> th.th_root -> r.r_active
359 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
360 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
361 }
362 }
363}
364
365template< typename UT >
366static void
367__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
368{
369 typedef typename traits_t< UT >::signed_t ST;
370 dispatch_private_info_template< UT > * pr;
371
372 int gtid = *gtid_ref;
373// int cid = *cid_ref;
374 kmp_info_t *th = __kmp_threads[ gtid ];
375 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
376
377 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
378 if ( __kmp_env_consistency_check ) {
379 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
380 ( th -> th.th_dispatch -> th_dispatch_pr_current );
381 if ( pr -> pushed_ws != ct_none ) {
382 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
383 }
384 }
385
386 if ( ! th -> th.th_team -> t.t_serialized ) {
387 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
388 ( th -> th.th_dispatch -> th_dispatch_sh_current );
389 UT lower;
390
391 if ( ! __kmp_env_consistency_check ) {
392 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
393 ( th -> th.th_dispatch -> th_dispatch_pr_current );
394 }
395 lower = pr->u.p.ordered_lower;
396
397 #if ! defined( KMP_GOMP_COMPAT )
398 if ( __kmp_env_consistency_check ) {
399 if ( pr->ordered_bumped ) {
400 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
401 __kmp_error_construct2(
402 kmp_i18n_msg_CnsMultipleNesting,
403 ct_ordered_in_pdo, loc_ref,
404 & p->stack_data[ p->w_top ]
405 );
406 }
407 }
408 #endif /* !defined(KMP_GOMP_COMPAT) */
409
410 KMP_MB();
411 #ifdef KMP_DEBUG
412 {
413 const char * buff;
414 // create format specifiers before the debug output
415 buff = __kmp_str_format(
416 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
417 traits_t< UT >::spec, traits_t< UT >::spec );
418 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
419 __kmp_str_free( &buff );
420 }
421 #endif
422
423 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
424 USE_ITT_BUILD_ARG( NULL )
425 );
426 KMP_MB(); /* is this necessary? */
427 #ifdef KMP_DEBUG
428 {
429 const char * buff;
430 // create format specifiers before the debug output
431 buff = __kmp_str_format(
432 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
433 traits_t< UT >::spec, traits_t< UT >::spec );
434 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
435 __kmp_str_free( &buff );
436 }
437 #endif
438 }
439 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
440}
441
442static void
443__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
444{
445 kmp_info_t *th;
446
447 if ( __kmp_env_consistency_check ) {
448 th = __kmp_threads[*gtid_ref];
449 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
450 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
451 }
452 }
453}
454
455template< typename UT >
456static void
457__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
458{
459 typedef typename traits_t< UT >::signed_t ST;
460 dispatch_private_info_template< UT > * pr;
461
462 int gtid = *gtid_ref;
463// int cid = *cid_ref;
464 kmp_info_t *th = __kmp_threads[ gtid ];
465 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
466
467 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
468 if ( __kmp_env_consistency_check ) {
469 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
470 ( th -> th.th_dispatch -> th_dispatch_pr_current );
471 if ( pr -> pushed_ws != ct_none ) {
472 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
473 }
474 }
475
476 if ( ! th -> th.th_team -> t.t_serialized ) {
477 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
478 ( th -> th.th_dispatch -> th_dispatch_sh_current );
479
480 if ( ! __kmp_env_consistency_check ) {
481 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
482 ( th -> th.th_dispatch -> th_dispatch_pr_current );
483 }
484
485 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
486 #if ! defined( KMP_GOMP_COMPAT )
487 if ( __kmp_env_consistency_check ) {
488 if ( pr->ordered_bumped != 0 ) {
489 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
490 /* How to test it? - OM */
491 __kmp_error_construct2(
492 kmp_i18n_msg_CnsMultipleNesting,
493 ct_ordered_in_pdo, loc_ref,
494 & p->stack_data[ p->w_top ]
495 );
496 }
497 }
498 #endif /* !defined(KMP_GOMP_COMPAT) */
499
500 KMP_MB(); /* Flush all pending memory write invalidates. */
501
502 pr->ordered_bumped += 1;
503
504 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
505 gtid, pr->ordered_bumped ) );
506
507 KMP_MB(); /* Flush all pending memory write invalidates. */
508
509 /* TODO use general release procedure? */
510 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
511
512 KMP_MB(); /* Flush all pending memory write invalidates. */
513 }
514 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
515}
516
517/* Computes and returns x to the power of y, where y must a non-negative integer */
518template< typename UT >
519static __forceinline long double
520__kmp_pow(long double x, UT y) {
521 long double s=1.0L;
522
523 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
524 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
525 while(y) {
526 if ( y & 1 )
527 s *= x;
528 x *= x;
529 y >>= 1;
530 }
531 return s;
532}
533
534/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
535 (the total number of unassigned iterations in chunks with index greater than or equal to idx).
536 __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
537 (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
538*/
539template< typename T >
540static __inline typename traits_t< T >::unsigned_t
541__kmp_dispatch_guided_remaining(
542 T tc,
543 typename traits_t< T >::floating_t base,
544 typename traits_t< T >::unsigned_t idx
545) {
546 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
547 least for ICL 8.1, long double arithmetic may not really have
548 long double precision, even with /Qlong_double. Currently, we
549 workaround that in the caller code, by manipulating the FPCW for
550 Windows* OS on IA-32 architecture. The lack of precision is not
551 expected to be a correctness issue, though.
552 */
553 typedef typename traits_t< T >::unsigned_t UT;
554
555 long double x = tc * __kmp_pow< UT >(base, idx);
556 UT r = (UT) x;
557 if ( x == r )
558 return r;
559 return r + 1;
560}
561
562// Parameters of the guided-iterative algorithm:
563// p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
564// p3 = 1 / ( n * nproc ) // remaining iterations multiplier
565// by default n = 2. For example with n = 3 the chunks distribution will be more flat.
566// With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
567static int guided_int_param = 2;
568static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
569
570// UT - unsigned flavor of T, ST - signed flavor of T,
571// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
572template< typename T >
573static void
574__kmp_dispatch_init(
575 ident_t * loc,
576 int gtid,
577 enum sched_type schedule,
578 T lb,
579 T ub,
580 typename traits_t< T >::signed_t st,
581 typename traits_t< T >::signed_t chunk,
582 int push_ws
583) {
584 typedef typename traits_t< T >::unsigned_t UT;
585 typedef typename traits_t< T >::signed_t ST;
586 typedef typename traits_t< T >::floating_t DBL;
587 static const int ___kmp_size_type = sizeof( UT );
588
589 int active;
590 T tc;
591 kmp_info_t * th;
592 kmp_team_t * team;
593 kmp_uint32 my_buffer_index;
594 dispatch_private_info_template< T > * pr;
595 dispatch_shared_info_template< UT > volatile * sh;
596
597 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
598 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
599
600 if ( ! TCR_4( __kmp_init_parallel ) )
601 __kmp_parallel_initialize();
602
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000603#if INCLUDE_SSC_MARKS
604 SSC_MARK_DISPATCH_INIT();
605#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000606 #ifdef KMP_DEBUG
607 {
608 const char * buff;
609 // create format specifiers before the debug output
610 buff = __kmp_str_format(
611 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
612 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
613 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
614 __kmp_str_free( &buff );
615 }
616 #endif
617 /* setup data */
618 th = __kmp_threads[ gtid ];
619 team = th -> th.th_team;
620 active = ! team -> t.t_serialized;
621 th->th.th_ident = loc;
622
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000623#if USE_ITT_BUILD
624 kmp_uint64 cur_chunk = chunk;
625#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000626 if ( ! active ) {
627 pr = reinterpret_cast< dispatch_private_info_template< T >* >
628 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
629 } else {
630 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
631 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
632
633 my_buffer_index = th->th.th_dispatch->th_disp_index ++;
634
635 /* What happens when number of threads changes, need to resize buffer? */
636 pr = reinterpret_cast< dispatch_private_info_template< T > * >
637 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
638 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
639 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
640 }
641
642 /* Pick up the nomerge/ordered bits from the scheduling type */
643 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
644 pr->nomerge = TRUE;
645 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
646 } else {
647 pr->nomerge = FALSE;
648 }
649 pr->type_size = ___kmp_size_type; // remember the size of variables
650 if ( kmp_ord_lower & schedule ) {
651 pr->ordered = TRUE;
652 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
653 } else {
654 pr->ordered = FALSE;
655 }
656 if ( schedule == kmp_sch_static ) {
657 schedule = __kmp_static;
658 } else {
659 if ( schedule == kmp_sch_runtime ) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000660 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
661 schedule = team -> t.t_sched.r_sched_type;
662 // Detail the schedule if needed (global controls are differentiated appropriately)
663 if ( schedule == kmp_sch_guided_chunked ) {
664 schedule = __kmp_guided;
665 } else if ( schedule == kmp_sch_static ) {
666 schedule = __kmp_static;
667 }
668 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
669 chunk = team -> t.t_sched.chunk;
Jim Cownie5e8470a2013-09-27 10:38:44 +0000670
671 #ifdef KMP_DEBUG
672 {
673 const char * buff;
674 // create format specifiers before the debug output
675 buff = __kmp_str_format(
676 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
677 traits_t< ST >::spec );
678 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
679 __kmp_str_free( &buff );
680 }
681 #endif
682 } else {
683 if ( schedule == kmp_sch_guided_chunked ) {
684 schedule = __kmp_guided;
685 }
686 if ( chunk <= 0 ) {
687 chunk = KMP_DEFAULT_CHUNK;
688 }
689 }
690
Jim Cownie5e8470a2013-09-27 10:38:44 +0000691 if ( schedule == kmp_sch_auto ) {
692 // mapping and differentiation: in the __kmp_do_serial_initialize()
693 schedule = __kmp_auto;
694 #ifdef KMP_DEBUG
695 {
696 const char * buff;
697 // create format specifiers before the debug output
698 buff = __kmp_str_format(
699 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
700 traits_t< ST >::spec );
701 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
702 __kmp_str_free( &buff );
703 }
704 #endif
705 }
Jim Cownie5e8470a2013-09-27 10:38:44 +0000706
707 /* guided analytical not safe for too many threads */
708 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
709 schedule = kmp_sch_guided_iterative_chunked;
710 KMP_WARNING( DispatchManyThreads );
711 }
712 pr->u.p.parm1 = chunk;
713 }
714 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
715 "unknown scheduling type" );
716
717 pr->u.p.count = 0;
718
719 if ( __kmp_env_consistency_check ) {
720 if ( st == 0 ) {
721 __kmp_error_construct(
722 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
723 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
724 );
725 }
726 }
727
728 tc = ( ub - lb + st );
729 if ( st != 1 ) {
730 if ( st < 0 ) {
731 if ( lb < ub ) {
732 tc = 0; // zero-trip
733 } else { // lb >= ub
734 tc = (ST)tc / st; // convert to signed division
735 }
736 } else { // st > 0
737 if ( ub < lb ) {
738 tc = 0; // zero-trip
739 } else { // lb >= ub
740 tc /= st;
741 }
742 }
743 } else if ( ub < lb ) { // st == 1
744 tc = 0; // zero-trip
745 }
746
747 pr->u.p.lb = lb;
748 pr->u.p.ub = ub;
749 pr->u.p.st = st;
750 pr->u.p.tc = tc;
751
752 #if KMP_OS_WINDOWS
753 pr->u.p.last_upper = ub + st;
754 #endif /* KMP_OS_WINDOWS */
755
756 /* NOTE: only the active parallel region(s) has active ordered sections */
757
758 if ( active ) {
759 if ( pr->ordered == 0 ) {
760 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
761 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
762 } else {
763 pr->ordered_bumped = 0;
764
765 pr->u.p.ordered_lower = 1;
766 pr->u.p.ordered_upper = 0;
767
768 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
769 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
770 }
771 }
772
773 if ( __kmp_env_consistency_check ) {
774 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
775 if ( push_ws ) {
776 __kmp_push_workshare( gtid, ws, loc );
777 pr->pushed_ws = ws;
778 } else {
779 __kmp_check_workshare( gtid, ws, loc );
780 pr->pushed_ws = ct_none;
781 }
782 }
783
784 switch ( schedule ) {
785 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
786 case kmp_sch_static_steal:
787 {
788 T nproc = team->t.t_nproc;
789 T ntc, init;
790
791 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
792
793 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
794 if ( nproc > 1 && ntc >= nproc ) {
795 T id = __kmp_tid_from_gtid(gtid);
796 T small_chunk, extras;
797
798 small_chunk = ntc / nproc;
799 extras = ntc % nproc;
800
801 init = id * small_chunk + ( id < extras ? id : extras );
802 pr->u.p.count = init;
803 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
804
805 pr->u.p.parm2 = lb;
806 //pr->pfields.parm3 = 0; // it's not used in static_steal
807 pr->u.p.parm4 = id;
808 pr->u.p.st = st;
809 break;
810 } else {
811 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
812 gtid ) );
813 schedule = kmp_sch_static_balanced;
814 /* too few iterations: fall-through to kmp_sch_static_balanced */
815 } // if
816 /* FALL-THROUGH to static balanced */
817 } // case
818 #endif
819 case kmp_sch_static_balanced:
820 {
821 T nproc = team->t.t_nproc;
822 T init, limit;
823
824 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
825 gtid ) );
826
827 if ( nproc > 1 ) {
828 T id = __kmp_tid_from_gtid(gtid);
829
830 if ( tc < nproc ) {
831 if ( id < tc ) {
832 init = id;
833 limit = id;
834 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
835 } else {
836 pr->u.p.count = 1; /* means no more chunks to execute */
837 pr->u.p.parm1 = FALSE;
838 break;
839 }
840 } else {
841 T small_chunk = tc / nproc;
842 T extras = tc % nproc;
843 init = id * small_chunk + (id < extras ? id : extras);
844 limit = init + small_chunk - (id < extras ? 0 : 1);
845 pr->u.p.parm1 = (id == nproc - 1);
846 }
847 } else {
848 if ( tc > 0 ) {
849 init = 0;
850 limit = tc - 1;
851 pr->u.p.parm1 = TRUE;
852 } else {
853 // zero trip count
854 pr->u.p.count = 1; /* means no more chunks to execute */
855 pr->u.p.parm1 = FALSE;
856 break;
857 }
858 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000859#if USE_ITT_BUILD
860 // Calculate chunk for metadata report
861 if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) {
862 cur_chunk = limit - init + 1;
863 }
864#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000865 if ( st == 1 ) {
866 pr->u.p.lb = lb + init;
867 pr->u.p.ub = lb + limit;
868 } else {
869 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
870 pr->u.p.lb = lb + init * st;
871 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
872 if ( st > 0 ) {
873 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
874 } else {
875 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
876 }
877 }
878 if ( pr->ordered ) {
879 pr->u.p.ordered_lower = init;
880 pr->u.p.ordered_upper = limit;
881 }
882 break;
883 } // case
884 case kmp_sch_guided_iterative_chunked :
885 {
886 T nproc = team->t.t_nproc;
887 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
888
889 if ( nproc > 1 ) {
890 if ( (2L * chunk + 1 ) * nproc >= tc ) {
891 /* chunk size too large, switch to dynamic */
892 schedule = kmp_sch_dynamic_chunked;
893 } else {
894 // when remaining iters become less than parm2 - switch to dynamic
895 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
896 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
897 }
898 } else {
899 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
900 schedule = kmp_sch_static_greedy;
901 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
902 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
903 pr->u.p.parm1 = tc;
904 } // if
905 } // case
906 break;
907 case kmp_sch_guided_analytical_chunked:
908 {
909 T nproc = team->t.t_nproc;
910 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
911
912 if ( nproc > 1 ) {
913 if ( (2L * chunk + 1 ) * nproc >= tc ) {
914 /* chunk size too large, switch to dynamic */
915 schedule = kmp_sch_dynamic_chunked;
916 } else {
917 /* commonly used term: (2 nproc - 1)/(2 nproc) */
918 DBL x;
919
920 #if KMP_OS_WINDOWS && KMP_ARCH_X86
921 /* Linux* OS already has 64-bit computation by default for
922 long double, and on Windows* OS on Intel(R) 64,
923 /Qlong_double doesn't work. On Windows* OS
924 on IA-32 architecture, we need to set precision to
925 64-bit instead of the default 53-bit. Even though long
926 double doesn't work on Windows* OS on Intel(R) 64, the
927 resulting lack of precision is not expected to impact
928 the correctness of the algorithm, but this has not been
929 mathematically proven.
930 */
931 // save original FPCW and set precision to 64-bit, as
932 // Windows* OS on IA-32 architecture defaults to 53-bit
Jim Cownie181b4bb2013-12-23 17:28:57 +0000933 unsigned int oldFpcw = _control87(0,0);
934 _control87(_PC_64,_MCW_PC); // 0,0x30000
Jim Cownie5e8470a2013-09-27 10:38:44 +0000935 #endif
936 /* value used for comparison in solver for cross-over point */
937 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
938
939 /* crossover point--chunk indexes equal to or greater than
940 this point switch to dynamic-style scheduling */
941 UT cross;
942
943 /* commonly used term: (2 nproc - 1)/(2 nproc) */
944 x = (long double)1.0 - (long double)0.5 / nproc;
945
946 #ifdef KMP_DEBUG
947 { // test natural alignment
948 struct _test_a {
949 char a;
950 union {
951 char b;
952 DBL d;
953 };
954 } t;
955 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
956 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
957 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
958 }
959 #endif // KMP_DEBUG
960
961 /* save the term in thread private dispatch structure */
962 *(DBL*)&pr->u.p.parm3 = x;
963
964 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
965 {
966 UT left, right, mid;
967 long double p;
968
969 /* estimate initial upper and lower bound */
970
971 /* doesn't matter what value right is as long as it is positive, but
972 it affects performance of the solver
973 */
974 right = 229;
975 p = __kmp_pow< UT >(x,right);
976 if ( p > target ) {
977 do{
978 p *= p;
979 right <<= 1;
980 } while(p>target && right < (1<<27));
981 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
982 } else {
983 left = 0;
984 }
985
986 /* bisection root-finding method */
987 while ( left + 1 < right ) {
988 mid = (left + right) / 2;
989 if ( __kmp_pow< UT >(x,mid) > target ) {
990 left = mid;
991 } else {
992 right = mid;
993 }
994 } // while
995 cross = right;
996 }
997 /* assert sanity of computed crossover point */
998 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
999
1000 /* save the crossover point in thread private dispatch structure */
1001 pr->u.p.parm2 = cross;
1002
1003 // C75803
1004 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1005 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1006 #else
1007 #define GUIDED_ANALYTICAL_WORKAROUND (x)
1008 #endif
1009 /* dynamic-style scheduling offset */
1010 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1011 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1012 // restore FPCW
Jim Cownie181b4bb2013-12-23 17:28:57 +00001013 _control87(oldFpcw,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001014 #endif
1015 } // if
1016 } else {
1017 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1018 gtid ) );
1019 schedule = kmp_sch_static_greedy;
1020 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1021 pr->u.p.parm1 = tc;
1022 } // if
1023 } // case
1024 break;
1025 case kmp_sch_static_greedy:
1026 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1027 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1028 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1029 tc;
1030 break;
1031 case kmp_sch_static_chunked :
1032 case kmp_sch_dynamic_chunked :
1033 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1034 break;
1035 case kmp_sch_trapezoidal :
1036 {
1037 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1038
1039 T parm1, parm2, parm3, parm4;
1040 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1041
1042 parm1 = chunk;
1043
1044 /* F : size of the first cycle */
1045 parm2 = ( tc / (2 * team->t.t_nproc) );
1046
1047 if ( parm2 < 1 ) {
1048 parm2 = 1;
1049 }
1050
1051 /* L : size of the last cycle. Make sure the last cycle
1052 * is not larger than the first cycle.
1053 */
1054 if ( parm1 < 1 ) {
1055 parm1 = 1;
1056 } else if ( parm1 > parm2 ) {
1057 parm1 = parm2;
1058 }
1059
1060 /* N : number of cycles */
1061 parm3 = ( parm2 + parm1 );
1062 parm3 = ( 2 * tc + parm3 - 1) / parm3;
1063
1064 if ( parm3 < 2 ) {
1065 parm3 = 2;
1066 }
1067
1068 /* sigma : decreasing incr of the trapezoid */
1069 parm4 = ( parm3 - 1 );
1070 parm4 = ( parm2 - parm1 ) / parm4;
1071
1072 // pointless check, because parm4 >= 0 always
1073 //if ( parm4 < 0 ) {
1074 // parm4 = 0;
1075 //}
1076
1077 pr->u.p.parm1 = parm1;
1078 pr->u.p.parm2 = parm2;
1079 pr->u.p.parm3 = parm3;
1080 pr->u.p.parm4 = parm4;
1081 } // case
1082 break;
1083
1084 default:
1085 {
1086 __kmp_msg(
1087 kmp_ms_fatal, // Severity
1088 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1089 KMP_HNT( GetNewerLibrary ), // Hint
1090 __kmp_msg_null // Variadic argument list terminator
1091 );
1092 }
1093 break;
1094 } // switch
1095 pr->schedule = schedule;
1096 if ( active ) {
1097 /* The name of this buffer should be my_buffer_index when it's free to use it */
1098
1099 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1100 gtid, my_buffer_index, sh->buffer_index) );
1101 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1102 USE_ITT_BUILD_ARG( NULL )
1103 );
1104 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1105 // *always* 32-bit integers.
1106 KMP_MB(); /* is this necessary? */
1107 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1108 gtid, my_buffer_index, sh->buffer_index) );
1109
1110 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1111 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1112#if USE_ITT_BUILD
1113 if ( pr->ordered ) {
1114 __kmp_itt_ordered_init( gtid );
1115 }; // if
1116#endif /* USE_ITT_BUILD */
1117 }; // if
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001118
1119#if USE_ITT_BUILD
1120 // Report loop metadata
1121 if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) {
1122 kmp_uint32 tid = __kmp_tid_from_gtid( gtid );
1123 if (KMP_MASTER_TID(tid)) {
1124 kmp_uint64 schedtype = 0;
1125
1126 switch ( schedule ) {
1127 case kmp_sch_static_chunked:
1128 case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1129 break;
1130 case kmp_sch_static_greedy:
1131 cur_chunk = pr->u.p.parm1;
1132 break;
1133 case kmp_sch_dynamic_chunked:
1134 schedtype = 1;
1135 break;
1136 case kmp_sch_guided_iterative_chunked:
1137 case kmp_sch_guided_analytical_chunked:
1138 schedtype = 2;
1139 break;
1140 default:
1141// Should we put this case under "static"?
1142// case kmp_sch_static_steal:
1143 schedtype = 3;
1144 break;
1145 }
1146 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1147 }
1148 }
1149#endif /* USE_ITT_BUILD */
1150
Jim Cownie5e8470a2013-09-27 10:38:44 +00001151 #ifdef KMP_DEBUG
1152 {
1153 const char * buff;
1154 // create format specifiers before the debug output
1155 buff = __kmp_str_format(
1156 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1157 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1158 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1159 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1160 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1161 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1162 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1163 KD_TRACE(10, ( buff,
1164 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1165 pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1166 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1167 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1168 __kmp_str_free( &buff );
1169 }
1170 #endif
1171 #if ( KMP_STATIC_STEAL_ENABLED )
1172 if ( ___kmp_size_type < 8 ) {
1173 // It cannot be guaranteed that after execution of a loop with some other schedule kind
1174 // all the parm3 variables will contain the same value.
1175 // Even if all parm3 will be the same, it still exists a bad case like using 0 and 1
1176 // rather than program life-time increment.
1177 // So the dedicated variable is required. The 'static_steal_counter' is used.
1178 if( schedule == kmp_sch_static_steal ) {
1179 // Other threads will inspect this variable when searching for a victim.
1180 // This is a flag showing that other threads may steal from this thread since then.
1181 volatile T * p = &pr->u.p.static_steal_counter;
1182 *p = *p + 1;
1183 }
1184 }
1185 #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING )
1186}
1187
1188/*
1189 * For ordered loops, either __kmp_dispatch_finish() should be called after
1190 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1191 * every chunk of iterations. If the ordered section(s) were not executed
1192 * for this iteration (or every iteration in this chunk), we need to set the
1193 * ordered iteration counters so that the next thread can proceed.
1194 */
1195template< typename UT >
1196static void
1197__kmp_dispatch_finish( int gtid, ident_t *loc )
1198{
1199 typedef typename traits_t< UT >::signed_t ST;
1200 kmp_info_t *th = __kmp_threads[ gtid ];
1201
1202 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1203 if ( ! th -> th.th_team -> t.t_serialized ) {
1204
1205 dispatch_private_info_template< UT > * pr =
1206 reinterpret_cast< dispatch_private_info_template< UT >* >
1207 ( th->th.th_dispatch->th_dispatch_pr_current );
1208 dispatch_shared_info_template< UT > volatile * sh =
1209 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1210 ( th->th.th_dispatch->th_dispatch_sh_current );
1211 KMP_DEBUG_ASSERT( pr );
1212 KMP_DEBUG_ASSERT( sh );
1213 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1214 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1215
1216 if ( pr->ordered_bumped ) {
1217 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1218 gtid ) );
1219 pr->ordered_bumped = 0;
1220 } else {
1221 UT lower = pr->u.p.ordered_lower;
1222
1223 #ifdef KMP_DEBUG
1224 {
1225 const char * buff;
1226 // create format specifiers before the debug output
1227 buff = __kmp_str_format(
1228 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1229 traits_t< UT >::spec, traits_t< UT >::spec );
1230 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1231 __kmp_str_free( &buff );
1232 }
1233 #endif
1234
1235 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1236 USE_ITT_BUILD_ARG(NULL)
1237 );
1238 KMP_MB(); /* is this necessary? */
1239 #ifdef KMP_DEBUG
1240 {
1241 const char * buff;
1242 // create format specifiers before the debug output
1243 buff = __kmp_str_format(
1244 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1245 traits_t< UT >::spec, traits_t< UT >::spec );
1246 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1247 __kmp_str_free( &buff );
1248 }
1249 #endif
1250
1251 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1252 } // if
1253 } // if
1254 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1255}
1256
1257#ifdef KMP_GOMP_COMPAT
1258
1259template< typename UT >
1260static void
1261__kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1262{
1263 typedef typename traits_t< UT >::signed_t ST;
1264 kmp_info_t *th = __kmp_threads[ gtid ];
1265
1266 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1267 if ( ! th -> th.th_team -> t.t_serialized ) {
1268// int cid;
1269 dispatch_private_info_template< UT > * pr =
1270 reinterpret_cast< dispatch_private_info_template< UT >* >
1271 ( th->th.th_dispatch->th_dispatch_pr_current );
1272 dispatch_shared_info_template< UT > volatile * sh =
1273 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1274 ( th->th.th_dispatch->th_dispatch_sh_current );
1275 KMP_DEBUG_ASSERT( pr );
1276 KMP_DEBUG_ASSERT( sh );
1277 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1278 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1279
1280// for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1281 UT lower = pr->u.p.ordered_lower;
1282 UT upper = pr->u.p.ordered_upper;
1283 UT inc = upper - lower + 1;
1284
1285 if ( pr->ordered_bumped == inc ) {
1286 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1287 gtid ) );
1288 pr->ordered_bumped = 0;
1289 } else {
1290 inc -= pr->ordered_bumped;
1291
1292 #ifdef KMP_DEBUG
1293 {
1294 const char * buff;
1295 // create format specifiers before the debug output
1296 buff = __kmp_str_format(
1297 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1298 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1299 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1300 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1301 __kmp_str_free( &buff );
1302 }
1303 #endif
1304
1305 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1306 USE_ITT_BUILD_ARG(NULL)
1307 );
1308
1309 KMP_MB(); /* is this necessary? */
1310 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1311 gtid ) );
1312 pr->ordered_bumped = 0;
1313//!!!!! TODO check if the inc should be unsigned, or signed???
1314 #ifdef KMP_DEBUG
1315 {
1316 const char * buff;
1317 // create format specifiers before the debug output
1318 buff = __kmp_str_format(
1319 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1320 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1321 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1322 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1323 __kmp_str_free( &buff );
1324 }
1325 #endif
1326
1327 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1328 }
1329// }
1330 }
1331 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1332}
1333
1334#endif /* KMP_GOMP_COMPAT */
1335
1336template< typename T >
1337static int
1338__kmp_dispatch_next(
1339 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1340) {
1341
1342 typedef typename traits_t< T >::unsigned_t UT;
1343 typedef typename traits_t< T >::signed_t ST;
1344 typedef typename traits_t< T >::floating_t DBL;
1345 static const int ___kmp_size_type = sizeof( UT );
1346
1347 int status;
1348 dispatch_private_info_template< T > * pr;
1349 kmp_info_t * th = __kmp_threads[ gtid ];
1350 kmp_team_t * team = th -> th.th_team;
1351
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001352 KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st ); // AC: these cannot be NULL
Jim Cownie5e8470a2013-09-27 10:38:44 +00001353 #ifdef KMP_DEBUG
1354 {
1355 const char * buff;
1356 // create format specifiers before the debug output
1357 buff = __kmp_str_format(
1358 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1359 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1360 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1361 __kmp_str_free( &buff );
1362 }
1363 #endif
1364
1365 if ( team -> t.t_serialized ) {
1366 /* NOTE: serialize this dispatch becase we are not at the active level */
1367 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1368 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1369 KMP_DEBUG_ASSERT( pr );
1370
1371 if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1372 *p_lb = 0;
1373 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001374// if ( p_last != NULL )
1375// *p_last = 0;
1376 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001377 *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001378 if ( __kmp_env_consistency_check ) {
1379 if ( pr->pushed_ws != ct_none ) {
1380 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1381 }
1382 }
1383 } else if ( pr->nomerge ) {
1384 kmp_int32 last;
1385 T start;
1386 UT limit, trip, init;
1387 ST incr;
1388 T chunk = pr->u.p.parm1;
1389
1390 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1391
1392 init = chunk * pr->u.p.count++;
1393 trip = pr->u.p.tc - 1;
1394
1395 if ( (status = (init <= trip)) == 0 ) {
1396 *p_lb = 0;
1397 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001398// if ( p_last != NULL )
1399// *p_last = 0;
1400 if ( p_st != NULL )
1401 *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001402 if ( __kmp_env_consistency_check ) {
1403 if ( pr->pushed_ws != ct_none ) {
1404 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1405 }
1406 }
1407 } else {
1408 start = pr->u.p.lb;
1409 limit = chunk + init - 1;
1410 incr = pr->u.p.st;
1411
1412 if ( (last = (limit >= trip)) != 0 ) {
1413 limit = trip;
1414 #if KMP_OS_WINDOWS
1415 pr->u.p.last_upper = pr->u.p.ub;
1416 #endif /* KMP_OS_WINDOWS */
1417 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001418 if ( p_last != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001419 *p_last = last;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001420 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001421 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001422 if ( incr == 1 ) {
1423 *p_lb = start + init;
1424 *p_ub = start + limit;
1425 } else {
1426 *p_lb = start + init * incr;
1427 *p_ub = start + limit * incr;
1428 }
1429
1430 if ( pr->ordered ) {
1431 pr->u.p.ordered_lower = init;
1432 pr->u.p.ordered_upper = limit;
1433 #ifdef KMP_DEBUG
1434 {
1435 const char * buff;
1436 // create format specifiers before the debug output
1437 buff = __kmp_str_format(
1438 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1439 traits_t< UT >::spec, traits_t< UT >::spec );
1440 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1441 __kmp_str_free( &buff );
1442 }
1443 #endif
1444 } // if
1445 } // if
1446 } else {
1447 pr->u.p.tc = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001448 *p_lb = pr->u.p.lb;
1449 *p_ub = pr->u.p.ub;
1450 #if KMP_OS_WINDOWS
1451 pr->u.p.last_upper = *p_ub;
1452 #endif /* KMP_OS_WINDOWS */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001453 if ( p_last != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001454 *p_last = TRUE;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001455 if ( p_st != NULL )
1456 *p_st = pr->u.p.st;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001457 } // if
1458 #ifdef KMP_DEBUG
1459 {
1460 const char * buff;
1461 // create format specifiers before the debug output
1462 buff = __kmp_str_format(
1463 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001464 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
Jim Cownie5e8470a2013-09-27 10:38:44 +00001465 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001466 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
Jim Cownie5e8470a2013-09-27 10:38:44 +00001467 __kmp_str_free( &buff );
1468 }
1469 #endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001470#if INCLUDE_SSC_MARKS
1471 SSC_MARK_DISPATCH_NEXT();
1472#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +00001473 return status;
1474 } else {
1475 kmp_int32 last = 0;
1476 dispatch_shared_info_template< UT > *sh;
1477 T start;
1478 ST incr;
1479 UT limit, trip, init;
1480
1481 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1482 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1483
1484 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1485 ( th->th.th_dispatch->th_dispatch_pr_current );
1486 KMP_DEBUG_ASSERT( pr );
1487 sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1488 ( th->th.th_dispatch->th_dispatch_sh_current );
1489 KMP_DEBUG_ASSERT( sh );
1490
1491 if ( pr->u.p.tc == 0 ) {
1492 // zero trip count
1493 status = 0;
1494 } else {
1495 switch (pr->schedule) {
1496 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1497 case kmp_sch_static_steal:
1498 {
1499 T chunk = pr->u.p.parm1;
1500
1501 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1502
1503 trip = pr->u.p.tc - 1;
1504
1505 if ( ___kmp_size_type > 4 ) {
1506 // Other threads do not look into the data of this thread,
1507 // so it's not necessary to make volatile casting.
1508 init = ( pr->u.p.count )++;
1509 status = ( init < (UT)pr->u.p.ub );
1510 } else {
1511 typedef union {
1512 struct {
1513 UT count;
1514 T ub;
1515 } p;
1516 kmp_int64 b;
1517 } union_i4;
1518 // All operations on 'count' or 'ub' must be combined atomically together.
1519 // stealing implemented only for 4-byte indexes
1520 {
1521 union_i4 vold, vnew;
1522 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1523 vnew = vold;
1524 vnew.p.count++;
1525 while( ! KMP_COMPARE_AND_STORE_ACQ64(
1526 ( volatile kmp_int64* )&pr->u.p.count,
1527 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1528 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1529 KMP_CPU_PAUSE();
1530 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1531 vnew = vold;
1532 vnew.p.count++;
1533 }
1534 vnew = vold;
1535 init = vnew.p.count;
1536 status = ( init < (UT)vnew.p.ub ) ;
1537 }
1538
1539 if( !status ) {
1540 kmp_info_t **other_threads = team->t.t_threads;
1541 int while_limit = 10;
1542 int while_index = 0;
1543
1544 // TODO: algorithm of searching for a victim
1545 // should be cleaned up and measured
1546 while ( ( !status ) && ( while_limit != ++while_index ) ) {
1547 union_i4 vold, vnew;
1548 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1549 T victimIdx = pr->u.p.parm4;
1550 T oldVictimIdx = victimIdx;
1551 dispatch_private_info_template< T > * victim;
1552
1553 do {
1554 if( !victimIdx ) {
1555 victimIdx = team->t.t_nproc - 1;
1556 } else {
1557 --victimIdx;
1558 }
1559 victim = reinterpret_cast< dispatch_private_info_template< T >* >
1560 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1561 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1562 // TODO: think about a proper place of this test
1563 if ( ( !victim ) ||
1564 ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1565 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1566 // TODO: delay would be nice
1567 continue;
1568 // the victim is not ready yet to participate in stealing
1569 // because the victim is still in kmp_init_dispatch
1570 }
1571 if ( oldVictimIdx == victimIdx ) {
1572 break;
1573 }
1574 pr->u.p.parm4 = victimIdx;
1575
1576 while( 1 ) {
1577 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1578 vnew = vold;
1579
1580 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1581 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1582 break;
1583 }
1584 vnew.p.ub -= (remaining >> 2);
1585 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1586 #pragma warning( push )
1587 // disable warning on pointless comparison of unsigned with 0
1588 #pragma warning( disable: 186 )
1589 KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1590 #pragma warning( pop )
1591 // TODO: Should this be acquire or release?
1592 if ( KMP_COMPARE_AND_STORE_ACQ64(
1593 ( volatile kmp_int64 * )&victim->u.p.count,
1594 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1595 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1596 status = 1;
1597 while_index = 0;
1598 // now update own count and ub
1599 #if KMP_ARCH_X86
1600 // stealing executed on non-KMP_ARCH_X86 only
1601 // Atomic 64-bit write on ia32 is
1602 // unavailable, so we do this in steps.
1603 // This code is not tested.
1604 init = vold.p.count;
1605 pr->u.p.ub = 0;
1606 pr->u.p.count = init + 1;
1607 pr->u.p.ub = vnew.p.count;
1608 #else
1609 init = vnew.p.ub;
1610 vold.p.count = init + 1;
1611 // TODO: is it safe and enough?
1612 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1613 #endif // KMP_ARCH_X86
1614 break;
1615 } // if
1616 KMP_CPU_PAUSE();
1617 } // while (1)
1618 } // while
1619 } // if
1620 } // if
1621 if ( !status ) {
1622 *p_lb = 0;
1623 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001624 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001625 } else {
1626 start = pr->u.p.parm2;
1627 init *= chunk;
1628 limit = chunk + init - 1;
1629 incr = pr->u.p.st;
1630
1631 KMP_DEBUG_ASSERT(init <= trip);
1632 if ( (last = (limit >= trip)) != 0 )
1633 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001634 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001635
1636 if ( incr == 1 ) {
1637 *p_lb = start + init;
1638 *p_ub = start + limit;
1639 } else {
1640 *p_lb = start + init * incr;
1641 *p_ub = start + limit * incr;
1642 }
1643
1644 if ( pr->ordered ) {
1645 pr->u.p.ordered_lower = init;
1646 pr->u.p.ordered_upper = limit;
1647 #ifdef KMP_DEBUG
1648 {
1649 const char * buff;
1650 // create format specifiers before the debug output
1651 buff = __kmp_str_format(
1652 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1653 traits_t< UT >::spec, traits_t< UT >::spec );
1654 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1655 __kmp_str_free( &buff );
1656 }
1657 #endif
1658 } // if
1659 } // if
1660 break;
1661 } // case
1662 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1663 case kmp_sch_static_balanced:
1664 {
1665 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1666 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1667 pr->u.p.count = 1;
1668 *p_lb = pr->u.p.lb;
1669 *p_ub = pr->u.p.ub;
1670 last = pr->u.p.parm1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001671 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001672 *p_st = pr->u.p.st;
1673 } else { /* no iterations to do */
1674 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1675 }
1676 if ( pr->ordered ) {
1677 #ifdef KMP_DEBUG
1678 {
1679 const char * buff;
1680 // create format specifiers before the debug output
1681 buff = __kmp_str_format(
1682 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1683 traits_t< UT >::spec, traits_t< UT >::spec );
1684 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1685 __kmp_str_free( &buff );
1686 }
1687 #endif
1688 } // if
1689 } // case
1690 break;
1691 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1692 case kmp_sch_static_chunked:
1693 {
1694 T parm1;
1695
1696 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1697 gtid ) );
1698 parm1 = pr->u.p.parm1;
1699
1700 trip = pr->u.p.tc - 1;
1701 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
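                // Worked example (illustrative values): with nproc = 4 and chunk parm1 = 10,
                // thread 2 first computes init = 10 * (0 + 2) = 20 and takes iterations 20..29;
                // after count += nproc its next call yields 60..69, i.e. chunks are assigned
                // round-robin with a stride of nproc * chunk iterations.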
1702
1703 if ( (status = (init <= trip)) != 0 ) {
1704 start = pr->u.p.lb;
1705 incr = pr->u.p.st;
1706 limit = parm1 + init - 1;
1707
1708 if ( (last = (limit >= trip)) != 0 )
1709 limit = trip;
1710
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001711 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001712
1713 pr->u.p.count += team->t.t_nproc;
1714
1715 if ( incr == 1 ) {
1716 *p_lb = start + init;
1717 *p_ub = start + limit;
1718 }
1719 else {
1720 *p_lb = start + init * incr;
1721 *p_ub = start + limit * incr;
1722 }
1723
1724 if ( pr->ordered ) {
1725 pr->u.p.ordered_lower = init;
1726 pr->u.p.ordered_upper = limit;
1727 #ifdef KMP_DEBUG
1728 {
1729 const char * buff;
1730 // create format specifiers before the debug output
1731 buff = __kmp_str_format(
1732 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1733 traits_t< UT >::spec, traits_t< UT >::spec );
1734 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1735 __kmp_str_free( &buff );
1736 }
1737 #endif
1738 } // if
1739 } // if
1740 } // case
1741 break;
1742
1743 case kmp_sch_dynamic_chunked:
1744 {
1745 T chunk = pr->u.p.parm1;
1746
1747 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1748 gtid ) );
1749
1750 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1751 trip = pr->u.p.tc - 1;
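            // Illustrative note: sh->u.s.iteration is a shared chunk counter, so each
            // fetch-and-increment claims the next chunk; e.g. with chunk = 4, successive
            // claims (by whichever threads get there first) cover 0..3, 4..7, 8..11, ...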
1752
1753 if ( (status = (init <= trip)) == 0 ) {
1754 *p_lb = 0;
1755 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001756 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001757 } else {
1758 start = pr->u.p.lb;
1759 limit = chunk + init - 1;
1760 incr = pr->u.p.st;
1761
1762 if ( (last = (limit >= trip)) != 0 )
1763 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001764
1765 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001766
1767 if ( incr == 1 ) {
1768 *p_lb = start + init;
1769 *p_ub = start + limit;
1770 } else {
1771 *p_lb = start + init * incr;
1772 *p_ub = start + limit * incr;
1773 }
1774
1775 if ( pr->ordered ) {
1776 pr->u.p.ordered_lower = init;
1777 pr->u.p.ordered_upper = limit;
1778 #ifdef KMP_DEBUG
1779 {
1780 const char * buff;
1781 // create format specifiers before the debug output
1782 buff = __kmp_str_format(
1783 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1784 traits_t< UT >::spec, traits_t< UT >::spec );
1785 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1786 __kmp_str_free( &buff );
1787 }
1788 #endif
1789 } // if
1790 } // if
1791 } // case
1792 break;
1793
1794 case kmp_sch_guided_iterative_chunked:
1795 {
1796 T chunkspec = pr->u.p.parm1;
1797 KD_TRACE(100,
1798 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1799 trip = pr->u.p.tc;
1800 // Start atomic part of calculations
1801 while(1) {
1802 ST remaining; // signed, because can be < 0
1803 init = sh->u.s.iteration; // shared value
1804 remaining = trip - init;
1805 if ( remaining <= 0 ) { // AC: need to compare with 0 first
1806 // nothing to do, don't try atomic op
1807 status = 0;
1808 break;
1809 }
1810 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1811                    // use dynamic-style schedule
1812                    // atomically increment iterations, get old value
1813 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1814 remaining = trip - init;
1815 if (remaining <= 0) {
1816 status = 0; // all iterations got by other threads
1817 } else {
1818 // got some iterations to work on
1819 status = 1;
1820 if ( (T)remaining > chunkspec ) {
1821 limit = init + chunkspec - 1;
1822 } else {
1823 last = 1; // the last chunk
1824 limit = init + remaining - 1;
1825 } // if
1826 } // if
1827 break;
1828 } // if
1829 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
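                // Illustrative note (hypothetical values): parm3 stores roughly 1/(K*nproc),
                // so with nproc = 4 and the default K = 2, a thread seeing remaining = 800
                // attempts to claim about 800/8 = 100 iterations via the compare_and_swap below.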
1830 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1831 // CAS was successful, chunk obtained
1832 status = 1;
1833 --limit;
1834 break;
1835 } // if
1836 } // while
1837 if ( status != 0 ) {
1838 start = pr->u.p.lb;
1839 incr = pr->u.p.st;
1840 if ( p_st != NULL )
1841 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001842 *p_lb = start + init * incr;
1843 *p_ub = start + limit * incr;
1844 if ( pr->ordered ) {
1845 pr->u.p.ordered_lower = init;
1846 pr->u.p.ordered_upper = limit;
1847 #ifdef KMP_DEBUG
1848 {
1849 const char * buff;
1850 // create format specifiers before the debug output
1851 buff = __kmp_str_format(
1852 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1853 traits_t< UT >::spec, traits_t< UT >::spec );
1854 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1855 __kmp_str_free( &buff );
1856 }
1857 #endif
1858 } // if
1859 } else {
1860 *p_lb = 0;
1861 *p_ub = 0;
1862 if ( p_st != NULL )
1863 *p_st = 0;
1864 } // if
1865 } // case
1866 break;
1867
1868 case kmp_sch_guided_analytical_chunked:
1869 {
1870 T chunkspec = pr->u.p.parm1;
1871 UT chunkIdx;
1872 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1873 /* for storing original FPCW value for Windows* OS on
1874 IA-32 architecture 8-byte version */
1875 unsigned int oldFpcw;
Jim Cownie181b4bb2013-12-23 17:28:57 +00001876 unsigned int fpcwSet = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001877 #endif
1878 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1879 gtid ) );
1880
1881 trip = pr->u.p.tc;
1882
1883 KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1884 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1885
1886 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1887 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1888 if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1889 --trip;
1890 /* use dynamic-style scheduling */
1891 init = chunkIdx * chunkspec + pr->u.p.count;
1892 /* need to verify init > 0 in case of overflow in the above calculation */
1893 if ( (status = (init > 0 && init <= trip)) != 0 ) {
1894 limit = init + chunkspec -1;
1895
1896 if ( (last = (limit >= trip)) != 0 )
1897 limit = trip;
1898 }
1899 break;
1900 } else {
1901 /* use exponential-style scheduling */
1902                    /* The following check is to work around the lack of long double precision on Windows* OS.
1903 This check works around the possible effect that init != 0 for chunkIdx == 0.
1904 */
1905 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1906 /* If we haven't already done so, save original
1907 FPCW and set precision to 64-bit, as Windows* OS
1908 on IA-32 architecture defaults to 53-bit */
1909 if ( !fpcwSet ) {
Jim Cownie181b4bb2013-12-23 17:28:57 +00001910 oldFpcw = _control87(0,0);
1911 _control87(_PC_64,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001912 fpcwSet = 0x30000;
1913 }
1914 #endif
1915 if ( chunkIdx ) {
1916 init = __kmp_dispatch_guided_remaining< T >(
1917 trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1918 KMP_DEBUG_ASSERT(init);
1919 init = trip - init;
1920 } else
1921 init = 0;
1922 limit = trip - __kmp_dispatch_guided_remaining< T >(
1923 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
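                        // Descriptive note: writing remaining(k) for the iterations still left
                        // when chunk k starts, chunk chunkIdx spans
                        // [trip - remaining(chunkIdx), trip - remaining(chunkIdx+1) - 1];
                        // init is the first endpoint and limit (after --limit below) the second.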
1924 KMP_ASSERT(init <= limit);
1925 if ( init < limit ) {
1926 KMP_DEBUG_ASSERT(limit <= trip);
1927 --limit;
1928 status = 1;
1929 break;
1930 } // if
1931 } // if
1932 } // while (1)
1933 #if KMP_OS_WINDOWS && KMP_ARCH_X86
Jim Cownie181b4bb2013-12-23 17:28:57 +00001934 /* restore FPCW if necessary
1935 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1936 */
1937 if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1938 _control87(oldFpcw,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001939 #endif
1940 if ( status != 0 ) {
1941 start = pr->u.p.lb;
1942 incr = pr->u.p.st;
1943 if ( p_st != NULL )
1944 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001945 *p_lb = start + init * incr;
1946 *p_ub = start + limit * incr;
1947 if ( pr->ordered ) {
1948 pr->u.p.ordered_lower = init;
1949 pr->u.p.ordered_upper = limit;
1950 #ifdef KMP_DEBUG
1951 {
1952 const char * buff;
1953 // create format specifiers before the debug output
1954 buff = __kmp_str_format(
1955 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1956 traits_t< UT >::spec, traits_t< UT >::spec );
1957 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1958 __kmp_str_free( &buff );
1959 }
1960 #endif
1961 }
1962 } else {
1963 *p_lb = 0;
1964 *p_ub = 0;
1965 if ( p_st != NULL )
1966 *p_st = 0;
1967 }
1968 } // case
1969 break;
1970
1971 case kmp_sch_trapezoidal:
1972 {
1973 UT index;
1974 T parm2 = pr->u.p.parm2;
1975 T parm3 = pr->u.p.parm3;
1976 T parm4 = pr->u.p.parm4;
1977 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
1978 gtid ) );
1979
1980 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
1981
1982 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
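            // Illustrative note: chunk sizes form a decreasing arithmetic series starting at
            // parm2 and shrinking by parm4 per chunk, so the start of chunk 'index' is the
            // partial sum index*parm2 - parm4*index*(index-1)/2, which the expression above
            // computes in its folded form.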
1983 trip = pr->u.p.tc - 1;
1984
1985 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
1986 *p_lb = 0;
1987 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001988 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001989 } else {
1990 start = pr->u.p.lb;
1991 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
1992 incr = pr->u.p.st;
1993
1994 if ( (last = (limit >= trip)) != 0 )
1995 limit = trip;
1996
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001997 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001998
1999 if ( incr == 1 ) {
2000 *p_lb = start + init;
2001 *p_ub = start + limit;
2002 } else {
2003 *p_lb = start + init * incr;
2004 *p_ub = start + limit * incr;
2005 }
2006
2007 if ( pr->ordered ) {
2008 pr->u.p.ordered_lower = init;
2009 pr->u.p.ordered_upper = limit;
2010 #ifdef KMP_DEBUG
2011 {
2012 const char * buff;
2013 // create format specifiers before the debug output
2014 buff = __kmp_str_format(
2015 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2016 traits_t< UT >::spec, traits_t< UT >::spec );
2017 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2018 __kmp_str_free( &buff );
2019 }
2020 #endif
2021 } // if
2022 } // if
2023 } // case
2024 break;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002025 default:
2026 {
2027 status = 0; // to avoid complaints on uninitialized variable use
2028 __kmp_msg(
2029 kmp_ms_fatal, // Severity
2030 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2031 KMP_HNT( GetNewerLibrary ), // Hint
2032 __kmp_msg_null // Variadic argument list terminator
2033 );
2034 }
2035 break;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002036 } // switch
2037 } // if tc == 0;
2038
2039 if ( status == 0 ) {
2040 UT num_done;
2041
2042 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2043 #ifdef KMP_DEBUG
2044 {
2045 const char * buff;
2046 // create format specifiers before the debug output
2047 buff = __kmp_str_format(
2048 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2049 traits_t< UT >::spec );
2050 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2051 __kmp_str_free( &buff );
2052 }
2053 #endif
2054
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002055 if ( (ST)num_done == team->t.t_nproc-1 ) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00002056 /* NOTE: release this buffer to be reused */
2057
2058 KMP_MB(); /* Flush all pending memory write invalidates. */
2059
2060 sh->u.s.num_done = 0;
2061 sh->u.s.iteration = 0;
2062
2063 /* TODO replace with general release procedure? */
2064 if ( pr->ordered ) {
2065 sh->u.s.ordered_iteration = 0;
2066 }
2067
2068 KMP_MB(); /* Flush all pending memory write invalidates. */
2069
2070 sh -> buffer_index += KMP_MAX_DISP_BUF;
2071 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2072 gtid, sh->buffer_index) );
2073
2074 KMP_MB(); /* Flush all pending memory write invalidates. */
2075
2076 } // if
2077 if ( __kmp_env_consistency_check ) {
2078 if ( pr->pushed_ws != ct_none ) {
2079 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2080 }
2081 }
2082
2083 th -> th.th_dispatch -> th_deo_fcn = NULL;
2084 th -> th.th_dispatch -> th_dxo_fcn = NULL;
2085 th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2086 th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2087 } // if (status == 0)
2088#if KMP_OS_WINDOWS
2089 else if ( last ) {
2090 pr->u.p.last_upper = pr->u.p.ub;
2091 }
2092#endif /* KMP_OS_WINDOWS */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002093 if ( p_last != NULL && status != 0 )
2094 *p_last = last;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002095 } // if
2096
2097 #ifdef KMP_DEBUG
2098 {
2099 const char * buff;
2100 // create format specifiers before the debug output
2101 buff = __kmp_str_format(
2102 "__kmp_dispatch_next: T#%%d normal case: " \
2103 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2104 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2105 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2106 __kmp_str_free( &buff );
2107 }
2108 #endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002109#if INCLUDE_SSC_MARKS
2110 SSC_MARK_DISPATCH_NEXT();
2111#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +00002112 return status;
2113}
2114
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002115template< typename T >
2116static void
2117__kmp_dist_get_bounds(
2118 ident_t *loc,
2119 kmp_int32 gtid,
2120 kmp_int32 *plastiter,
2121 T *plower,
2122 T *pupper,
2123 typename traits_t< T >::signed_t incr
2124) {
2125 KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
2126 typedef typename traits_t< T >::unsigned_t UT;
2127 typedef typename traits_t< T >::signed_t ST;
2128 register kmp_uint32 team_id;
2129 register kmp_uint32 nteams;
2130 register UT trip_count;
2131 register kmp_team_t *team;
2132 kmp_info_t * th;
2133
2134 KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2135 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2136 #ifdef KMP_DEBUG
2137 {
2138 const char * buff;
2139 // create format specifiers before the debug output
2140 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2141 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2142 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2143 traits_t< T >::spec );
2144 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2145 __kmp_str_free( &buff );
2146 }
2147 #endif
2148
2149 if( __kmp_env_consistency_check ) {
2150 if( incr == 0 ) {
2151 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2152 }
2153 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2154 // The loop is illegal.
2155 // Some zero-trip loops maintained by compiler, e.g.:
2156 // for(i=10;i<0;++i) // lower >= upper - run-time check
2157 // for(i=0;i>10;--i) // lower <= upper - run-time check
2158 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2159 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2160 // Compiler does not check the following illegal loops:
2161 // for(i=0;i<10;i+=incr) // where incr<0
2162 // for(i=10;i>0;i-=incr) // where incr<0
2163 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2164 }
2165 }
2166 th = __kmp_threads[gtid];
2167 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2168 team = th->th.th_team;
2169 #if OMP_40_ENABLED
2170 nteams = th->th.th_teams_size.nteams;
2171 #endif
2172 team_id = team->t.t_master_tid;
2173 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2174
2175 // compute global trip count
2176 if( incr == 1 ) {
2177 trip_count = *pupper - *plower + 1;
2178 } else if(incr == -1) {
2179 trip_count = *plower - *pupper + 1;
2180 } else {
2181 trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2182 }
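    // Worked example (illustrative): for *plower = 0, *pupper = 9, incr = 2 the loop visits
    // 0, 2, 4, 6, 8, and the general branch above gives trip_count = (9 - 0) / 2 + 1 = 5.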
2183 if( trip_count <= nteams ) {
2184 KMP_DEBUG_ASSERT(
2185 __kmp_static == kmp_sch_static_greedy || \
2186 __kmp_static == kmp_sch_static_balanced
2187 ); // Unknown static scheduling type.
2188 // only some teams get single iteration, others get nothing
2189        // only some teams get a single iteration, others get nothing
2190 *pupper = *plower = *plower + team_id * incr;
2191 } else {
2192 *plower = *pupper + incr; // zero-trip loop
2193 }
2194 if( plastiter != NULL )
2195 *plastiter = ( team_id == trip_count - 1 );
2196 } else {
2197 if( __kmp_static == kmp_sch_static_balanced ) {
2198 register UT chunk = trip_count / nteams;
2199 register UT extras = trip_count % nteams;
2200 *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2201 *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
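            // Worked example (illustrative): trip_count = 10, nteams = 4 gives chunk = 2 and
            // extras = 2, so teams 0 and 1 each cover 3 iterations while teams 2 and 3 cover
            // 2 apiece, accounting for all 10 iterations.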
2202 if( plastiter != NULL )
2203 *plastiter = ( team_id == nteams - 1 );
2204 } else {
2205 register T chunk_inc_count =
2206 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2207 register T upper = *pupper;
2208 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2209 // Unknown static scheduling type.
2210 *plower += team_id * chunk_inc_count;
2211 *pupper = *plower + chunk_inc_count - incr;
2212 // Check/correct bounds if needed
2213 if( incr > 0 ) {
2214 if( *pupper < *plower )
2215 *pupper = i_maxmin< T >::mx;
2216 if( plastiter != NULL )
2217 *plastiter = *plower <= upper && *pupper > upper - incr;
2218 if( *pupper > upper )
2219 *pupper = upper; // tracker C73258
2220 } else {
2221 if( *pupper > *plower )
2222 *pupper = i_maxmin< T >::mn;
2223 if( plastiter != NULL )
2224 *plastiter = *plower >= upper && *pupper < upper - incr;
2225 if( *pupper < upper )
2226 *pupper = upper; // tracker C73258
2227 }
2228 }
2229 }
2230}
2231
Jim Cownie5e8470a2013-09-27 10:38:44 +00002232//-----------------------------------------------------------------------------------------
2233// Dispatch routines
2234// Transfer call to template< typename T >
2235// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2236// T lb, T ub, ST st, ST chunk )
2237extern "C" {
2238
2239/*!
2240@ingroup WORK_SHARING
2241@{
2242@param loc Source location
2243@param gtid Global thread id
2244@param schedule Schedule type
2245@param lb Lower bound
2246@param ub Upper bound
2247@param st Step (or increment if you prefer)
2248@param chunk The chunk size to block with
2249
2250This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2251These functions are all identical apart from the types of the arguments.
2252*/
2253
2254void
2255__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2256 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2257{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002258 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002259 KMP_DEBUG_ASSERT( __kmp_init_serial );
2260 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2261}
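/*
   Rough usage sketch (illustrative only, not taken from any compiler's output): how a
   dynamically scheduled loop over 0..n-1 with chunk 4 could be lowered onto these entry
   points; 'loc', 'gtid' and 'n' are assumed to be supplied by the surrounding code.

       kmp_int32 lb, ub, st, last;
       __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, 4 );
       while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
           for ( kmp_int32 i = lb; i <= ub; i += st ) {
               ; // loop body
           }
       }
*/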
2262/*!
2263See @ref __kmpc_dispatch_init_4
2264*/
2265void
2266__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2267 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2268{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002269 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002270 KMP_DEBUG_ASSERT( __kmp_init_serial );
2271 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2272}
2273
2274/*!
2275See @ref __kmpc_dispatch_init_4
2276*/
2277void
2278__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2279 kmp_int64 lb, kmp_int64 ub,
2280 kmp_int64 st, kmp_int64 chunk )
2281{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002282 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002283 KMP_DEBUG_ASSERT( __kmp_init_serial );
2284 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2285}
2286
2287/*!
2288See @ref __kmpc_dispatch_init_4
2289*/
2290void
2291__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2292 kmp_uint64 lb, kmp_uint64 ub,
2293 kmp_int64 st, kmp_int64 chunk )
2294{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002295 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002296 KMP_DEBUG_ASSERT( __kmp_init_serial );
2297 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2298}
2299
2300/*!
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002301See @ref __kmpc_dispatch_init_4
2302
2303These functions differ from the __kmpc_dispatch_init set in that they are
2304called for the composite distribute parallel for construct. Thus, before
2305dispatching the regular iterations, the per-team iteration space must be computed.
2306
2307These functions are all identical apart from the types of the arguments.
2308*/
2309void
2310__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2311 kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2312{
2313 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2314 KMP_DEBUG_ASSERT( __kmp_init_serial );
2315 __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2316 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2317}
2318
2319void
2320__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2321 kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2322{
2323 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2324 KMP_DEBUG_ASSERT( __kmp_init_serial );
2325 __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2326 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2327}
2328
2329void
2330__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2331 kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2332{
2333 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2334 KMP_DEBUG_ASSERT( __kmp_init_serial );
2335 __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2336 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2337}
2338
2339void
2340__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2341 kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2342{
2343 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2344 KMP_DEBUG_ASSERT( __kmp_init_serial );
2345 __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2346 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2347}
2348
2349/*!
Jim Cownie5e8470a2013-09-27 10:38:44 +00002350@param loc Source code location
2351@param gtid Global thread id
2352@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2353@param p_lb Pointer to the lower bound for the next chunk of work
2354@param p_ub Pointer to the upper bound for the next chunk of work
2355@param p_st Pointer to the stride for the next chunk of work
2356@return one if there is work to be done, zero otherwise
2357
2358Get the next dynamically allocated chunk of work for this thread.
2359If there is no more work, then the lb, ub and stride need not be modified.
2360*/
2361int
2362__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2363 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2364{
2365 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2366}
2367
2368/*!
2369See @ref __kmpc_dispatch_next_4
2370*/
2371int
2372__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2373 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2374{
2375 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2376}
2377
2378/*!
2379See @ref __kmpc_dispatch_next_4
2380*/
2381int
2382__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2383 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2384{
2385 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2386}
2387
2388/*!
2389See @ref __kmpc_dispatch_next_4
2390*/
2391int
2392__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2393 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2394{
2395 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2396}
2397
2398/*!
2399@param loc Source code location
2400@param gtid Global thread id
2401
2402Mark the end of a dynamic loop.
2403*/
2404void
2405__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2406{
2407 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2408}
2409
2410/*!
2411See @ref __kmpc_dispatch_fini_4
2412*/
2413void
2414__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2415{
2416 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2417}
2418
2419/*!
2420See @ref __kmpc_dispatch_fini_4
2421*/
2422void
2423__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2424{
2425 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2426}
2427
2428/*!
2429See @ref __kmpc_dispatch_fini_4
2430*/
2431void
2432__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2433{
2434 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2435}
2436/*! @} */
2437
2438//-----------------------------------------------------------------------------------------
2439//Non-template routines from kmp_dispatch.c used in other sources
2440
2441kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2442 return value == checker;
2443}
2444
2445kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2446 return value != checker;
2447}
2448
2449kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2450 return value < checker;
2451}
2452
2453kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2454 return value >= checker;
2455}
2456
2457kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2458 return value <= checker;
2459}
2460kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2461 return value == checker;
2462}
2463
2464kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2465 return value != checker;
2466}
2467
2468kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2469 return value < checker;
2470}
2471
2472kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2473 return value >= checker;
2474}
2475
2476kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2477 return value <= checker;
2478}
2479
2480kmp_uint32
2481__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2482 kmp_uint32 checker,
2483 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2484 , void * obj // Higher-level synchronization object, or NULL.
2485 )
2486{
2487 // note: we may not belong to a team at this point
2488 register volatile kmp_uint32 * spin = spinner;
2489 register kmp_uint32 check = checker;
2490 register kmp_uint32 spins;
2491 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2492 register kmp_uint32 r;
2493
2494 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2495 KMP_INIT_YIELD( spins );
2496 // main wait spin loop
2497 while(!f(r = TCR_4(*spin), check)) {
2498 KMP_FSYNC_SPIN_PREPARE( obj );
2499 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2500 It causes problems with infinite recursion because of exit lock */
2501 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2502 __kmp_abort_thread(); */
2503
Jim Cownie5e8470a2013-09-27 10:38:44 +00002504 /* if we have waited a bit, or are oversubscribed, yield */
2505 /* pause is in the following code */
2506 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2507 KMP_YIELD_SPIN( spins );
2508 }
2509 KMP_FSYNC_SPIN_ACQUIRED( obj );
2510 return r;
2511}
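/*
   Illustrative use (hypothetical caller): spin until a shared 32-bit flag reaches a value,
   using one of the predicate helpers defined just above.

       volatile kmp_uint32 flag = 0;
       // ... another thread eventually stores a nonzero value to 'flag' ...
       __kmp_wait_yield_4( &flag, 1, __kmp_ge_4, NULL );   // returns once flag >= 1
*/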
2512
2513kmp_uint64
2514__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2515 kmp_uint64 checker,
2516 kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2517 , void * obj // Higher-level synchronization object, or NULL.
2518 )
2519{
2520 // note: we may not belong to a team at this point
2521 register volatile kmp_uint64 * spin = spinner;
2522 register kmp_uint64 check = checker;
2523 register kmp_uint32 spins;
2524 register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2525 register kmp_uint64 r;
2526
2527 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2528 KMP_INIT_YIELD( spins );
2529 // main wait spin loop
2530 while(!f(r = *spin, check))
2531 {
2532 KMP_FSYNC_SPIN_PREPARE( obj );
2533 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2534 It causes problems with infinite recursion because of exit lock */
2535 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2536 __kmp_abort_thread(); */
2537
Jim Cownie5e8470a2013-09-27 10:38:44 +00002538 // if we are oversubscribed,
2539        // or have waited a bit (and KMP_LIBRARY=throughput), then yield
2540 // pause is in the following code
2541 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2542 KMP_YIELD_SPIN( spins );
2543 }
2544 KMP_FSYNC_SPIN_ACQUIRED( obj );
2545 return r;
2546}
2547
2548} // extern "C"
2549
2550#ifdef KMP_GOMP_COMPAT
2551
2552void
2553__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2554 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2555 kmp_int32 chunk, int push_ws )
2556{
2557 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2558 push_ws );
2559}
2560
2561void
2562__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2563 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2564 kmp_int32 chunk, int push_ws )
2565{
2566 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2567 push_ws );
2568}
2569
2570void
2571__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2572 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2573 kmp_int64 chunk, int push_ws )
2574{
2575 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2576 push_ws );
2577}
2578
2579void
2580__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2581 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2582 kmp_int64 chunk, int push_ws )
2583{
2584 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2585 push_ws );
2586}
2587
2588void
2589__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2590{
2591 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2592}
2593
2594void
2595__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2596{
2597 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2598}
2599
2600void
2601__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2602{
2603 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2604}
2605
2606void
2607__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2608{
2609 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2610}
2611
2612#endif /* KMP_GOMP_COMPAT */
2613
2614/* ------------------------------------------------------------------------ */
2615/* ------------------------------------------------------------------------ */
2616