1/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3 */
4
5
6//===----------------------------------------------------------------------===//
7//
8// The LLVM Compiler Infrastructure
9//
10// This file is dual licensed under the MIT and the University of Illinois Open
11// Source Licenses. See LICENSE.txt for details.
12//
13//===----------------------------------------------------------------------===//
14
15
16/*
17 * Dynamic scheduling initialization and dispatch.
18 *
19 * NOTE: __kmp_nth is a constant inside any dispatch loop; however,
20 * it may change between parallel regions. __kmp_max_nth
21 * is the largest value __kmp_nth may take, 1 is the smallest.
22 *
23 */
24
25/* ------------------------------------------------------------------------ */
26/* ------------------------------------------------------------------------ */
27
28#include "kmp.h"
29#include "kmp_i18n.h"
30#include "kmp_itt.h"
31#include "kmp_str.h"
32#include "kmp_error.h"
33#include "kmp_stats.h"
34#if KMP_OS_WINDOWS && KMP_ARCH_X86
35 #include <float.h>
36#endif
37
38/* ------------------------------------------------------------------------ */
39/* ------------------------------------------------------------------------ */
40
41// template for type limits
42template< typename T >
43struct i_maxmin {
44 static const T mx;
45 static const T mn;
46};
47template<>
48struct i_maxmin< int > {
49 static const int mx = 0x7fffffff;
50 static const int mn = 0x80000000;
51};
52template<>
53struct i_maxmin< unsigned int > {
54 static const unsigned int mx = 0xffffffff;
55 static const unsigned int mn = 0x00000000;
56};
57template<>
58struct i_maxmin< long long > {
59 static const long long mx = 0x7fffffffffffffffLL;
60 static const long long mn = 0x8000000000000000LL;
61};
62template<>
63struct i_maxmin< unsigned long long > {
64 static const unsigned long long mx = 0xffffffffffffffffLL;
65 static const unsigned long long mn = 0x0000000000000000LL;
66};
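// Illustrative reading of the specializations above: i_maxmin< int >::mx is INT_MAX (0x7fffffff)
// and i_maxmin< int >::mn is INT_MIN (0x80000000); the 64-bit specializations follow the same pattern.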
67//-------------------------------------------------------------------------
68
69#ifdef KMP_STATIC_STEAL_ENABLED
70
71 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
72 template< typename T >
73 struct dispatch_private_infoXX_template {
74 typedef typename traits_t< T >::unsigned_t UT;
75 typedef typename traits_t< T >::signed_t ST;
76 UT count; // unsigned
77 T ub;
78 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
79 T lb;
80 ST st; // signed
81 UT tc; // unsigned
82 T static_steal_counter; // for static_steal only; maybe better to put after ub
83
84 /* parm[1-4] are used in different ways by different scheduling algorithms */
85
86 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
87 // a) parm3 is properly aligned and
88 // b) all parm1-4 are in the same cache line.
89 // Because parm1-4 are used together, performance seems to be better
90 // if they are in the same cache line (not measured, though).
91
92 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
93 T parm1;
94 T parm2;
95 T parm3;
96 T parm4;
97 };
98
99 UT ordered_lower; // unsigned
100 UT ordered_upper; // unsigned
101 #if KMP_OS_WINDOWS
102 T last_upper;
103 #endif /* KMP_OS_WINDOWS */
104 };
105
106#else /* KMP_STATIC_STEAL_ENABLED */
107
108 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
109 template< typename T >
110 struct dispatch_private_infoXX_template {
111 typedef typename traits_t< T >::unsigned_t UT;
112 typedef typename traits_t< T >::signed_t ST;
113 T lb;
114 T ub;
115 ST st; // signed
116 UT tc; // unsigned
117
118 T parm1;
119 T parm2;
120 T parm3;
121 T parm4;
122
123 UT count; // unsigned
124
125 UT ordered_lower; // unsigned
126 UT ordered_upper; // unsigned
127 #if KMP_OS_WINDOWS
128 T last_upper;
129 #endif /* KMP_OS_WINDOWS */
130 };
131
132#endif /* KMP_STATIC_STEAL_ENABLED */
133
134// replaces dispatch_private_info structure and dispatch_private_info_t type
135template< typename T >
136struct KMP_ALIGN_CACHE dispatch_private_info_template {
137 // duplicate alignment here, otherwise size of structure is not correct in our compiler
138 union KMP_ALIGN_CACHE private_info_tmpl {
139 dispatch_private_infoXX_template< T > p;
140 dispatch_private_info64_t p64;
141 } u;
142 enum sched_type schedule; /* scheduling algorithm */
143 kmp_uint32 ordered; /* ordered clause specified */
144 kmp_uint32 ordered_bumped;
145 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
146 dispatch_private_info * next; /* stack of buffers for nest of serial regions */
147 kmp_uint32 nomerge; /* don't merge iters if serialized */
148 kmp_uint32 type_size;
149 enum cons_type pushed_ws;
150};
151
152
153// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
154template< typename UT >
155struct dispatch_shared_infoXX_template {
156 /* chunk index under dynamic, number of idle threads under static-steal;
157 iteration index otherwise */
158 volatile UT iteration;
159 volatile UT num_done;
160 volatile UT ordered_iteration;
161 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
162};
163
164// replaces dispatch_shared_info structure and dispatch_shared_info_t type
165template< typename UT >
166struct dispatch_shared_info_template {
167 // we need union here to keep the structure size
168 union shared_info_tmpl {
169 dispatch_shared_infoXX_template< UT > s;
170 dispatch_shared_info64_t s64;
171 } u;
172 volatile kmp_uint32 buffer_index;
173};
174
175/* ------------------------------------------------------------------------ */
176/* ------------------------------------------------------------------------ */
177
178#undef USE_TEST_LOCKS
179
180// test_then_add template (general template should NOT be used)
181template< typename T >
182static __forceinline T
183test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
184
185template<>
186__forceinline kmp_int32
187test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
188{
189 kmp_int32 r;
190 r = KMP_TEST_THEN_ADD32( p, d );
191 return r;
192}
193
194template<>
195__forceinline kmp_int64
196test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
197{
198 kmp_int64 r;
199 r = KMP_TEST_THEN_ADD64( p, d );
200 return r;
201}
202
203// test_then_inc_acq template (general template should NOT be used)
204template< typename T >
205static __forceinline T
206test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
207
208template<>
209__forceinline kmp_int32
210test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
211{
212 kmp_int32 r;
213 r = KMP_TEST_THEN_INC_ACQ32( p );
214 return r;
215}
216
217template<>
218__forceinline kmp_int64
219test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
220{
221 kmp_int64 r;
222 r = KMP_TEST_THEN_INC_ACQ64( p );
223 return r;
224}
225
226// test_then_inc template (general template should NOT be used)
227template< typename T >
228static __forceinline T
229test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
230
231template<>
232__forceinline kmp_int32
233test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
234{
235 kmp_int32 r;
236 r = KMP_TEST_THEN_INC32( p );
237 return r;
238}
239
240template<>
241__forceinline kmp_int64
242test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
243{
244 kmp_int64 r;
245 r = KMP_TEST_THEN_INC64( p );
246 return r;
247}
248
249// compare_and_swap template (general template should NOT be used)
250template< typename T >
251static __forceinline kmp_int32
252compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
253
254template<>
255__forceinline kmp_int32
256compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
257{
258 return KMP_COMPARE_AND_STORE_REL32( p, c, s );
259}
260
261template<>
262__forceinline kmp_int32
263compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
264{
265 return KMP_COMPARE_AND_STORE_REL64( p, c, s );
266}
267
268/*
269 Spin wait loop that first does pause, then yield.
270 Waits until function returns non-zero when called with *spinner and check.
271 Does NOT put threads to sleep.
272#if USE_ITT_BUILD
273 Arguments:
274 obj -- is a higher-level synchronization object to report to ittnotify. It is used to report
275 locks consistently. For example, if a lock is acquired immediately, its address is
276 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
277 immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
278 address, not the address of the low-level spinner.
279#endif // USE_ITT_BUILD
280*/
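// Usage sketch (mirrors the call in __kmp_dispatch_deo below): a thread waiting for its turn in an
// ordered section calls
//     __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
//                             USE_ITT_BUILD_ARG( NULL ) );
// which spins (pause, then yield) until sh->u.s.ordered_iteration >= lower, then returns the last
// value it observed; the waiting thread is never put to sleep.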
281template< typename UT >
282// ToDo: make inline function (move to header file for icl)
283static UT // unsigned 4- or 8-byte type
284__kmp_wait_yield( volatile UT * spinner,
285 UT checker,
286 kmp_uint32 (* pred)( UT, UT )
287 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
288 )
289{
290 // note: we may not belong to a team at this point
291 register volatile UT * spin = spinner;
292 register UT check = checker;
293 register kmp_uint32 spins;
294 register kmp_uint32 (*f) ( UT, UT ) = pred;
295 register UT r;
296
297 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
298 KMP_INIT_YIELD( spins );
299 // main wait spin loop
300 while(!f(r = *spin, check))
301 {
302 KMP_FSYNC_SPIN_PREPARE( obj );
303 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
304 It causes problems with infinite recursion because of exit lock */
305 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
306 __kmp_abort_thread(); */
307
308 // if we are oversubscribed,
309 // or have waited a bit (and KMP_LIBRARY=throughput), then yield;
310 // pause is in the following code
311 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
312 KMP_YIELD_SPIN( spins );
313 }
314 KMP_FSYNC_SPIN_ACQUIRED( obj );
315 return r;
316}
317
318template< typename UT >
319static kmp_uint32 __kmp_eq( UT value, UT checker) {
320 return value == checker;
321}
322
323template< typename UT >
324static kmp_uint32 __kmp_neq( UT value, UT checker) {
325 return value != checker;
326}
327
328template< typename UT >
329static kmp_uint32 __kmp_lt( UT value, UT checker) {
330 return value < checker;
331}
332
333template< typename UT >
334static kmp_uint32 __kmp_ge( UT value, UT checker) {
335 return value >= checker;
336}
337
338template< typename UT >
339static kmp_uint32 __kmp_le( UT value, UT checker) {
340 return value <= checker;
341}
342
343
344/* ------------------------------------------------------------------------ */
345/* ------------------------------------------------------------------------ */
346
347static void
348__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
349{
350 kmp_info_t *th;
351
352 KMP_DEBUG_ASSERT( gtid_ref );
353
354 if ( __kmp_env_consistency_check ) {
355 th = __kmp_threads[*gtid_ref];
356 if ( th -> th.th_root -> r.r_active
357 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
358#if KMP_USE_DYNAMIC_LOCK
359 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
360#else
361 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
362#endif
363 }
364 }
365}
366
367template< typename UT >
368static void
369__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
370{
371 typedef typename traits_t< UT >::signed_t ST;
372 dispatch_private_info_template< UT > * pr;
373
374 int gtid = *gtid_ref;
375// int cid = *cid_ref;
376 kmp_info_t *th = __kmp_threads[ gtid ];
377 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
378
379 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
380 if ( __kmp_env_consistency_check ) {
381 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
382 ( th -> th.th_dispatch -> th_dispatch_pr_current );
383 if ( pr -> pushed_ws != ct_none ) {
384#if KMP_USE_DYNAMIC_LOCK
385 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
386#else
387 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
388#endif
389 }
390 }
391
392 if ( ! th -> th.th_team -> t.t_serialized ) {
393 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
394 ( th -> th.th_dispatch -> th_dispatch_sh_current );
395 UT lower;
396
397 if ( ! __kmp_env_consistency_check ) {
398 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
399 ( th -> th.th_dispatch -> th_dispatch_pr_current );
400 }
401 lower = pr->u.p.ordered_lower;
402
403 #if ! defined( KMP_GOMP_COMPAT )
404 if ( __kmp_env_consistency_check ) {
405 if ( pr->ordered_bumped ) {
406 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
407 __kmp_error_construct2(
408 kmp_i18n_msg_CnsMultipleNesting,
409 ct_ordered_in_pdo, loc_ref,
410 & p->stack_data[ p->w_top ]
411 );
412 }
413 }
414 #endif /* !defined(KMP_GOMP_COMPAT) */
415
416 KMP_MB();
417 #ifdef KMP_DEBUG
418 {
419 const char * buff;
420 // create format specifiers before the debug output
421 buff = __kmp_str_format(
422 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
423 traits_t< UT >::spec, traits_t< UT >::spec );
424 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
425 __kmp_str_free( &buff );
426 }
427 #endif
428
429 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
430 USE_ITT_BUILD_ARG( NULL )
431 );
432 KMP_MB(); /* is this necessary? */
433 #ifdef KMP_DEBUG
434 {
435 const char * buff;
436 // create format specifiers before the debug output
437 buff = __kmp_str_format(
438 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
439 traits_t< UT >::spec, traits_t< UT >::spec );
440 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
441 __kmp_str_free( &buff );
442 }
443 #endif
444 }
445 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
446}
447
448static void
449__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
450{
451 kmp_info_t *th;
452
453 if ( __kmp_env_consistency_check ) {
454 th = __kmp_threads[*gtid_ref];
455 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
456 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
457 }
458 }
459}
460
461template< typename UT >
462static void
463__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
464{
465 typedef typename traits_t< UT >::signed_t ST;
466 dispatch_private_info_template< UT > * pr;
467
468 int gtid = *gtid_ref;
469// int cid = *cid_ref;
470 kmp_info_t *th = __kmp_threads[ gtid ];
471 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
472
473 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
474 if ( __kmp_env_consistency_check ) {
475 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
476 ( th -> th.th_dispatch -> th_dispatch_pr_current );
477 if ( pr -> pushed_ws != ct_none ) {
478 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
479 }
480 }
481
482 if ( ! th -> th.th_team -> t.t_serialized ) {
483 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
484 ( th -> th.th_dispatch -> th_dispatch_sh_current );
485
486 if ( ! __kmp_env_consistency_check ) {
487 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
488 ( th -> th.th_dispatch -> th_dispatch_pr_current );
489 }
490
491 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
492 #if ! defined( KMP_GOMP_COMPAT )
493 if ( __kmp_env_consistency_check ) {
494 if ( pr->ordered_bumped != 0 ) {
495 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
496 /* How to test it? - OM */
497 __kmp_error_construct2(
498 kmp_i18n_msg_CnsMultipleNesting,
499 ct_ordered_in_pdo, loc_ref,
500 & p->stack_data[ p->w_top ]
501 );
502 }
503 }
504 #endif /* !defined(KMP_GOMP_COMPAT) */
505
506 KMP_MB(); /* Flush all pending memory write invalidates. */
507
508 pr->ordered_bumped += 1;
509
510 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
511 gtid, pr->ordered_bumped ) );
512
513 KMP_MB(); /* Flush all pending memory write invalidates. */
514
515 /* TODO use general release procedure? */
516 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
517
518 KMP_MB(); /* Flush all pending memory write invalidates. */
519 }
520 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
521}
522
523/* Computes and returns x to the power of y, where y must be a non-negative integer */
524template< typename UT >
525static __forceinline long double
526__kmp_pow(long double x, UT y) {
527 long double s=1.0L;
528
529 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
530 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
531 while(y) {
532 if ( y & 1 )
533 s *= x;
534 x *= x;
535 y >>= 1;
536 }
537 return s;
538}
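// Worked example (illustrative values only): __kmp_pow< kmp_uint32 >(0.5, 5) runs the
// square-and-multiply loop as y=5: s=0.5, x=0.25; y=2: x=0.0625; y=1: s=0.5*0.0625=0.03125,
// i.e. it returns 0.03125 == 0.5^5.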
539
540/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
541 (the total number of unassigned iterations in chunks with index greater than or equal to idx).
542 __forceinline seems to be broken here: if we __forceinline this function, the behavior is wrong
543 (one of the unit tests, sch_guided_analytical_basic.cpp, fails).
544*/
545template< typename T >
546static __inline typename traits_t< T >::unsigned_t
547__kmp_dispatch_guided_remaining(
548 T tc,
549 typename traits_t< T >::floating_t base,
550 typename traits_t< T >::unsigned_t idx
551) {
552 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
553 least for ICL 8.1, long double arithmetic may not really have
554 long double precision, even with /Qlong_double. Currently, we
555 workaround that in the caller code, by manipulating the FPCW for
556 Windows* OS on IA-32 architecture. The lack of precision is not
557 expected to be a correctness issue, though.
558 */
559 typedef typename traits_t< T >::unsigned_t UT;
560
561 long double x = tc * __kmp_pow< UT >(base, idx);
562 UT r = (UT) x;
563 if ( x == r )
564 return r;
565 return r + 1;
566}
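// Worked example (hypothetical values): for a guided loop with tc = 1000 iterations, nproc = 4
// (so base = 1 - 0.5/nproc = 0.875) and idx = 8 chunks already handed out,
// __kmp_dispatch_guided_remaining returns ceil(1000 * 0.875^8) = ceil(343.6) = 344 iterations left.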
567
568// Parameters of the guided-iterative algorithm:
569// p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
570// p3 = 1 / ( n * nproc ) // remaining iterations multiplier
571// By default n = 2. For example, with n = 3 the chunk distribution will be flatter.
572// With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
573static int guided_int_param = 2;
574static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
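// Worked example (hypothetical values): with the default n = 2, nproc = 4 and chunk = 7,
// p2 = 2 * 4 * (7 + 1) = 64 is the remaining-iteration count at which the schedule switches to
// dynamic, and p3 = 0.5 / 4 = 0.125 scales the remaining iterations into the next chunk size.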
575
576// UT - unsigned flavor of T, ST - signed flavor of T,
577// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
578template< typename T >
579static void
580__kmp_dispatch_init(
581 ident_t * loc,
582 int gtid,
583 enum sched_type schedule,
584 T lb,
585 T ub,
586 typename traits_t< T >::signed_t st,
587 typename traits_t< T >::signed_t chunk,
588 int push_ws
589) {
590 typedef typename traits_t< T >::unsigned_t UT;
591 typedef typename traits_t< T >::signed_t ST;
592 typedef typename traits_t< T >::floating_t DBL;
593 static const int ___kmp_size_type = sizeof( UT );
594
595 int active;
596 T tc;
597 kmp_info_t * th;
598 kmp_team_t * team;
599 kmp_uint32 my_buffer_index;
600 dispatch_private_info_template< T > * pr;
601 dispatch_shared_info_template< UT > volatile * sh;
602
603 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
604 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
605
606 if ( ! TCR_4( __kmp_init_parallel ) )
607 __kmp_parallel_initialize();
608
609#if INCLUDE_SSC_MARKS
610 SSC_MARK_DISPATCH_INIT();
611#endif
612 #ifdef KMP_DEBUG
613 {
614 const char * buff;
615 // create format specifiers before the debug output
616 buff = __kmp_str_format(
617 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
618 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
619 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
620 __kmp_str_free( &buff );
621 }
622 #endif
623 /* setup data */
624 th = __kmp_threads[ gtid ];
625 team = th -> th.th_team;
626 active = ! team -> t.t_serialized;
627 th->th.th_ident = loc;
628
629#if USE_ITT_BUILD
630 kmp_uint64 cur_chunk = chunk;
631#endif
632 if ( ! active ) {
633 pr = reinterpret_cast< dispatch_private_info_template< T >* >
634 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
635 } else {
636 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
637 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
638
639 my_buffer_index = th->th.th_dispatch->th_disp_index ++;
640
641 /* What happens when number of threads changes, need to resize buffer? */
642 pr = reinterpret_cast< dispatch_private_info_template< T > * >
643 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
644 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
645 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
646 }
647
648 /* Pick up the nomerge/ordered bits from the scheduling type */
649 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
650 pr->nomerge = TRUE;
651 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
652 } else {
653 pr->nomerge = FALSE;
654 }
655 pr->type_size = ___kmp_size_type; // remember the size of variables
656 if ( kmp_ord_lower & schedule ) {
657 pr->ordered = TRUE;
658 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
659 } else {
660 pr->ordered = FALSE;
661 }
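// Illustrative note on the decoding above: a nomerge schedule kind in [kmp_nm_lower, kmp_nm_upper)
// is shifted down by (kmp_nm_lower - kmp_sch_lower), and an ordered kind is shifted down by
// (kmp_ord_lower - kmp_sch_lower), so that 'schedule' ends up holding the plain kmp_sch_* value.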
662 if ( schedule == kmp_sch_static ) {
663 schedule = __kmp_static;
664 } else {
665 if ( schedule == kmp_sch_runtime ) {
666 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
667 schedule = team -> t.t_sched.r_sched_type;
668 // Detail the schedule if needed (global controls are differentiated appropriately)
669 if ( schedule == kmp_sch_guided_chunked ) {
670 schedule = __kmp_guided;
671 } else if ( schedule == kmp_sch_static ) {
672 schedule = __kmp_static;
673 }
674 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
675 chunk = team -> t.t_sched.chunk;
676
677 #ifdef KMP_DEBUG
678 {
679 const char * buff;
680 // create format specifiers before the debug output
681 buff = __kmp_str_format(
682 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
683 traits_t< ST >::spec );
684 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
685 __kmp_str_free( &buff );
686 }
687 #endif
688 } else {
689 if ( schedule == kmp_sch_guided_chunked ) {
690 schedule = __kmp_guided;
691 }
692 if ( chunk <= 0 ) {
693 chunk = KMP_DEFAULT_CHUNK;
694 }
695 }
696
697 if ( schedule == kmp_sch_auto ) {
698 // mapping and differentiation: in the __kmp_do_serial_initialize()
699 schedule = __kmp_auto;
700 #ifdef KMP_DEBUG
701 {
702 const char * buff;
703 // create format specifiers before the debug output
704 buff = __kmp_str_format(
705 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
706 traits_t< ST >::spec );
707 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
708 __kmp_str_free( &buff );
709 }
710 #endif
711 }
712
713 /* guided analytical not safe for too many threads */
714 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
715 schedule = kmp_sch_guided_iterative_chunked;
716 KMP_WARNING( DispatchManyThreads );
717 }
718 pr->u.p.parm1 = chunk;
719 }
720 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
721 "unknown scheduling type" );
722
723 pr->u.p.count = 0;
724
725 if ( __kmp_env_consistency_check ) {
726 if ( st == 0 ) {
727 __kmp_error_construct(
728 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
729 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
730 );
731 }
732 }
733
734 tc = ( ub - lb + st );
735 if ( st != 1 ) {
736 if ( st < 0 ) {
737 if ( lb < ub ) {
738 tc = 0; // zero-trip
739 } else { // lb >= ub
740 tc = (ST)tc / st; // convert to signed division
741 }
742 } else { // st > 0
743 if ( ub < lb ) {
744 tc = 0; // zero-trip
745 } else { // lb >= ub
746 tc /= st;
747 }
748 }
749 } else if ( ub < lb ) { // st == 1
750 tc = 0; // zero-trip
751 }
752
753 pr->u.p.lb = lb;
754 pr->u.p.ub = ub;
755 pr->u.p.st = st;
756 pr->u.p.tc = tc;
757
758 #if KMP_OS_WINDOWS
759 pr->u.p.last_upper = ub + st;
760 #endif /* KMP_OS_WINDOWS */
761
762 /* NOTE: only the active parallel region(s) have active ordered sections */
763
764 if ( active ) {
765 if ( pr->ordered == 0 ) {
766 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
767 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
768 } else {
769 pr->ordered_bumped = 0;
770
771 pr->u.p.ordered_lower = 1;
772 pr->u.p.ordered_upper = 0;
773
774 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
775 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
776 }
777 }
778
779 if ( __kmp_env_consistency_check ) {
780 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
781 if ( push_ws ) {
782 __kmp_push_workshare( gtid, ws, loc );
783 pr->pushed_ws = ws;
784 } else {
785 __kmp_check_workshare( gtid, ws, loc );
786 pr->pushed_ws = ct_none;
787 }
788 }
789
790 switch ( schedule ) {
791 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
792 case kmp_sch_static_steal:
793 {
794 T nproc = team->t.t_nproc;
795 T ntc, init;
796
797 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
798
799 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
800 if ( nproc > 1 && ntc >= nproc ) {
801 T id = __kmp_tid_from_gtid(gtid);
802 T small_chunk, extras;
803
804 small_chunk = ntc / nproc;
805 extras = ntc % nproc;
806
807 init = id * small_chunk + ( id < extras ? id : extras );
808 pr->u.p.count = init;
809 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
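// Worked example (hypothetical values): ntc = 10 chunks and nproc = 4 threads give
// small_chunk = 2 and extras = 2, so thread ids 0..3 start out owning the chunk
// ranges [0,3), [3,6), [6,8) and [8,10) via count = init and ub computed above.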
810
811 pr->u.p.parm2 = lb;
812 //pr->pfields.parm3 = 0; // it's not used in static_steal
813 pr->u.p.parm4 = id;
814 pr->u.p.st = st;
815 break;
816 } else {
817 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
818 gtid ) );
819 schedule = kmp_sch_static_balanced;
820 /* too few iterations: fall-through to kmp_sch_static_balanced */
821 } // if
822 /* FALL-THROUGH to static balanced */
823 } // case
824 #endif
825 case kmp_sch_static_balanced:
826 {
827 T nproc = team->t.t_nproc;
828 T init, limit;
829
830 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
831 gtid ) );
832
833 if ( nproc > 1 ) {
834 T id = __kmp_tid_from_gtid(gtid);
835
836 if ( tc < nproc ) {
837 if ( id < tc ) {
838 init = id;
839 limit = id;
840 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
841 } else {
842 pr->u.p.count = 1; /* means no more chunks to execute */
843 pr->u.p.parm1 = FALSE;
844 break;
845 }
846 } else {
847 T small_chunk = tc / nproc;
848 T extras = tc % nproc;
849 init = id * small_chunk + (id < extras ? id : extras);
850 limit = init + small_chunk - (id < extras ? 0 : 1);
851 pr->u.p.parm1 = (id == nproc - 1);
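// Worked example (hypothetical values): tc = 10 iterations and nproc = 4 threads give
// small_chunk = 2 and extras = 2, so threads 0..3 get [init,limit] = [0,2], [3,5], [6,7] and [8,9]
// (3, 3, 2 and 2 iterations), and only thread nproc-1 = 3 sets parm1 (the *plastiter flag).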
852 }
853 } else {
854 if ( tc > 0 ) {
855 init = 0;
856 limit = tc - 1;
857 pr->u.p.parm1 = TRUE;
858 } else {
859 // zero trip count
860 pr->u.p.count = 1; /* means no more chunks to execute */
861 pr->u.p.parm1 = FALSE;
862 break;
863 }
864 }
865#if USE_ITT_BUILD
866 // Calculate chunk for metadata report
867 if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) {
868 cur_chunk = limit - init + 1;
869 }
870#endif
871 if ( st == 1 ) {
872 pr->u.p.lb = lb + init;
873 pr->u.p.ub = lb + limit;
874 } else {
875 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
876 pr->u.p.lb = lb + init * st;
877 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
878 if ( st > 0 ) {
879 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
880 } else {
881 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
882 }
883 }
884 if ( pr->ordered ) {
885 pr->u.p.ordered_lower = init;
886 pr->u.p.ordered_upper = limit;
887 }
888 break;
889 } // case
890 case kmp_sch_guided_iterative_chunked :
891 {
892 T nproc = team->t.t_nproc;
893 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
894
895 if ( nproc > 1 ) {
896 if ( (2L * chunk + 1 ) * nproc >= tc ) {
897 /* chunk size too large, switch to dynamic */
898 schedule = kmp_sch_dynamic_chunked;
899 } else {
900 // when remaining iters become less than parm2 - switch to dynamic
901 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
902 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
903 }
904 } else {
905 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
906 schedule = kmp_sch_static_greedy;
907 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
908 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
909 pr->u.p.parm1 = tc;
910 } // if
911 } // case
912 break;
913 case kmp_sch_guided_analytical_chunked:
914 {
915 T nproc = team->t.t_nproc;
916 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
917
918 if ( nproc > 1 ) {
919 if ( (2L * chunk + 1 ) * nproc >= tc ) {
920 /* chunk size too large, switch to dynamic */
921 schedule = kmp_sch_dynamic_chunked;
922 } else {
923 /* commonly used term: (2 nproc - 1)/(2 nproc) */
924 DBL x;
925
926 #if KMP_OS_WINDOWS && KMP_ARCH_X86
927 /* Linux* OS already has 64-bit computation by default for
928 long double, and on Windows* OS on Intel(R) 64,
929 /Qlong_double doesn't work. On Windows* OS
930 on IA-32 architecture, we need to set precision to
931 64-bit instead of the default 53-bit. Even though long
932 double doesn't work on Windows* OS on Intel(R) 64, the
933 resulting lack of precision is not expected to impact
934 the correctness of the algorithm, but this has not been
935 mathematically proven.
936 */
937 // save original FPCW and set precision to 64-bit, as
938 // Windows* OS on IA-32 architecture defaults to 53-bit
939 unsigned int oldFpcw = _control87(0,0);
940 _control87(_PC_64,_MCW_PC); // 0,0x30000
941 #endif
942 /* value used for comparison in solver for cross-over point */
943 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
944
945 /* crossover point--chunk indexes equal to or greater than
946 this point switch to dynamic-style scheduling */
947 UT cross;
948
949 /* commonly used term: (2 nproc - 1)/(2 nproc) */
950 x = (long double)1.0 - (long double)0.5 / nproc;
951
952 #ifdef KMP_DEBUG
953 { // test natural alignment
954 struct _test_a {
955 char a;
956 union {
957 char b;
958 DBL d;
959 };
960 } t;
961 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
962 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
963 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
964 }
965 #endif // KMP_DEBUG
966
967 /* save the term in thread private dispatch structure */
968 *(DBL*)&pr->u.p.parm3 = x;
969
970 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
971 {
972 UT left, right, mid;
973 long double p;
974
975 /* estimate initial upper and lower bound */
976
977 /* doesn't matter what value right is as long as it is positive, but
978 it affects performance of the solver
979 */
980 right = 229;
981 p = __kmp_pow< UT >(x,right);
982 if ( p > target ) {
983 do{
984 p *= p;
985 right <<= 1;
986 } while(p>target && right < (1<<27));
987 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
988 } else {
989 left = 0;
990 }
991
992 /* bisection root-finding method */
993 while ( left + 1 < right ) {
994 mid = (left + right) / 2;
995 if ( __kmp_pow< UT >(x,mid) > target ) {
996 left = mid;
997 } else {
998 right = mid;
999 }
1000 } // while
1001 cross = right;
1002 }
1003 /* assert sanity of computed crossover point */
1004 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
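/* Worked example (hypothetical values): nproc = 4 and chunk = 10 on tc = 1000 iterations give
   x = 0.875 and target = (2*10 + 1) * 4 / 1000 = 0.084; the bisection above finds cross = 19,
   the smallest index with 0.875^cross <= target (0.875^18 ~ 0.090 > 0.084, 0.875^19 ~ 0.079). */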
1005
1006 /* save the crossover point in thread private dispatch structure */
1007 pr->u.p.parm2 = cross;
1008
1009 // C75803
1010 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1011 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1012 #else
1013 #define GUIDED_ANALYTICAL_WORKAROUND (x)
1014 #endif
1015 /* dynamic-style scheduling offset */
1016 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1017 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1018 // restore FPCW
1019 _control87(oldFpcw,_MCW_PC);
1020 #endif
1021 } // if
1022 } else {
1023 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1024 gtid ) );
1025 schedule = kmp_sch_static_greedy;
1026 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1027 pr->u.p.parm1 = tc;
1028 } // if
1029 } // case
1030 break;
1031 case kmp_sch_static_greedy:
1032 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1033 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1034 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1035 tc;
1036 break;
1037 case kmp_sch_static_chunked :
1038 case kmp_sch_dynamic_chunked :
1039 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1040 break;
1041 case kmp_sch_trapezoidal :
1042 {
1043 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
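/* Worked example of the computation below (hypothetical values): tc = 1000, nproc = 4 and
   chunk = 10 give parm2 (first cycle) = 1000/8 = 125, parm1 (last cycle) = 10,
   parm3 (number of cycles) = (2000 + 134)/135 = 15 and parm4 (decrement) = (125 - 10)/14 = 8,
   so successive chunk sizes are roughly 125, 117, 109, ... down to about 13. */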
1044
1045 T parm1, parm2, parm3, parm4;
1046 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1047
1048 parm1 = chunk;
1049
1050 /* F : size of the first cycle */
1051 parm2 = ( tc / (2 * team->t.t_nproc) );
1052
1053 if ( parm2 < 1 ) {
1054 parm2 = 1;
1055 }
1056
1057 /* L : size of the last cycle. Make sure the last cycle
1058 * is not larger than the first cycle.
1059 */
1060 if ( parm1 < 1 ) {
1061 parm1 = 1;
1062 } else if ( parm1 > parm2 ) {
1063 parm1 = parm2;
1064 }
1065
1066 /* N : number of cycles */
1067 parm3 = ( parm2 + parm1 );
1068 parm3 = ( 2 * tc + parm3 - 1) / parm3;
1069
1070 if ( parm3 < 2 ) {
1071 parm3 = 2;
1072 }
1073
1074 /* sigma : decreasing incr of the trapezoid */
1075 parm4 = ( parm3 - 1 );
1076 parm4 = ( parm2 - parm1 ) / parm4;
1077
1078 // pointless check, because parm4 >= 0 always
1079 //if ( parm4 < 0 ) {
1080 // parm4 = 0;
1081 //}
1082
1083 pr->u.p.parm1 = parm1;
1084 pr->u.p.parm2 = parm2;
1085 pr->u.p.parm3 = parm3;
1086 pr->u.p.parm4 = parm4;
1087 } // case
1088 break;
1089
1090 default:
1091 {
1092 __kmp_msg(
1093 kmp_ms_fatal, // Severity
1094 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1095 KMP_HNT( GetNewerLibrary ), // Hint
1096 __kmp_msg_null // Variadic argument list terminator
1097 );
1098 }
1099 break;
1100 } // switch
1101 pr->schedule = schedule;
1102 if ( active ) {
1103 /* This buffer is free to use once the shared buffer_index reaches my_buffer_index */
1104
1105 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1106 gtid, my_buffer_index, sh->buffer_index) );
1107 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1108 USE_ITT_BUILD_ARG( NULL )
1109 );
1110 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1111 // *always* 32-bit integers.
1112 KMP_MB(); /* is this necessary? */
1113 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1114 gtid, my_buffer_index, sh->buffer_index) );
1115
1116 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1117 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1118#if USE_ITT_BUILD
1119 if ( pr->ordered ) {
1120 __kmp_itt_ordered_init( gtid );
1121 }; // if
1122#endif /* USE_ITT_BUILD */
1123 }; // if
1124
1125#if USE_ITT_BUILD
1126 // Report loop metadata
1127 if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) {
1128 kmp_uint32 tid = __kmp_tid_from_gtid( gtid );
1129 if (KMP_MASTER_TID(tid)) {
1130 kmp_uint64 schedtype = 0;
1131
1132 switch ( schedule ) {
1133 case kmp_sch_static_chunked:
1134 case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1135 break;
1136 case kmp_sch_static_greedy:
1137 cur_chunk = pr->u.p.parm1;
1138 break;
1139 case kmp_sch_dynamic_chunked:
1140 schedtype = 1;
1141 break;
1142 case kmp_sch_guided_iterative_chunked:
1143 case kmp_sch_guided_analytical_chunked:
1144 schedtype = 2;
1145 break;
1146 default:
1147// Should we put this case under "static"?
1148// case kmp_sch_static_steal:
1149 schedtype = 3;
1150 break;
1151 }
1152 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1153 }
1154 }
1155#endif /* USE_ITT_BUILD */
1156
1157 #ifdef KMP_DEBUG
1158 {
1159 const char * buff;
1160 // create format specifiers before the debug output
1161 buff = __kmp_str_format(
1162 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1163 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1164 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1165 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1166 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1167 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1168 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1169 KD_TRACE(10, ( buff,
1170 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1171 pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1172 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1173 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1174 __kmp_str_free( &buff );
1175 }
1176 #endif
1177 #if ( KMP_STATIC_STEAL_ENABLED )
1178 if ( ___kmp_size_type < 8 ) {
1179 // It cannot be guaranteed that after execution of a loop with some other schedule kind
1180 // all the parm3 variables will contain the same value.
1181 // Even if all parm3 values were the same, there would still be a bad case, such as using 0 and 1
1182 // rather than a program-lifetime increment.
1183 // So a dedicated variable is required; the 'static_steal_counter' is used.
1184 if( schedule == kmp_sch_static_steal ) {
1185 // Other threads will inspect this variable when searching for a victim.
1186 // This is a flag showing that other threads may steal from this thread since then.
1187 volatile T * p = &pr->u.p.static_steal_counter;
1188 *p = *p + 1;
1189 }
1190 }
1191 #endif // ( KMP_STATIC_STEAL_ENABLED )
1192}
1193
1194/*
1195 * For ordered loops, either __kmp_dispatch_finish() should be called after
1196 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1197 * every chunk of iterations. If the ordered section(s) were not executed
1198 * for this iteration (or every iteration in this chunk), we need to set the
1199 * ordered iteration counters so that the next thread can proceed.
1200 */
1201template< typename UT >
1202static void
1203__kmp_dispatch_finish( int gtid, ident_t *loc )
1204{
1205 typedef typename traits_t< UT >::signed_t ST;
1206 kmp_info_t *th = __kmp_threads[ gtid ];
1207
1208 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1209 if ( ! th -> th.th_team -> t.t_serialized ) {
1210
1211 dispatch_private_info_template< UT > * pr =
1212 reinterpret_cast< dispatch_private_info_template< UT >* >
1213 ( th->th.th_dispatch->th_dispatch_pr_current );
1214 dispatch_shared_info_template< UT > volatile * sh =
1215 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1216 ( th->th.th_dispatch->th_dispatch_sh_current );
1217 KMP_DEBUG_ASSERT( pr );
1218 KMP_DEBUG_ASSERT( sh );
1219 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1220 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1221
1222 if ( pr->ordered_bumped ) {
1223 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1224 gtid ) );
1225 pr->ordered_bumped = 0;
1226 } else {
1227 UT lower = pr->u.p.ordered_lower;
1228
1229 #ifdef KMP_DEBUG
1230 {
1231 const char * buff;
1232 // create format specifiers before the debug output
1233 buff = __kmp_str_format(
1234 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1235 traits_t< UT >::spec, traits_t< UT >::spec );
1236 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1237 __kmp_str_free( &buff );
1238 }
1239 #endif
1240
1241 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1242 USE_ITT_BUILD_ARG(NULL)
1243 );
1244 KMP_MB(); /* is this necessary? */
1245 #ifdef KMP_DEBUG
1246 {
1247 const char * buff;
1248 // create format specifiers before the debug output
1249 buff = __kmp_str_format(
1250 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1251 traits_t< UT >::spec, traits_t< UT >::spec );
1252 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1253 __kmp_str_free( &buff );
1254 }
1255 #endif
1256
1257 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1258 } // if
1259 } // if
1260 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1261}
1262
1263#ifdef KMP_GOMP_COMPAT
1264
1265template< typename UT >
1266static void
1267__kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1268{
1269 typedef typename traits_t< UT >::signed_t ST;
1270 kmp_info_t *th = __kmp_threads[ gtid ];
1271
1272 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1273 if ( ! th -> th.th_team -> t.t_serialized ) {
1274// int cid;
1275 dispatch_private_info_template< UT > * pr =
1276 reinterpret_cast< dispatch_private_info_template< UT >* >
1277 ( th->th.th_dispatch->th_dispatch_pr_current );
1278 dispatch_shared_info_template< UT > volatile * sh =
1279 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1280 ( th->th.th_dispatch->th_dispatch_sh_current );
1281 KMP_DEBUG_ASSERT( pr );
1282 KMP_DEBUG_ASSERT( sh );
1283 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1284 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1285
1286// for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1287 UT lower = pr->u.p.ordered_lower;
1288 UT upper = pr->u.p.ordered_upper;
1289 UT inc = upper - lower + 1;
1290
1291 if ( pr->ordered_bumped == inc ) {
1292 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1293 gtid ) );
1294 pr->ordered_bumped = 0;
1295 } else {
1296 inc -= pr->ordered_bumped;
1297
1298 #ifdef KMP_DEBUG
1299 {
1300 const char * buff;
1301 // create format specifiers before the debug output
1302 buff = __kmp_str_format(
1303 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1304 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1305 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1306 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1307 __kmp_str_free( &buff );
1308 }
1309 #endif
1310
1311 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1312 USE_ITT_BUILD_ARG(NULL)
1313 );
1314
1315 KMP_MB(); /* is this necessary? */
1316 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1317 gtid ) );
1318 pr->ordered_bumped = 0;
1319//!!!!! TODO check if the inc should be unsigned, or signed???
1320 #ifdef KMP_DEBUG
1321 {
1322 const char * buff;
1323 // create format specifiers before the debug output
1324 buff = __kmp_str_format(
1325 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1326 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1327 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1328 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1329 __kmp_str_free( &buff );
1330 }
1331 #endif
1332
1333 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1334 }
1335// }
1336 }
1337 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1338}
1339
1340#endif /* KMP_GOMP_COMPAT */
1341
1342template< typename T >
1343static int
1344__kmp_dispatch_next(
1345 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1346) {
1347
1348 typedef typename traits_t< T >::unsigned_t UT;
1349 typedef typename traits_t< T >::signed_t ST;
1350 typedef typename traits_t< T >::floating_t DBL;
1351 static const int ___kmp_size_type = sizeof( UT );
1352
1353 int status;
1354 dispatch_private_info_template< T > * pr;
1355 kmp_info_t * th = __kmp_threads[ gtid ];
1356 kmp_team_t * team = th -> th.th_team;
1357
1358 KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st ); // AC: these cannot be NULL
1359 #ifdef KMP_DEBUG
1360 {
1361 const char * buff;
1362 // create format specifiers before the debug output
1363 buff = __kmp_str_format(
1364 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1365 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1366 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1367 __kmp_str_free( &buff );
1368 }
1369 #endif
1370
1371 if ( team -> t.t_serialized ) {
1372 /* NOTE: serialize this dispatch because we are not at the active level */
1373 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1374 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1375 KMP_DEBUG_ASSERT( pr );
1376
1377 if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1378 *p_lb = 0;
1379 *p_ub = 0;
1380// if ( p_last != NULL )
1381// *p_last = 0;
1382 if ( p_st != NULL )
1383 *p_st = 0;
1384 if ( __kmp_env_consistency_check ) {
1385 if ( pr->pushed_ws != ct_none ) {
1386 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1387 }
1388 }
1389 } else if ( pr->nomerge ) {
1390 kmp_int32 last;
1391 T start;
1392 UT limit, trip, init;
1393 ST incr;
1394 T chunk = pr->u.p.parm1;
1395
1396 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1397
1398 init = chunk * pr->u.p.count++;
1399 trip = pr->u.p.tc - 1;
1400
1401 if ( (status = (init <= trip)) == 0 ) {
1402 *p_lb = 0;
1403 *p_ub = 0;
1404// if ( p_last != NULL )
1405// *p_last = 0;
1406 if ( p_st != NULL )
1407 *p_st = 0;
1408 if ( __kmp_env_consistency_check ) {
1409 if ( pr->pushed_ws != ct_none ) {
1410 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1411 }
1412 }
1413 } else {
1414 start = pr->u.p.lb;
1415 limit = chunk + init - 1;
1416 incr = pr->u.p.st;
1417
1418 if ( (last = (limit >= trip)) != 0 ) {
1419 limit = trip;
1420 #if KMP_OS_WINDOWS
1421 pr->u.p.last_upper = pr->u.p.ub;
1422 #endif /* KMP_OS_WINDOWS */
1423 }
1424 if ( p_last != NULL )
1425 *p_last = last;
1426 if ( p_st != NULL )
1427 *p_st = incr;
1428 if ( incr == 1 ) {
1429 *p_lb = start + init;
1430 *p_ub = start + limit;
1431 } else {
1432 *p_lb = start + init * incr;
1433 *p_ub = start + limit * incr;
1434 }
1435
1436 if ( pr->ordered ) {
1437 pr->u.p.ordered_lower = init;
1438 pr->u.p.ordered_upper = limit;
1439 #ifdef KMP_DEBUG
1440 {
1441 const char * buff;
1442 // create format specifiers before the debug output
1443 buff = __kmp_str_format(
1444 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1445 traits_t< UT >::spec, traits_t< UT >::spec );
1446 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1447 __kmp_str_free( &buff );
1448 }
1449 #endif
1450 } // if
1451 } // if
1452 } else {
1453 pr->u.p.tc = 0;
1454 *p_lb = pr->u.p.lb;
1455 *p_ub = pr->u.p.ub;
1456 #if KMP_OS_WINDOWS
1457 pr->u.p.last_upper = *p_ub;
1458 #endif /* KMP_OS_WINDOWS */
1459 if ( p_last != NULL )
1460 *p_last = TRUE;
1461 if ( p_st != NULL )
1462 *p_st = pr->u.p.st;
1463 } // if
1464 #ifdef KMP_DEBUG
1465 {
1466 const char * buff;
1467 // create format specifiers before the debug output
1468 buff = __kmp_str_format(
1469 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1470 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1471 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1472 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1473 __kmp_str_free( &buff );
1474 }
1475 #endif
1476#if INCLUDE_SSC_MARKS
1477 SSC_MARK_DISPATCH_NEXT();
1478#endif
1479 return status;
1480 } else {
1481 kmp_int32 last = 0;
1482 dispatch_shared_info_template< UT > *sh;
1483 T start;
1484 ST incr;
1485 UT limit, trip, init;
1486
1487 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1488 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1489
1490 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1491 ( th->th.th_dispatch->th_dispatch_pr_current );
1492 KMP_DEBUG_ASSERT( pr );
1493 sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1494 ( th->th.th_dispatch->th_dispatch_sh_current );
1495 KMP_DEBUG_ASSERT( sh );
1496
1497 if ( pr->u.p.tc == 0 ) {
1498 // zero trip count
1499 status = 0;
1500 } else {
1501 switch (pr->schedule) {
1502 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1503 case kmp_sch_static_steal:
1504 {
1505 T chunk = pr->u.p.parm1;
1506
1507 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1508
1509 trip = pr->u.p.tc - 1;
1510
1511 if ( ___kmp_size_type > 4 ) {
1512 // Other threads do not look into the data of this thread,
1513 // so it's not necessary to make volatile casting.
1514 init = ( pr->u.p.count )++;
1515 status = ( init < (UT)pr->u.p.ub );
1516 } else {
1517 typedef union {
1518 struct {
1519 UT count;
1520 T ub;
1521 } p;
1522 kmp_int64 b;
1523 } union_i4;
1524 // All operations on 'count' or 'ub' must be combined atomically together.
1525 // stealing implemented only for 4-byte indexes
1526 {
1527 union_i4 vold, vnew;
1528 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1529 vnew = vold;
1530 vnew.p.count++;
1531 while( ! KMP_COMPARE_AND_STORE_ACQ64(
1532 ( volatile kmp_int64* )&pr->u.p.count,
1533 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1534 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1535 KMP_CPU_PAUSE();
1536 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1537 vnew = vold;
1538 vnew.p.count++;
1539 }
1540 vnew = vold;
1541 init = vnew.p.count;
1542 status = ( init < (UT)vnew.p.ub ) ;
1543 }
1544
1545 if( !status ) {
1546 kmp_info_t **other_threads = team->t.t_threads;
1547 int while_limit = 10;
1548 int while_index = 0;
1549
1550 // TODO: algorithm of searching for a victim
1551 // should be cleaned up and measured
1552 while ( ( !status ) && ( while_limit != ++while_index ) ) {
1553 union_i4 vold, vnew;
1554 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1555 T victimIdx = pr->u.p.parm4;
1556 T oldVictimIdx = victimIdx;
1557 dispatch_private_info_template< T > * victim;
1558
1559 do {
1560 if( !victimIdx ) {
1561 victimIdx = team->t.t_nproc - 1;
1562 } else {
1563 --victimIdx;
1564 }
1565 victim = reinterpret_cast< dispatch_private_info_template< T >* >
1566 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1567 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1568 // TODO: think about a proper place of this test
1569 if ( ( !victim ) ||
1570 ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1571 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1572 // TODO: delay would be nice
1573 continue;
1574 // the victim is not ready yet to participate in stealing
1575 // because the victim is still in kmp_init_dispatch
1576 }
1577 if ( oldVictimIdx == victimIdx ) {
1578 break;
1579 }
1580 pr->u.p.parm4 = victimIdx;
1581
1582 while( 1 ) {
1583 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1584 vnew = vold;
1585
1586 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1587 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1588 break;
1589 }
1590 vnew.p.ub -= (remaining >> 2);
1591 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1592 #pragma warning( push )
1593 // disable warning on pointless comparison of unsigned with 0
1594 #pragma warning( disable: 186 )
1595 KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1596 #pragma warning( pop )
1597 // TODO: Should this be acquire or release?
1598 if ( KMP_COMPARE_AND_STORE_ACQ64(
1599 ( volatile kmp_int64 * )&victim->u.p.count,
1600 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1601 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1602 status = 1;
1603 while_index = 0;
1604 // now update own count and ub
1605 #if KMP_ARCH_X86
1606 // stealing executed on non-KMP_ARCH_X86 only
1607 // Atomic 64-bit write on ia32 is
1608 // unavailable, so we do this in steps.
1609 // This code is not tested.
1610 init = vold.p.count;
1611 pr->u.p.ub = 0;
1612 pr->u.p.count = init + 1;
1613 pr->u.p.ub = vnew.p.count;
1614 #else
1615 init = vnew.p.ub;
1616 vold.p.count = init + 1;
1617 // TODO: is it safe and enough?
1618 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1619 #endif // KMP_ARCH_X86
1620 break;
1621 } // if
1622 KMP_CPU_PAUSE();
1623 } // while (1)
1624 } // while
1625 } // if
1626 } // if
1627 if ( !status ) {
1628 *p_lb = 0;
1629 *p_ub = 0;
1630 if ( p_st != NULL ) *p_st = 0;
1631 } else {
1632 start = pr->u.p.parm2;
1633 init *= chunk;
1634 limit = chunk + init - 1;
1635 incr = pr->u.p.st;
1636
1637 KMP_DEBUG_ASSERT(init <= trip);
1638 if ( (last = (limit >= trip)) != 0 )
1639 limit = trip;
1640 if ( p_st != NULL ) *p_st = incr;
1641
1642 if ( incr == 1 ) {
1643 *p_lb = start + init;
1644 *p_ub = start + limit;
1645 } else {
1646 *p_lb = start + init * incr;
1647 *p_ub = start + limit * incr;
1648 }
1649
1650 if ( pr->ordered ) {
1651 pr->u.p.ordered_lower = init;
1652 pr->u.p.ordered_upper = limit;
1653 #ifdef KMP_DEBUG
1654 {
1655 const char * buff;
1656 // create format specifiers before the debug output
1657 buff = __kmp_str_format(
1658 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1659 traits_t< UT >::spec, traits_t< UT >::spec );
1660 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1661 __kmp_str_free( &buff );
1662 }
1663 #endif
1664 } // if
1665 } // if
1666 break;
1667 } // case
1668 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1669 case kmp_sch_static_balanced:
1670 {
1671 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1672 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1673 pr->u.p.count = 1;
1674 *p_lb = pr->u.p.lb;
1675 *p_ub = pr->u.p.ub;
1676 last = pr->u.p.parm1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001677 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001678 *p_st = pr->u.p.st;
1679 } else { /* no iterations to do */
1680 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1681 }
1682 if ( pr->ordered ) {
1683 #ifdef KMP_DEBUG
1684 {
1685 const char * buff;
1686 // create format specifiers before the debug output
1687 buff = __kmp_str_format(
1688 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1689 traits_t< UT >::spec, traits_t< UT >::spec );
1690 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1691 __kmp_str_free( &buff );
1692 }
1693 #endif
1694 } // if
1695 } // case
1696 break;
1697 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1698 case kmp_sch_static_chunked:
1699 {
1700 T parm1;
1701
1702 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1703 gtid ) );
1704 parm1 = pr->u.p.parm1;
1705
1706 trip = pr->u.p.tc - 1;
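                // Chunks are assigned round-robin: chunk index = count + tid, where the
                // per-thread count advances by nproc after each chunk (see below), so
                // init is the first iteration of this thread's next chunk.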
1707 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1708
1709 if ( (status = (init <= trip)) != 0 ) {
1710 start = pr->u.p.lb;
1711 incr = pr->u.p.st;
1712 limit = parm1 + init - 1;
1713
1714 if ( (last = (limit >= trip)) != 0 )
1715 limit = trip;
1716
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001717 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001718
1719 pr->u.p.count += team->t.t_nproc;
1720
1721 if ( incr == 1 ) {
1722 *p_lb = start + init;
1723 *p_ub = start + limit;
1724 }
1725 else {
1726 *p_lb = start + init * incr;
1727 *p_ub = start + limit * incr;
1728 }
1729
1730 if ( pr->ordered ) {
1731 pr->u.p.ordered_lower = init;
1732 pr->u.p.ordered_upper = limit;
1733 #ifdef KMP_DEBUG
1734 {
1735 const char * buff;
1736 // create format specifiers before the debug output
1737 buff = __kmp_str_format(
1738 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1739 traits_t< UT >::spec, traits_t< UT >::spec );
1740 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1741 __kmp_str_free( &buff );
1742 }
1743 #endif
1744 } // if
1745 } // if
1746 } // case
1747 break;
1748
1749 case kmp_sch_dynamic_chunked:
1750 {
1751 T chunk = pr->u.p.parm1;
1752
1753 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1754 gtid ) );
1755
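                // Atomically claim the next chunk index from the shared iteration
                // counter; init is that chunk's first iteration in the normalized
                // (zero-based, unit-stride) iteration space.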
1756 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1757 trip = pr->u.p.tc - 1;
1758
1759 if ( (status = (init <= trip)) == 0 ) {
1760 *p_lb = 0;
1761 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001762 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001763 } else {
1764 start = pr->u.p.lb;
1765 limit = chunk + init - 1;
1766 incr = pr->u.p.st;
1767
1768 if ( (last = (limit >= trip)) != 0 )
1769 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001770
1771 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001772
1773 if ( incr == 1 ) {
1774 *p_lb = start + init;
1775 *p_ub = start + limit;
1776 } else {
1777 *p_lb = start + init * incr;
1778 *p_ub = start + limit * incr;
1779 }
1780
1781 if ( pr->ordered ) {
1782 pr->u.p.ordered_lower = init;
1783 pr->u.p.ordered_upper = limit;
1784 #ifdef KMP_DEBUG
1785 {
1786 const char * buff;
1787 // create format specifiers before the debug output
1788 buff = __kmp_str_format(
1789 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1790 traits_t< UT >::spec, traits_t< UT >::spec );
1791 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1792 __kmp_str_free( &buff );
1793 }
1794 #endif
1795 } // if
1796 } // if
1797 } // case
1798 break;
1799
1800 case kmp_sch_guided_iterative_chunked:
1801 {
1802 T chunkspec = pr->u.p.parm1;
1803 KD_TRACE(100,
1804 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1805 trip = pr->u.p.tc;
1806 // Start atomic part of calculations
1807 while(1) {
1808 ST remaining; // signed, because can be < 0
1809 init = sh->u.s.iteration; // shared value
1810 remaining = trip - init;
1811 if ( remaining <= 0 ) { // AC: need to compare with 0 first
1812 // nothing to do, don't try atomic op
1813 status = 0;
1814 break;
1815 }
1816 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1817                         // use dynamic-style schedule
1818                         // atomically increment the iteration count, get the old value
1819 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1820 remaining = trip - init;
1821 if (remaining <= 0) {
1822 status = 0; // all iterations got by other threads
1823 } else {
1824 // got some iterations to work on
1825 status = 1;
1826 if ( (T)remaining > chunkspec ) {
1827 limit = init + chunkspec - 1;
1828 } else {
1829 last = 1; // the last chunk
1830 limit = init + remaining - 1;
1831 } // if
1832 } // if
1833 break;
1834 } // if
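                    // Guided step: grab a fixed fraction of the remaining iterations.
                    // parm3 holds a precomputed double factor (roughly 1/(K*nproc))
                    // stored bitwise in the integer field, hence the cast below.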
1835 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1836 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1837 // CAS was successful, chunk obtained
1838 status = 1;
1839 --limit;
1840 break;
1841 } // if
1842 } // while
1843 if ( status != 0 ) {
1844 start = pr->u.p.lb;
1845 incr = pr->u.p.st;
1846 if ( p_st != NULL )
1847 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001848 *p_lb = start + init * incr;
1849 *p_ub = start + limit * incr;
1850 if ( pr->ordered ) {
1851 pr->u.p.ordered_lower = init;
1852 pr->u.p.ordered_upper = limit;
1853 #ifdef KMP_DEBUG
1854 {
1855 const char * buff;
1856 // create format specifiers before the debug output
1857 buff = __kmp_str_format(
1858 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1859 traits_t< UT >::spec, traits_t< UT >::spec );
1860 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1861 __kmp_str_free( &buff );
1862 }
1863 #endif
1864 } // if
1865 } else {
1866 *p_lb = 0;
1867 *p_ub = 0;
1868 if ( p_st != NULL )
1869 *p_st = 0;
1870 } // if
1871 } // case
1872 break;
1873
1874 case kmp_sch_guided_analytical_chunked:
1875 {
1876 T chunkspec = pr->u.p.parm1;
1877 UT chunkIdx;
1878 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1879 /* for storing original FPCW value for Windows* OS on
1880 IA-32 architecture 8-byte version */
1881 unsigned int oldFpcw;
Jim Cownie181b4bb2013-12-23 17:28:57 +00001882 unsigned int fpcwSet = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001883 #endif
1884 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1885 gtid ) );
1886
1887 trip = pr->u.p.tc;
1888
1889 KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1890 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1891
1892 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1893 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1894 if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1895 --trip;
1896 /* use dynamic-style scheduling */
1897 init = chunkIdx * chunkspec + pr->u.p.count;
1898 /* need to verify init > 0 in case of overflow in the above calculation */
1899 if ( (status = (init > 0 && init <= trip)) != 0 ) {
1900 limit = init + chunkspec -1;
1901
1902 if ( (last = (limit >= trip)) != 0 )
1903 limit = trip;
1904 }
1905 break;
1906 } else {
1907 /* use exponential-style scheduling */
1908                         /* The following check is to work around the lack of long double precision on Windows* OS.
1909 This check works around the possible effect that init != 0 for chunkIdx == 0.
1910 */
1911 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1912 /* If we haven't already done so, save original
1913 FPCW and set precision to 64-bit, as Windows* OS
1914 on IA-32 architecture defaults to 53-bit */
1915 if ( !fpcwSet ) {
Jim Cownie181b4bb2013-12-23 17:28:57 +00001916 oldFpcw = _control87(0,0);
1917 _control87(_PC_64,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001918 fpcwSet = 0x30000;
1919 }
1920 #endif
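                        // Chunk number chunkIdx covers the normalized iterations
                        // [trip - remaining(chunkIdx), trip - remaining(chunkIdx+1)),
                        // where remaining(k) is the analytically computed number of
                        // iterations left after k chunks (__kmp_dispatch_guided_remaining).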
1921 if ( chunkIdx ) {
1922 init = __kmp_dispatch_guided_remaining< T >(
1923 trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1924 KMP_DEBUG_ASSERT(init);
1925 init = trip - init;
1926 } else
1927 init = 0;
1928 limit = trip - __kmp_dispatch_guided_remaining< T >(
1929 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1930 KMP_ASSERT(init <= limit);
1931 if ( init < limit ) {
1932 KMP_DEBUG_ASSERT(limit <= trip);
1933 --limit;
1934 status = 1;
1935 break;
1936 } // if
1937 } // if
1938 } // while (1)
1939 #if KMP_OS_WINDOWS && KMP_ARCH_X86
Jim Cownie181b4bb2013-12-23 17:28:57 +00001940 /* restore FPCW if necessary
1941 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1942 */
1943 if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1944 _control87(oldFpcw,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001945 #endif
1946 if ( status != 0 ) {
1947 start = pr->u.p.lb;
1948 incr = pr->u.p.st;
1949 if ( p_st != NULL )
1950 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001951 *p_lb = start + init * incr;
1952 *p_ub = start + limit * incr;
1953 if ( pr->ordered ) {
1954 pr->u.p.ordered_lower = init;
1955 pr->u.p.ordered_upper = limit;
1956 #ifdef KMP_DEBUG
1957 {
1958 const char * buff;
1959 // create format specifiers before the debug output
1960 buff = __kmp_str_format(
1961 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1962 traits_t< UT >::spec, traits_t< UT >::spec );
1963 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1964 __kmp_str_free( &buff );
1965 }
1966 #endif
1967 }
1968 } else {
1969 *p_lb = 0;
1970 *p_ub = 0;
1971 if ( p_st != NULL )
1972 *p_st = 0;
1973 }
1974 } // case
1975 break;
1976
1977 case kmp_sch_trapezoidal:
1978 {
1979 UT index;
1980 T parm2 = pr->u.p.parm2;
1981 T parm3 = pr->u.p.parm3;
1982 T parm4 = pr->u.p.parm4;
1983 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
1984 gtid ) );
1985
1986 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
1987
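                // Trapezoidal chunks shrink linearly: chunk k has parm2 - k*parm4
                // iterations, so the sum of the first `index` chunk sizes (an
                // arithmetic series) gives the first iteration of chunk `index`.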
1988 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
1989 trip = pr->u.p.tc - 1;
1990
1991 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
1992 *p_lb = 0;
1993 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001994 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001995 } else {
1996 start = pr->u.p.lb;
1997 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
1998 incr = pr->u.p.st;
1999
2000 if ( (last = (limit >= trip)) != 0 )
2001 limit = trip;
2002
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002003 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002004
2005 if ( incr == 1 ) {
2006 *p_lb = start + init;
2007 *p_ub = start + limit;
2008 } else {
2009 *p_lb = start + init * incr;
2010 *p_ub = start + limit * incr;
2011 }
2012
2013 if ( pr->ordered ) {
2014 pr->u.p.ordered_lower = init;
2015 pr->u.p.ordered_upper = limit;
2016 #ifdef KMP_DEBUG
2017 {
2018 const char * buff;
2019 // create format specifiers before the debug output
2020 buff = __kmp_str_format(
2021 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2022 traits_t< UT >::spec, traits_t< UT >::spec );
2023 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2024 __kmp_str_free( &buff );
2025 }
2026 #endif
2027 } // if
2028 } // if
2029 } // case
2030 break;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002031 default:
2032 {
2033 status = 0; // to avoid complaints on uninitialized variable use
2034 __kmp_msg(
2035 kmp_ms_fatal, // Severity
2036 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2037 KMP_HNT( GetNewerLibrary ), // Hint
2038 __kmp_msg_null // Variadic argument list terminator
2039 );
2040 }
2041 break;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002042 } // switch
2043 } // if tc == 0;
2044
2045 if ( status == 0 ) {
2046 UT num_done;
2047
2048 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2049 #ifdef KMP_DEBUG
2050 {
2051 const char * buff;
2052 // create format specifiers before the debug output
2053 buff = __kmp_str_format(
2054 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2055 traits_t< UT >::spec );
2056 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2057 __kmp_str_free( &buff );
2058 }
2059 #endif
2060
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002061 if ( (ST)num_done == team->t.t_nproc-1 ) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00002062 /* NOTE: release this buffer to be reused */
2063
2064 KMP_MB(); /* Flush all pending memory write invalidates. */
2065
2066 sh->u.s.num_done = 0;
2067 sh->u.s.iteration = 0;
2068
2069 /* TODO replace with general release procedure? */
2070 if ( pr->ordered ) {
2071 sh->u.s.ordered_iteration = 0;
2072 }
2073
2074 KMP_MB(); /* Flush all pending memory write invalidates. */
2075
2076 sh -> buffer_index += KMP_MAX_DISP_BUF;
2077 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2078 gtid, sh->buffer_index) );
2079
2080 KMP_MB(); /* Flush all pending memory write invalidates. */
2081
2082 } // if
2083 if ( __kmp_env_consistency_check ) {
2084 if ( pr->pushed_ws != ct_none ) {
2085 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2086 }
2087 }
2088
2089 th -> th.th_dispatch -> th_deo_fcn = NULL;
2090 th -> th.th_dispatch -> th_dxo_fcn = NULL;
2091 th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2092 th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2093 } // if (status == 0)
2094#if KMP_OS_WINDOWS
2095 else if ( last ) {
2096 pr->u.p.last_upper = pr->u.p.ub;
2097 }
2098#endif /* KMP_OS_WINDOWS */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002099 if ( p_last != NULL && status != 0 )
2100 *p_last = last;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002101 } // if
2102
2103 #ifdef KMP_DEBUG
2104 {
2105 const char * buff;
2106 // create format specifiers before the debug output
2107 buff = __kmp_str_format(
2108 "__kmp_dispatch_next: T#%%d normal case: " \
2109 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2110 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2111 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2112 __kmp_str_free( &buff );
2113 }
2114 #endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002115#if INCLUDE_SSC_MARKS
2116 SSC_MARK_DISPATCH_NEXT();
2117#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +00002118 return status;
2119}
2120
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002121template< typename T >
2122static void
2123__kmp_dist_get_bounds(
2124 ident_t *loc,
2125 kmp_int32 gtid,
2126 kmp_int32 *plastiter,
2127 T *plower,
2128 T *pupper,
2129 typename traits_t< T >::signed_t incr
2130) {
2131 KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
2132 typedef typename traits_t< T >::unsigned_t UT;
2133 typedef typename traits_t< T >::signed_t ST;
2134 register kmp_uint32 team_id;
2135 register kmp_uint32 nteams;
2136 register UT trip_count;
2137 register kmp_team_t *team;
2138 kmp_info_t * th;
2139
2140 KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2141 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2142 #ifdef KMP_DEBUG
2143 {
2144 const char * buff;
2145 // create format specifiers before the debug output
2146 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2147 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2148 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2149 traits_t< T >::spec );
2150 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2151 __kmp_str_free( &buff );
2152 }
2153 #endif
2154
2155 if( __kmp_env_consistency_check ) {
2156 if( incr == 0 ) {
2157 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2158 }
2159 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2160 // The loop is illegal.
2161 // Some zero-trip loops maintained by compiler, e.g.:
2162 // for(i=10;i<0;++i) // lower >= upper - run-time check
2163 // for(i=0;i>10;--i) // lower <= upper - run-time check
2164 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2165 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2166 // Compiler does not check the following illegal loops:
2167 // for(i=0;i<10;i+=incr) // where incr<0
2168 // for(i=10;i>0;i-=incr) // where incr<0
2169 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2170 }
2171 }
2172 th = __kmp_threads[gtid];
2173 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2174 team = th->th.th_team;
2175 #if OMP_40_ENABLED
2176 nteams = th->th.th_teams_size.nteams;
2177 #endif
2178 team_id = team->t.t_master_tid;
2179 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2180
2181 // compute global trip count
2182 if( incr == 1 ) {
2183 trip_count = *pupper - *plower + 1;
2184 } else if(incr == -1) {
2185 trip_count = *plower - *pupper + 1;
2186 } else {
2187 trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2188 }
2189 if( trip_count <= nteams ) {
2190 KMP_DEBUG_ASSERT(
2191 __kmp_static == kmp_sch_static_greedy || \
2192 __kmp_static == kmp_sch_static_balanced
2193 ); // Unknown static scheduling type.
2194 // only some teams get single iteration, others get nothing
2195        // only some teams get a single iteration, the others get nothing
2196 *pupper = *plower = *plower + team_id * incr;
2197 } else {
2198 *plower = *pupper + incr; // zero-trip loop
2199 }
2200 if( plastiter != NULL )
2201 *plastiter = ( team_id == trip_count - 1 );
2202 } else {
2203 if( __kmp_static == kmp_sch_static_balanced ) {
2204 register UT chunk = trip_count / nteams;
2205 register UT extras = trip_count % nteams;
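            // Balanced split: the first `extras` teams get chunk+1 iterations each;
            // the lower bound skips over everything owned by lower-numbered teams.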
2206 *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2207 *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2208 if( plastiter != NULL )
2209 *plastiter = ( team_id == nteams - 1 );
2210 } else {
2211 register T chunk_inc_count =
2212 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2213 register T upper = *pupper;
2214 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2215 // Unknown static scheduling type.
2216 *plower += team_id * chunk_inc_count;
2217 *pupper = *plower + chunk_inc_count - incr;
2218 // Check/correct bounds if needed
2219 if( incr > 0 ) {
2220 if( *pupper < *plower )
2221 *pupper = i_maxmin< T >::mx;
2222 if( plastiter != NULL )
2223 *plastiter = *plower <= upper && *pupper > upper - incr;
2224 if( *pupper > upper )
2225 *pupper = upper; // tracker C73258
2226 } else {
2227 if( *pupper > *plower )
2228 *pupper = i_maxmin< T >::mn;
2229 if( plastiter != NULL )
2230 *plastiter = *plower >= upper && *pupper < upper - incr;
2231 if( *pupper < upper )
2232 *pupper = upper; // tracker C73258
2233 }
2234 }
2235 }
2236}
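
// Worked example (illustrative only): lower=0, upper=9, incr=1, nteams=4 with the
// balanced split above yields trip_count=10, chunk=2, extras=2, so the teams get
// [0..2], [3..5], [6..7], [8..9] -- the two extra iterations go to teams 0 and 1.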
2237
Jim Cownie5e8470a2013-09-27 10:38:44 +00002238//-----------------------------------------------------------------------------------------
2239// Dispatch routines
2240// Transfer call to template< type T >
2241// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2242// T lb, T ub, ST st, ST chunk )
2243extern "C" {
2244
2245/*!
2246@ingroup WORK_SHARING
2247@{
2248@param loc Source location
2249@param gtid Global thread id
2250@param schedule Schedule type
2251@param lb Lower bound
2252@param ub Upper bound
2253@param st Step (or increment if you prefer)
2254@param chunk The chunk size to block with
2255
2256This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2257These functions are all identical apart from the types of the arguments.
2258*/
2259
2260void
2261__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2262 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2263{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002264 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002265 KMP_DEBUG_ASSERT( __kmp_init_serial );
2266 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2267}
2268/*!
2269See @ref __kmpc_dispatch_init_4
2270*/
2271void
2272__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2273 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2274{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002275 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002276 KMP_DEBUG_ASSERT( __kmp_init_serial );
2277 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2278}
2279
2280/*!
2281See @ref __kmpc_dispatch_init_4
2282*/
2283void
2284__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2285 kmp_int64 lb, kmp_int64 ub,
2286 kmp_int64 st, kmp_int64 chunk )
2287{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002288 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002289 KMP_DEBUG_ASSERT( __kmp_init_serial );
2290 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2291}
2292
2293/*!
2294See @ref __kmpc_dispatch_init_4
2295*/
2296void
2297__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2298 kmp_uint64 lb, kmp_uint64 ub,
2299 kmp_int64 st, kmp_int64 chunk )
2300{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002301 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002302 KMP_DEBUG_ASSERT( __kmp_init_serial );
2303 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2304}
2305
2306/*!
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002307See @ref __kmpc_dispatch_init_4
2308
2309These functions differ from the __kmpc_dispatch_init set of functions in that they
2310are called for the composite distribute parallel for construct. Thus, before
2311dispatching the regular iterations, we first compute the per-team iteration space.
2312
2313These functions are all identical apart from the types of the arguments.
2314*/
2315void
2316__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2317 kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2318{
2319 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2320 KMP_DEBUG_ASSERT( __kmp_init_serial );
2321 __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2322 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2323}
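// Note: the __kmpc_dist_dispatch_init_* entry points first narrow [lb, ub] to this
// team's share via __kmp_dist_get_bounds (which also reports the last-iteration flag
// through p_last), then run the ordinary per-team __kmp_dispatch_init on that range.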
2324
2325void
2326__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2327 kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2328{
2329 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2330 KMP_DEBUG_ASSERT( __kmp_init_serial );
2331 __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2332 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2333}
2334
2335void
2336__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2337 kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2338{
2339 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2340 KMP_DEBUG_ASSERT( __kmp_init_serial );
2341 __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2342 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2343}
2344
2345void
2346__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2347 kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2348{
2349 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2350 KMP_DEBUG_ASSERT( __kmp_init_serial );
2351 __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2352 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2353}
2354
2355/*!
Jim Cownie5e8470a2013-09-27 10:38:44 +00002356@param loc Source code location
2357@param gtid Global thread id
2358@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2359@param p_lb Pointer to the lower bound for the next chunk of work
2360@param p_ub Pointer to the upper bound for the next chunk of work
2361@param p_st Pointer to the stride for the next chunk of work
2362@return one if there is work to be done, zero otherwise
2363
2364Get the next dynamically allocated chunk of work for this thread.
2365If there is no more work, then the lb,ub and stride need not be modified.
2366*/
2367int
2368__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2369 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2370{
2371 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2372}
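
/*
  Illustrative use of the dispatch API (a sketch, not the exact code any particular
  compiler emits). For a loop such as

      #pragma omp for schedule(dynamic, 4)
      for (kmp_int32 i = lo; i <= hi; ++i) body(i);

  the generated code follows roughly this pattern:

      kmp_int32 lb, ub, st, last;
      __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, lo, hi, 1, 4);
      while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
          for (kmp_int32 i = lb; i <= ub; i += st)
              body(i);              // lb and ub are inclusive bounds for this chunk
      }
      // a zero return means there is no more work for this thread

  Here loc, gtid, lo, hi and body() stand in for values the compiler supplies.
*/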
2373
2374/*!
2375See @ref __kmpc_dispatch_next_4
2376*/
2377int
2378__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2379 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2380{
2381 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2382}
2383
2384/*!
2385See @ref __kmpc_dispatch_next_4
2386*/
2387int
2388__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2389 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2390{
2391 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2392}
2393
2394/*!
2395See @ref __kmpc_dispatch_next_4
2396*/
2397int
2398__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2399 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2400{
2401 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2402}
2403
2404/*!
2405@param loc Source code location
2406@param gtid Global thread id
2407
2408Mark the end of a dynamic loop.
2409*/
2410void
2411__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2412{
2413 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2414}
2415
2416/*!
2417See @ref __kmpc_dispatch_fini_4
2418*/
2419void
2420__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2421{
2422 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2423}
2424
2425/*!
2426See @ref __kmpc_dispatch_fini_4
2427*/
2428void
2429__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2430{
2431 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2432}
2433
2434/*!
2435See @ref __kmpc_dispatch_fini_4
2436*/
2437void
2438__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2439{
2440 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2441}
2442/*! @} */
2443
2444//-----------------------------------------------------------------------------------------
2445// Non-template routines from kmp_dispatch.cpp used in other sources
2446
2447kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2448 return value == checker;
2449}
2450
2451kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2452 return value != checker;
2453}
2454
2455kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2456 return value < checker;
2457}
2458
2459kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2460 return value >= checker;
2461}
2462
2463kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2464 return value <= checker;
2465}
2466kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2467 return value == checker;
2468}
2469
2470kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2471 return value != checker;
2472}
2473
2474kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2475 return value < checker;
2476}
2477
2478kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2479 return value >= checker;
2480}
2481
2482kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2483 return value <= checker;
2484}
2485
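// Spin until pred(*spinner, checker) becomes true, yielding to the OS when the
// machine is oversubscribed or after spinning for a while; the value of *spinner
// that satisfied the predicate is returned to the caller.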
2486kmp_uint32
2487__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2488 kmp_uint32 checker,
2489 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2490 , void * obj // Higher-level synchronization object, or NULL.
2491 )
2492{
2493 // note: we may not belong to a team at this point
2494 register volatile kmp_uint32 * spin = spinner;
2495 register kmp_uint32 check = checker;
2496 register kmp_uint32 spins;
2497 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2498 register kmp_uint32 r;
2499
2500 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2501 KMP_INIT_YIELD( spins );
2502 // main wait spin loop
2503 while(!f(r = TCR_4(*spin), check)) {
2504 KMP_FSYNC_SPIN_PREPARE( obj );
2505 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2506 It causes problems with infinite recursion because of exit lock */
2507 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2508 __kmp_abort_thread(); */
2509
Jim Cownie5e8470a2013-09-27 10:38:44 +00002510 /* if we have waited a bit, or are oversubscribed, yield */
2511 /* pause is in the following code */
2512 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2513 KMP_YIELD_SPIN( spins );
2514 }
2515 KMP_FSYNC_SPIN_ACQUIRED( obj );
2516 return r;
2517}
2518
2519kmp_uint64
2520__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2521 kmp_uint64 checker,
2522 kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2523 , void * obj // Higher-level synchronization object, or NULL.
2524 )
2525{
2526 // note: we may not belong to a team at this point
2527 register volatile kmp_uint64 * spin = spinner;
2528 register kmp_uint64 check = checker;
2529 register kmp_uint32 spins;
2530 register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2531 register kmp_uint64 r;
2532
2533 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2534 KMP_INIT_YIELD( spins );
2535 // main wait spin loop
2536 while(!f(r = *spin, check))
2537 {
2538 KMP_FSYNC_SPIN_PREPARE( obj );
2539 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2540 It causes problems with infinite recursion because of exit lock */
2541 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2542 __kmp_abort_thread(); */
2543
Jim Cownie5e8470a2013-09-27 10:38:44 +00002544 // if we are oversubscribed,
2545     // or have waited a bit (and KMP_LIBRARY=throughput), then yield
2546 // pause is in the following code
2547 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2548 KMP_YIELD_SPIN( spins );
2549 }
2550 KMP_FSYNC_SPIN_ACQUIRED( obj );
2551 return r;
2552}
2553
2554} // extern "C"
2555
2556#ifdef KMP_GOMP_COMPAT
2557
2558void
2559__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2560 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2561 kmp_int32 chunk, int push_ws )
2562{
2563 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2564 push_ws );
2565}
2566
2567void
2568__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2569 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2570 kmp_int32 chunk, int push_ws )
2571{
2572 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2573 push_ws );
2574}
2575
2576void
2577__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2578 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2579 kmp_int64 chunk, int push_ws )
2580{
2581 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2582 push_ws );
2583}
2584
2585void
2586__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2587 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2588 kmp_int64 chunk, int push_ws )
2589{
2590 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2591 push_ws );
2592}
2593
2594void
2595__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2596{
2597 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2598}
2599
2600void
2601__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2602{
2603 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2604}
2605
2606void
2607__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2608{
2609 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2610}
2611
2612void
2613__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2614{
2615 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2616}
2617
2618#endif /* KMP_GOMP_COMPAT */
2619
2620/* ------------------------------------------------------------------------ */
2621/* ------------------------------------------------------------------------ */
2622