1/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3 * $Revision: 42624 $
4 * $Date: 2013-08-27 10:53:11 -0500 (Tue, 27 Aug 2013) $
5 */
6
7
8//===----------------------------------------------------------------------===//
9//
10// The LLVM Compiler Infrastructure
11//
12// This file is dual licensed under the MIT and the University of Illinois Open
13// Source Licenses. See LICENSE.txt for details.
14//
15//===----------------------------------------------------------------------===//
16
17
18/*
19 * Dynamic scheduling initialization and dispatch.
20 *
21 * NOTE: __kmp_nth is a constant inside any dispatch loop; however,
22 * it may change between parallel regions. __kmp_max_nth
23 * is the largest value __kmp_nth may take, and 1 is the smallest.
24 *
25 */
26
27/* ------------------------------------------------------------------------ */
28/* ------------------------------------------------------------------------ */
29
30#include "kmp.h"
31#include "kmp_i18n.h"
32#include "kmp_itt.h"
33#include "kmp_str.h"
34#include "kmp_error.h"
35#if KMP_OS_WINDOWS && KMP_ARCH_X86
36 #include <float.h>
37#endif
38
39/* ------------------------------------------------------------------------ */
40/* ------------------------------------------------------------------------ */
41
42#ifdef KMP_STATIC_STEAL_ENABLED
43
44 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
45 template< typename T >
46 struct dispatch_private_infoXX_template {
47 typedef typename traits_t< T >::unsigned_t UT;
48 typedef typename traits_t< T >::signed_t ST;
49 UT count; // unsigned
50 T ub;
51 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
52 T lb;
53 ST st; // signed
54 UT tc; // unsigned
55 T static_steal_counter; // for static_steal only; maybe better to put after ub
56
57 /* parm[1-4] are used in different ways by different scheduling algorithms */
58
59 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
60 // a) parm3 is properly aligned and
61 // b) all parm1-4 are in the same cache line.
62 // Because parm1-4 are used together, performance seems to be better
63 // if they are in the same line (not measured though).
64
65 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
66 T parm1;
67 T parm2;
68 T parm3;
69 T parm4;
70 };
71
72 UT ordered_lower; // unsigned
73 UT ordered_upper; // unsigned
74 #if KMP_OS_WINDOWS
75 T last_upper;
76 #endif /* KMP_OS_WINDOWS */
77 };
78
79#else /* KMP_STATIC_STEAL_ENABLED */
80
81 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
82 template< typename T >
83 struct dispatch_private_infoXX_template {
84 typedef typename traits_t< T >::unsigned_t UT;
85 typedef typename traits_t< T >::signed_t ST;
86 T lb;
87 T ub;
88 ST st; // signed
89 UT tc; // unsigned
90
91 T parm1;
92 T parm2;
93 T parm3;
94 T parm4;
95
96 UT count; // unsigned
97
98 UT ordered_lower; // unsigned
99 UT ordered_upper; // unsigned
100 #if KMP_OS_WINDOWS
101 T last_upper;
102 #endif /* KMP_OS_WINDOWS */
103 };
104
105#endif /* KMP_STATIC_STEAL_ENABLED */
106
107// replaces dispatch_private_info structure and dispatch_private_info_t type
108template< typename T >
109struct KMP_ALIGN_CACHE dispatch_private_info_template {
110 // duplicate the alignment here, otherwise the size of the structure is not correct in our compiler
111 union KMP_ALIGN_CACHE private_info_tmpl {
112 dispatch_private_infoXX_template< T > p;
113 dispatch_private_info64_t p64;
114 } u;
115 enum sched_type schedule; /* scheduling algorithm */
116 kmp_uint32 ordered; /* ordered clause specified */
117 kmp_uint32 ordered_bumped;
118 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making order
119 dispatch_private_info * next; /* stack of buffers for nest of serial regions */
120 kmp_uint32 nomerge; /* don't merge iters if serialized */
121 kmp_uint32 type_size;
122 enum cons_type pushed_ws;
123};
124
125
126// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
127template< typename UT >
128struct dispatch_shared_infoXX_template {
129 /* chunk index under dynamic, number of idle threads under static-steal;
130 iteration index otherwise */
131 volatile UT iteration;
132 volatile UT num_done;
133 volatile UT ordered_iteration;
134 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
135};
136
137// replaces dispatch_shared_info structure and dispatch_shared_info_t type
138template< typename UT >
139struct dispatch_shared_info_template {
140 // we need union here to keep the structure size
141 union shared_info_tmpl {
142 dispatch_shared_infoXX_template< UT > s;
143 dispatch_shared_info64_t s64;
144 } u;
145 volatile kmp_uint32 buffer_index;
146};
147
148/* ------------------------------------------------------------------------ */
149/* ------------------------------------------------------------------------ */
150
151static void
152__kmp_static_delay( int arg )
153{
154 /* Work around weird code-gen bug that causes assert to trip */
155 #if KMP_ARCH_X86_64 && KMP_OS_LINUX
156 #else
157 KMP_ASSERT( arg >= 0 );
158 #endif
159}
160
161static void
162__kmp_static_yield( int arg )
163{
164 __kmp_yield( arg );
165}
166
167#undef USE_TEST_LOCKS
168
169// test_then_add template (general template should NOT be used)
170template< typename T >
171static __forceinline T
172test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
173
174template<>
175__forceinline kmp_int32
176test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
177{
178 kmp_int32 r;
179 r = KMP_TEST_THEN_ADD32( p, d );
180 return r;
181}
182
183template<>
184__forceinline kmp_int64
185test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
186{
187 kmp_int64 r;
188 r = KMP_TEST_THEN_ADD64( p, d );
189 return r;
190}
191
192// test_then_inc_acq template (general template should NOT be used)
193template< typename T >
194static __forceinline T
195test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
196
197template<>
198__forceinline kmp_int32
199test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
200{
201 kmp_int32 r;
202 r = KMP_TEST_THEN_INC_ACQ32( p );
203 return r;
204}
205
206template<>
207__forceinline kmp_int64
208test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
209{
210 kmp_int64 r;
211 r = KMP_TEST_THEN_INC_ACQ64( p );
212 return r;
213}
214
215// test_then_inc template (general template should NOT be used)
216template< typename T >
217static __forceinline T
218test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
219
220template<>
221__forceinline kmp_int32
222test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
223{
224 kmp_int32 r;
225 r = KMP_TEST_THEN_INC32( p );
226 return r;
227}
228
229template<>
230__forceinline kmp_int64
231test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
232{
233 kmp_int64 r;
234 r = KMP_TEST_THEN_INC64( p );
235 return r;
236}
237
238// compare_and_swap template (general template should NOT be used)
239template< typename T >
240static __forceinline kmp_int32
241compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
242
243template<>
244__forceinline kmp_int32
245compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
246{
247 return KMP_COMPARE_AND_STORE_REL32( p, c, s );
248}
249
250template<>
251__forceinline kmp_int32
252compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
253{
254 return KMP_COMPARE_AND_STORE_REL64( p, c, s );
255}
256
257/*
258 Spin wait loop that first does pause, then yield.
259 Waits until function returns non-zero when called with *spinner and check.
260 Does NOT put threads to sleep.
261#if USE_ITT_BUILD
262 Arguments:
263 obj -- the higher-level synchronization object to report to ittnotify. It is used to report
264 locks consistently. For example, if the lock is acquired immediately, its address is
265 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
266 immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
267 address, not the address of the low-level spinner.
268#endif // USE_ITT_BUILD
269*/
270template< typename UT >
271// ToDo: make inline function (move to header file for icl)
272static UT // unsigned 4- or 8-byte type
273__kmp_wait_yield( volatile UT * spinner,
274 UT checker,
275 kmp_uint32 (* pred)( UT, UT )
276 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
277 )
278{
279 // note: we may not belong to a team at this point
280 register volatile UT * spin = spinner;
281 register UT check = checker;
282 register kmp_uint32 spins;
283 register kmp_uint32 (*f) ( UT, UT ) = pred;
284 register UT r;
285
286 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
287 KMP_INIT_YIELD( spins );
288 // main wait spin loop
289 while(!f(r = *spin, check))
290 {
291 KMP_FSYNC_SPIN_PREPARE( obj );
292 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
293 It causes problems with infinite recursion because of exit lock */
294 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
295 __kmp_abort_thread(); */
296
297 __kmp_static_delay(TRUE);
298
299 // If we are oversubscribed,
300 // or have waited a bit (and KMP_LIBRARY=throughput), then yield;
301 // the pause is in the following code.
302 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
303 KMP_YIELD_SPIN( spins );
304 }
305 KMP_FSYNC_SPIN_ACQUIRED( obj );
306 return r;
307}
308
309template< typename UT >
310static kmp_uint32 __kmp_eq( UT value, UT checker) {
311 return value == checker;
312}
313
314template< typename UT >
315static kmp_uint32 __kmp_neq( UT value, UT checker) {
316 return value != checker;
317}
318
319template< typename UT >
320static kmp_uint32 __kmp_lt( UT value, UT checker) {
321 return value < checker;
322}
323
324template< typename UT >
325static kmp_uint32 __kmp_ge( UT value, UT checker) {
326 return value >= checker;
327}
328
329template< typename UT >
330static kmp_uint32 __kmp_le( UT value, UT checker) {
331 return value <= checker;
332}
333
334
335/* ------------------------------------------------------------------------ */
336/* ------------------------------------------------------------------------ */
337
338static void
339__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
340{
341 kmp_info_t *th;
342
343 KMP_DEBUG_ASSERT( gtid_ref );
344
345 if ( __kmp_env_consistency_check ) {
346 th = __kmp_threads[*gtid_ref];
347 if ( th -> th.th_root -> r.r_active
348 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
349 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
350 }
351 }
352}
353
354template< typename UT >
355static void
356__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
357{
358 typedef typename traits_t< UT >::signed_t ST;
359 dispatch_private_info_template< UT > * pr;
360
361 int gtid = *gtid_ref;
362// int cid = *cid_ref;
363 kmp_info_t *th = __kmp_threads[ gtid ];
364 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
365
366 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
367 if ( __kmp_env_consistency_check ) {
368 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
369 ( th -> th.th_dispatch -> th_dispatch_pr_current );
370 if ( pr -> pushed_ws != ct_none ) {
371 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
372 }
373 }
374
375 if ( ! th -> th.th_team -> t.t_serialized ) {
376 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
377 ( th -> th.th_dispatch -> th_dispatch_sh_current );
378 UT lower;
379
380 if ( ! __kmp_env_consistency_check ) {
381 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
382 ( th -> th.th_dispatch -> th_dispatch_pr_current );
383 }
384 lower = pr->u.p.ordered_lower;
385
386 #if ! defined( KMP_GOMP_COMPAT )
387 if ( __kmp_env_consistency_check ) {
388 if ( pr->ordered_bumped ) {
389 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
390 __kmp_error_construct2(
391 kmp_i18n_msg_CnsMultipleNesting,
392 ct_ordered_in_pdo, loc_ref,
393 & p->stack_data[ p->w_top ]
394 );
395 }
396 }
397 #endif /* !defined(KMP_GOMP_COMPAT) */
398
399 KMP_MB();
400 #ifdef KMP_DEBUG
401 {
402 const char * buff;
403 // create format specifiers before the debug output
404 buff = __kmp_str_format(
405 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
406 traits_t< UT >::spec, traits_t< UT >::spec );
407 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
408 __kmp_str_free( &buff );
409 }
410 #endif
411
412 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
413 USE_ITT_BUILD_ARG( NULL )
414 );
415 KMP_MB(); /* is this necessary? */
416 #ifdef KMP_DEBUG
417 {
418 const char * buff;
419 // create format specifiers before the debug output
420 buff = __kmp_str_format(
421 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
422 traits_t< UT >::spec, traits_t< UT >::spec );
423 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
424 __kmp_str_free( &buff );
425 }
426 #endif
427 }
428 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
429}
430
431static void
432__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
433{
434 kmp_info_t *th;
435
436 if ( __kmp_env_consistency_check ) {
437 th = __kmp_threads[*gtid_ref];
438 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
439 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
440 }
441 }
442}
443
444template< typename UT >
445static void
446__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
447{
448 typedef typename traits_t< UT >::signed_t ST;
449 dispatch_private_info_template< UT > * pr;
450
451 int gtid = *gtid_ref;
452// int cid = *cid_ref;
453 kmp_info_t *th = __kmp_threads[ gtid ];
454 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
455
456 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
457 if ( __kmp_env_consistency_check ) {
458 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
459 ( th -> th.th_dispatch -> th_dispatch_pr_current );
460 if ( pr -> pushed_ws != ct_none ) {
461 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
462 }
463 }
464
465 if ( ! th -> th.th_team -> t.t_serialized ) {
466 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
467 ( th -> th.th_dispatch -> th_dispatch_sh_current );
468
469 if ( ! __kmp_env_consistency_check ) {
470 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
471 ( th -> th.th_dispatch -> th_dispatch_pr_current );
472 }
473
474 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
475 #if ! defined( KMP_GOMP_COMPAT )
476 if ( __kmp_env_consistency_check ) {
477 if ( pr->ordered_bumped != 0 ) {
478 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
479 /* How to test it? - OM */
480 __kmp_error_construct2(
481 kmp_i18n_msg_CnsMultipleNesting,
482 ct_ordered_in_pdo, loc_ref,
483 & p->stack_data[ p->w_top ]
484 );
485 }
486 }
487 #endif /* !defined(KMP_GOMP_COMPAT) */
488
489 KMP_MB(); /* Flush all pending memory write invalidates. */
490
491 pr->ordered_bumped += 1;
492
493 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
494 gtid, pr->ordered_bumped ) );
495
496 KMP_MB(); /* Flush all pending memory write invalidates. */
497
498 /* TODO use general release procedure? */
499 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
500
501 KMP_MB(); /* Flush all pending memory write invalidates. */
502 }
503 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
504}
505
506/* Computes and returns x to the power of y, where y must be a non-negative integer */
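// Worked example (illustrative values, not taken from the code): __kmp_pow(0.875L, 5)
// uses exponentiation by squaring on the bits of y = 5 (binary 101): s = 0.875 after
// bit 0, x becomes 0.875^2 = 0.765625 and then 0.875^4 ~= 0.586182, and the final
// multiply gives s = 0.875 * 0.586182 ~= 0.5129, i.e. 0.875^5, in O(log y) multiplies.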
507template< typename UT >
508static __forceinline long double
509__kmp_pow(long double x, UT y) {
510 long double s=1.0L;
511
512 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
513 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
514 while(y) {
515 if ( y & 1 )
516 s *= x;
517 x *= x;
518 y >>= 1;
519 }
520 return s;
521}
522
523/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
524 (the total number of unassigned iterations in chunks with index greater than or equal to idx).
525 __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
526 (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
527*/
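// Worked example (illustrative values): with tc = 1000, base = 0.875 and idx = 8,
// 1000 * 0.875^8 ~= 343.6, so the function reports 344 iterations still unassigned
// after the first 8 chunks (the result is rounded up unless x is exactly integral).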
528template< typename T >
529static __inline typename traits_t< T >::unsigned_t
530__kmp_dispatch_guided_remaining(
531 T tc,
532 typename traits_t< T >::floating_t base,
533 typename traits_t< T >::unsigned_t idx
534) {
535 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
536 least for ICL 8.1, long double arithmetic may not really have
537 long double precision, even with /Qlong_double. Currently, we
538 workaround that in the caller code, by manipulating the FPCW for
539 Windows* OS on IA-32 architecture. The lack of precision is not
540 expected to be a correctness issue, though.
541 */
542 typedef typename traits_t< T >::unsigned_t UT;
543
544 long double x = tc * __kmp_pow< UT >(base, idx);
545 UT r = (UT) x;
546 if ( x == r )
547 return r;
548 return r + 1;
549}
550
551// Parameters of the guided-iterative algorithm:
552// p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
553// p3 = 1 / ( n * nproc ) // remaining iterations multiplier
554 // By default n = 2. For example, with n = 3 the chunk distribution will be flatter.
555 // With n = 1 the first chunk is the same as for the static schedule, i.e. trip / nproc.
556static int guided_int_param = 2;
557static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
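// Worked example (illustrative values): with the default n = 2, nproc = 4 and chunk = 7,
// p2 = 2 * 4 * (7 + 1) = 64 and p3 = 1 / (2 * 4) = 0.125, i.e. each thread grabs about
// 1/8 of the remaining iterations per request until fewer than 64 iterations remain,
// at which point the schedule switches to plain dynamic with chunk size 7.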
558
559// UT - unsigned flavor of T, ST - signed flavor of T,
560// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
561template< typename T >
562static void
563__kmp_dispatch_init(
564 ident_t * loc,
565 int gtid,
566 enum sched_type schedule,
567 T lb,
568 T ub,
569 typename traits_t< T >::signed_t st,
570 typename traits_t< T >::signed_t chunk,
571 int push_ws
572) {
573 typedef typename traits_t< T >::unsigned_t UT;
574 typedef typename traits_t< T >::signed_t ST;
575 typedef typename traits_t< T >::floating_t DBL;
576 static const int ___kmp_size_type = sizeof( UT );
577
578 int active;
579 T tc;
580 kmp_info_t * th;
581 kmp_team_t * team;
582 kmp_uint32 my_buffer_index;
583 dispatch_private_info_template< T > * pr;
584 dispatch_shared_info_template< UT > volatile * sh;
585
586 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
587 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
588
589 if ( ! TCR_4( __kmp_init_parallel ) )
590 __kmp_parallel_initialize();
591
592 #ifdef KMP_DEBUG
593 {
594 const char * buff;
595 // create format specifiers before the debug output
596 buff = __kmp_str_format(
597 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
598 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
599 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
600 __kmp_str_free( &buff );
601 }
602 #endif
603 /* setup data */
604 th = __kmp_threads[ gtid ];
605 team = th -> th.th_team;
606 active = ! team -> t.t_serialized;
607 th->th.th_ident = loc;
608
609 if ( ! active ) {
610 pr = reinterpret_cast< dispatch_private_info_template< T >* >
611 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
612 } else {
613 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
614 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
615
616 my_buffer_index = th->th.th_dispatch->th_disp_index ++;
617
618 /* What happens when the number of threads changes? Do we need to resize the buffer? */
619 pr = reinterpret_cast< dispatch_private_info_template< T > * >
620 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
621 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
622 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
623 }
624
625 /* Pick up the nomerge/ordered bits from the scheduling type */
626 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
627 pr->nomerge = TRUE;
628 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
629 } else {
630 pr->nomerge = FALSE;
631 }
632 pr->type_size = ___kmp_size_type; // remember the size of variables
633 if ( kmp_ord_lower & schedule ) {
634 pr->ordered = TRUE;
635 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
636 } else {
637 pr->ordered = FALSE;
638 }
639 if ( schedule == kmp_sch_static ) {
640 schedule = __kmp_static;
641 } else {
642 if ( schedule == kmp_sch_runtime ) {
643 #if OMP_30_ENABLED
644 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
645 schedule = team -> t.t_sched.r_sched_type;
646 // Detail the schedule if needed (global controls are differentiated appropriately)
647 if ( schedule == kmp_sch_guided_chunked ) {
648 schedule = __kmp_guided;
649 } else if ( schedule == kmp_sch_static ) {
650 schedule = __kmp_static;
651 }
652 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
653 chunk = team -> t.t_sched.chunk;
654 #else
655 kmp_r_sched_t r_sched = __kmp_get_schedule_global();
656 // Use the scheduling specified by OMP_SCHEDULE and/or KMP_SCHEDULE or default
657 schedule = r_sched.r_sched_type;
658 chunk = r_sched.chunk;
659 #endif
660
661 #ifdef KMP_DEBUG
662 {
663 const char * buff;
664 // create format specifiers before the debug output
665 buff = __kmp_str_format(
666 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
667 traits_t< ST >::spec );
668 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
669 __kmp_str_free( &buff );
670 }
671 #endif
672 } else {
673 if ( schedule == kmp_sch_guided_chunked ) {
674 schedule = __kmp_guided;
675 }
676 if ( chunk <= 0 ) {
677 chunk = KMP_DEFAULT_CHUNK;
678 }
679 }
680
681 #if OMP_30_ENABLED
682 if ( schedule == kmp_sch_auto ) {
683 // mapping and differentiation: in the __kmp_do_serial_initialize()
684 schedule = __kmp_auto;
685 #ifdef KMP_DEBUG
686 {
687 const char * buff;
688 // create format specifiers before the debug output
689 buff = __kmp_str_format(
690 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
691 traits_t< ST >::spec );
692 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
693 __kmp_str_free( &buff );
694 }
695 #endif
696 }
697 #endif // OMP_30_ENABLED
698
699 /* guided analytical not safe for too many threads */
700 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
701 schedule = kmp_sch_guided_iterative_chunked;
702 KMP_WARNING( DispatchManyThreads );
703 }
704 pr->u.p.parm1 = chunk;
705 }
706 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
707 "unknown scheduling type" );
708
709 pr->u.p.count = 0;
710
711 if ( __kmp_env_consistency_check ) {
712 if ( st == 0 ) {
713 __kmp_error_construct(
714 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
715 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
716 );
717 }
718 }
719
720 tc = ( ub - lb + st );
721 if ( st != 1 ) {
722 if ( st < 0 ) {
723 if ( lb < ub ) {
724 tc = 0; // zero-trip
725 } else { // lb >= ub
726 tc = (ST)tc / st; // convert to signed division
727 }
728 } else { // st > 0
729 if ( ub < lb ) {
730 tc = 0; // zero-trip
731 } else { // lb >= ub
732 tc /= st;
733 }
734 }
735 } else if ( ub < lb ) { // st == 1
736 tc = 0; // zero-trip
737 }
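// Worked example (illustrative values): lb = 0, ub = 9, st = 3 gives
// tc = (9 - 0 + 3) / 3 = 4 iterations (0, 3, 6, 9); with st = 1 the same bounds
// give tc = 9 - 0 + 1 = 10; if the bounds run against the stride (e.g. st = -1
// with lb < ub) the loop is zero-trip and tc = 0.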
738
739 pr->u.p.lb = lb;
740 pr->u.p.ub = ub;
741 pr->u.p.st = st;
742 pr->u.p.tc = tc;
743
744 #if KMP_OS_WINDOWS
745 pr->u.p.last_upper = ub + st;
746 #endif /* KMP_OS_WINDOWS */
747
748 /* NOTE: only the active parallel region(s) have active ordered sections */
749
750 if ( active ) {
751 if ( pr->ordered == 0 ) {
752 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
753 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
754 } else {
755 pr->ordered_bumped = 0;
756
757 pr->u.p.ordered_lower = 1;
758 pr->u.p.ordered_upper = 0;
759
760 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
761 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
762 }
763 }
764
765 if ( __kmp_env_consistency_check ) {
766 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
767 if ( push_ws ) {
768 __kmp_push_workshare( gtid, ws, loc );
769 pr->pushed_ws = ws;
770 } else {
771 __kmp_check_workshare( gtid, ws, loc );
772 pr->pushed_ws = ct_none;
773 }
774 }
775
776 switch ( schedule ) {
777 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
778 case kmp_sch_static_steal:
779 {
780 T nproc = team->t.t_nproc;
781 T ntc, init;
782
783 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
784
785 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
786 if ( nproc > 1 && ntc >= nproc ) {
787 T id = __kmp_tid_from_gtid(gtid);
788 T small_chunk, extras;
789
790 small_chunk = ntc / nproc;
791 extras = ntc % nproc;
792
793 init = id * small_chunk + ( id < extras ? id : extras );
794 pr->u.p.count = init;
795 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
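// Worked example (illustrative values): ntc = 10 chunks, nproc = 4 gives
// small_chunk = 2, extras = 2, so threads 0..3 initially own the chunk ranges
// [0,3), [3,6), [6,8) and [8,10); 'count' is the next chunk to execute and
// 'ub' is the (exclusive) end of the range that other threads may later steal from.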
796
797 pr->u.p.parm2 = lb;
798 //pr->pfields.parm3 = 0; // it's not used in static_steal
799 pr->u.p.parm4 = id;
800 pr->u.p.st = st;
801 break;
802 } else {
803 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
804 gtid ) );
805 schedule = kmp_sch_static_balanced;
806 /* too few iterations: fall-through to kmp_sch_static_balanced */
807 } // if
808 /* FALL-THROUGH to static balanced */
809 } // case
810 #endif
811 case kmp_sch_static_balanced:
812 {
813 T nproc = team->t.t_nproc;
814 T init, limit;
815
816 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
817 gtid ) );
818
819 if ( nproc > 1 ) {
820 T id = __kmp_tid_from_gtid(gtid);
821
822 if ( tc < nproc ) {
823 if ( id < tc ) {
824 init = id;
825 limit = id;
826 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
827 } else {
828 pr->u.p.count = 1; /* means no more chunks to execute */
829 pr->u.p.parm1 = FALSE;
830 break;
831 }
832 } else {
833 T small_chunk = tc / nproc;
834 T extras = tc % nproc;
835 init = id * small_chunk + (id < extras ? id : extras);
836 limit = init + small_chunk - (id < extras ? 0 : 1);
837 pr->u.p.parm1 = (id == nproc - 1);
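// Worked example (illustrative values): tc = 10, nproc = 4 gives small_chunk = 2,
// extras = 2, so threads 0..3 get the iteration index ranges [0,2], [3,5], [6,7]
// and [8,9]; only thread nproc-1 = 3 reports lastprivate via parm1.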
838 }
839 } else {
840 if ( tc > 0 ) {
841 init = 0;
842 limit = tc - 1;
843 pr->u.p.parm1 = TRUE;
844 } else {
845 // zero trip count
846 pr->u.p.count = 1; /* means no more chunks to execute */
847 pr->u.p.parm1 = FALSE;
848 break;
849 }
850 }
851 if ( st == 1 ) {
852 pr->u.p.lb = lb + init;
853 pr->u.p.ub = lb + limit;
854 } else {
855 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
856 pr->u.p.lb = lb + init * st;
857 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
858 if ( st > 0 ) {
859 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
860 } else {
861 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
862 }
863 }
864 if ( pr->ordered ) {
865 pr->u.p.ordered_lower = init;
866 pr->u.p.ordered_upper = limit;
867 }
868 break;
869 } // case
870 case kmp_sch_guided_iterative_chunked :
871 {
872 T nproc = team->t.t_nproc;
873 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
874
875 if ( nproc > 1 ) {
876 if ( (2L * chunk + 1 ) * nproc >= tc ) {
877 /* chunk size too large, switch to dynamic */
878 schedule = kmp_sch_dynamic_chunked;
879 } else {
880 // when remaining iters become less than parm2 - switch to dynamic
881 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
882 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
883 }
884 } else {
885 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
886 schedule = kmp_sch_static_greedy;
887 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
888 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
889 pr->u.p.parm1 = tc;
890 } // if
891 } // case
892 break;
893 case kmp_sch_guided_analytical_chunked:
894 {
895 T nproc = team->t.t_nproc;
896 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
897
898 if ( nproc > 1 ) {
899 if ( (2L * chunk + 1 ) * nproc >= tc ) {
900 /* chunk size too large, switch to dynamic */
901 schedule = kmp_sch_dynamic_chunked;
902 } else {
903 /* commonly used term: (2 nproc - 1)/(2 nproc) */
904 DBL x;
905
906 #if KMP_OS_WINDOWS && KMP_ARCH_X86
907 /* Linux* OS already has 64-bit computation by default for
908 long double, and on Windows* OS on Intel(R) 64,
909 /Qlong_double doesn't work. On Windows* OS
910 on IA-32 architecture, we need to set precision to
911 64-bit instead of the default 53-bit. Even though long
912 double doesn't work on Windows* OS on Intel(R) 64, the
913 resulting lack of precision is not expected to impact
914 the correctness of the algorithm, but this has not been
915 mathematically proven.
916 */
917 // save original FPCW and set precision to 64-bit, as
918 // Windows* OS on IA-32 architecture defaults to 53-bit
919 unsigned int oldFpcw = _control87(0,0x30000);
920 #endif
921 /* value used for comparison in solver for cross-over point */
922 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
923
924 /* crossover point--chunk indexes equal to or greater than
925 this point switch to dynamic-style scheduling */
926 UT cross;
927
928 /* commonly used term: (2 nproc - 1)/(2 nproc) */
929 x = (long double)1.0 - (long double)0.5 / nproc;
930
931 #ifdef KMP_DEBUG
932 { // test natural alignment
933 struct _test_a {
934 char a;
935 union {
936 char b;
937 DBL d;
938 };
939 } t;
940 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
941 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
942 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
943 }
944 #endif // KMP_DEBUG
945
946 /* save the term in thread private dispatch structure */
947 *(DBL*)&pr->u.p.parm3 = x;
948
949 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
950 {
951 UT left, right, mid;
952 long double p;
953
954 /* estimate initial upper and lower bound */
955
956 /* doesn't matter what value right is as long as it is positive, but
957 it affects performance of the solver
958 */
959 right = 229;
960 p = __kmp_pow< UT >(x,right);
961 if ( p > target ) {
962 do{
963 p *= p;
964 right <<= 1;
965 } while(p>target && right < (1<<27));
966 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
967 } else {
968 left = 0;
969 }
970
971 /* bisection root-finding method */
972 while ( left + 1 < right ) {
973 mid = (left + right) / 2;
974 if ( __kmp_pow< UT >(x,mid) > target ) {
975 left = mid;
976 } else {
977 right = mid;
978 }
979 } // while
980 cross = right;
981 }
982 /* assert sanity of computed crossover point */
983 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
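// Worked example (illustrative values): nproc = 4, chunk = 1, tc = 1000 give
// x = 1 - 0.5/4 = 0.875 and target = (2*1 + 1) * 4 / 1000 = 0.012; since
// 0.875^33 ~= 0.0122 > 0.012 >= 0.875^34 ~= 0.0107, the bisection yields cross = 34,
// so chunk indexes >= 34 are handed out dynamic-style.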
984
985 /* save the crossover point in thread private dispatch structure */
986 pr->u.p.parm2 = cross;
987
988 // C75803
989 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
990 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
991 #else
992 #define GUIDED_ANALYTICAL_WORKAROUND (x)
993 #endif
994 /* dynamic-style scheduling offset */
995 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
996 #if KMP_OS_WINDOWS && KMP_ARCH_X86
997 // restore FPCW
998 _control87(oldFpcw,0x30000);
999 #endif
1000 } // if
1001 } else {
1002 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1003 gtid ) );
1004 schedule = kmp_sch_static_greedy;
1005 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1006 pr->u.p.parm1 = tc;
1007 } // if
1008 } // case
1009 break;
1010 case kmp_sch_static_greedy:
1011 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1012 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1013 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1014 tc;
1015 break;
1016 case kmp_sch_static_chunked :
1017 case kmp_sch_dynamic_chunked :
1018 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1019 break;
1020 case kmp_sch_trapezoidal :
1021 {
1022 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1023
1024 T parm1, parm2, parm3, parm4;
1025 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1026
1027 parm1 = chunk;
1028
1029 /* F : size of the first cycle */
1030 parm2 = ( tc / (2 * team->t.t_nproc) );
1031
1032 if ( parm2 < 1 ) {
1033 parm2 = 1;
1034 }
1035
1036 /* L : size of the last cycle. Make sure the last cycle
1037 * is not larger than the first cycle.
1038 */
1039 if ( parm1 < 1 ) {
1040 parm1 = 1;
1041 } else if ( parm1 > parm2 ) {
1042 parm1 = parm2;
1043 }
1044
1045 /* N : number of cycles */
1046 parm3 = ( parm2 + parm1 );
1047 parm3 = ( 2 * tc + parm3 - 1) / parm3;
1048
1049 if ( parm3 < 2 ) {
1050 parm3 = 2;
1051 }
1052
1053 /* sigma : decreasing incr of the trapezoid */
1054 parm4 = ( parm3 - 1 );
1055 parm4 = ( parm2 - parm1 ) / parm4;
1056
1057 // pointless check, because parm4 >= 0 always
1058 //if ( parm4 < 0 ) {
1059 // parm4 = 0;
1060 //}
1061
1062 pr->u.p.parm1 = parm1;
1063 pr->u.p.parm2 = parm2;
1064 pr->u.p.parm3 = parm3;
1065 pr->u.p.parm4 = parm4;
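// Worked example (illustrative values): tc = 1000, nproc = 4, chunk = 50 give
// parm2 = 1000 / 8 = 125 (first chunk), parm1 = 50 (minimum/last chunk),
// parm3 = (2*1000 + 174) / 175 = 12 chunks in total, and parm4 = (125 - 50) / 11 = 6,
// so successive chunks shrink roughly as 125, 119, 113, ... down toward 50.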
1066 } // case
1067 break;
1068
1069 default:
1070 {
1071 __kmp_msg(
1072 kmp_ms_fatal, // Severity
1073 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1074 KMP_HNT( GetNewerLibrary ), // Hint
1075 __kmp_msg_null // Variadic argument list terminator
1076 );
1077 }
1078 break;
1079 } // switch
1080 pr->schedule = schedule;
1081 if ( active ) {
1082 /* This buffer is free for us to use once sh->buffer_index reaches my_buffer_index */
1083
1084 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1085 gtid, my_buffer_index, sh->buffer_index) );
1086 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1087 USE_ITT_BUILD_ARG( NULL )
1088 );
1089 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1090 // *always* 32-bit integers.
1091 KMP_MB(); /* is this necessary? */
1092 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1093 gtid, my_buffer_index, sh->buffer_index) );
1094
1095 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1096 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1097#if USE_ITT_BUILD
1098 if ( pr->ordered ) {
1099 __kmp_itt_ordered_init( gtid );
1100 }; // if
1101#endif /* USE_ITT_BUILD */
1102 }; // if
1103 #ifdef KMP_DEBUG
1104 {
1105 const char * buff;
1106 // create format specifiers before the debug output
1107 buff = __kmp_str_format(
1108 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1109 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1110 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1111 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1112 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1113 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1114 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1115 KD_TRACE(10, ( buff,
1116 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1117 pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1118 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1119 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1120 __kmp_str_free( &buff );
1121 }
1122 #endif
1123 #if ( KMP_STATIC_STEAL_ENABLED )
1124 if ( ___kmp_size_type < 8 ) {
1125 // It cannot be guaranteed that after execution of a loop with some other schedule kind
1126 // all the parm3 variables will contain the same value.
1127 // Even if all parm3 values were the same, there would still be a bad case, e.g. using 0 and 1
1128 // rather than a program life-time increment.
1129 // So a dedicated variable is required; 'static_steal_counter' is used.
1130 if( schedule == kmp_sch_static_steal ) {
1131 // Other threads will inspect this variable when searching for a victim.
1132 // This is a flag showing that other threads may steal from this thread since then.
1133 volatile T * p = &pr->u.p.static_steal_counter;
1134 *p = *p + 1;
1135 }
1136 }
1137 #endif // ( KMP_STATIC_STEAL_ENABLED )
1138}
1139
1140/*
1141 * For ordered loops, either __kmp_dispatch_finish() should be called after
1142 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1143 * every chunk of iterations. If the ordered section(s) were not executed
1144 * for this iteration (or every iteration in this chunk), we need to set the
1145 * ordered iteration counters so that the next thread can proceed.
1146 */
1147template< typename UT >
1148static void
1149__kmp_dispatch_finish( int gtid, ident_t *loc )
1150{
1151 typedef typename traits_t< UT >::signed_t ST;
1152 kmp_info_t *th = __kmp_threads[ gtid ];
1153
1154 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1155 if ( ! th -> th.th_team -> t.t_serialized ) {
1156
1157 dispatch_private_info_template< UT > * pr =
1158 reinterpret_cast< dispatch_private_info_template< UT >* >
1159 ( th->th.th_dispatch->th_dispatch_pr_current );
1160 dispatch_shared_info_template< UT > volatile * sh =
1161 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1162 ( th->th.th_dispatch->th_dispatch_sh_current );
1163 KMP_DEBUG_ASSERT( pr );
1164 KMP_DEBUG_ASSERT( sh );
1165 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1166 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1167
1168 if ( pr->ordered_bumped ) {
1169 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1170 gtid ) );
1171 pr->ordered_bumped = 0;
1172 } else {
1173 UT lower = pr->u.p.ordered_lower;
1174
1175 #ifdef KMP_DEBUG
1176 {
1177 const char * buff;
1178 // create format specifiers before the debug output
1179 buff = __kmp_str_format(
1180 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1181 traits_t< UT >::spec, traits_t< UT >::spec );
1182 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1183 __kmp_str_free( &buff );
1184 }
1185 #endif
1186
1187 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1188 USE_ITT_BUILD_ARG(NULL)
1189 );
1190 KMP_MB(); /* is this necessary? */
1191 #ifdef KMP_DEBUG
1192 {
1193 const char * buff;
1194 // create format specifiers before the debug output
1195 buff = __kmp_str_format(
1196 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1197 traits_t< UT >::spec, traits_t< UT >::spec );
1198 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1199 __kmp_str_free( &buff );
1200 }
1201 #endif
1202
1203 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1204 } // if
1205 } // if
1206 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1207}
1208
1209#ifdef KMP_GOMP_COMPAT
1210
1211template< typename UT >
1212static void
1213__kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1214{
1215 typedef typename traits_t< UT >::signed_t ST;
1216 kmp_info_t *th = __kmp_threads[ gtid ];
1217
1218 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1219 if ( ! th -> th.th_team -> t.t_serialized ) {
1220// int cid;
1221 dispatch_private_info_template< UT > * pr =
1222 reinterpret_cast< dispatch_private_info_template< UT >* >
1223 ( th->th.th_dispatch->th_dispatch_pr_current );
1224 dispatch_shared_info_template< UT > volatile * sh =
1225 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1226 ( th->th.th_dispatch->th_dispatch_sh_current );
1227 KMP_DEBUG_ASSERT( pr );
1228 KMP_DEBUG_ASSERT( sh );
1229 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1230 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1231
1232// for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1233 UT lower = pr->u.p.ordered_lower;
1234 UT upper = pr->u.p.ordered_upper;
1235 UT inc = upper - lower + 1;
1236
1237 if ( pr->ordered_bumped == inc ) {
1238 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1239 gtid ) );
1240 pr->ordered_bumped = 0;
1241 } else {
1242 inc -= pr->ordered_bumped;
1243
1244 #ifdef KMP_DEBUG
1245 {
1246 const char * buff;
1247 // create format specifiers before the debug output
1248 buff = __kmp_str_format(
1249 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1250 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1251 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1252 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1253 __kmp_str_free( &buff );
1254 }
1255 #endif
1256
1257 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1258 USE_ITT_BUILD_ARG(NULL)
1259 );
1260
1261 KMP_MB(); /* is this necessary? */
1262 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1263 gtid ) );
1264 pr->ordered_bumped = 0;
1265//!!!!! TODO check if the inc should be unsigned, or signed???
1266 #ifdef KMP_DEBUG
1267 {
1268 const char * buff;
1269 // create format specifiers before the debug output
1270 buff = __kmp_str_format(
1271 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1272 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1273 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1274 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1275 __kmp_str_free( &buff );
1276 }
1277 #endif
1278
1279 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1280 }
1281// }
1282 }
1283 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1284}
1285
1286#endif /* KMP_GOMP_COMPAT */
1287
1288template< typename T >
1289static int
1290__kmp_dispatch_next(
1291 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1292) {
1293
1294 typedef typename traits_t< T >::unsigned_t UT;
1295 typedef typename traits_t< T >::signed_t ST;
1296 typedef typename traits_t< T >::floating_t DBL;
1297 static const int ___kmp_size_type = sizeof( UT );
1298
1299 int status;
1300 dispatch_private_info_template< T > * pr;
1301 kmp_info_t * th = __kmp_threads[ gtid ];
1302 kmp_team_t * team = th -> th.th_team;
1303
1304 #ifdef KMP_DEBUG
1305 {
1306 const char * buff;
1307 // create format specifiers before the debug output
1308 buff = __kmp_str_format(
1309 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1310 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1311 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1312 __kmp_str_free( &buff );
1313 }
1314 #endif
1315
1316 if ( team -> t.t_serialized ) {
1317 /* NOTE: serialize this dispatch because we are not at the active level */
1318 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1319 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1320 KMP_DEBUG_ASSERT( pr );
1321
1322 if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1323 *p_lb = 0;
1324 *p_ub = 0;
1325 if ( p_st != 0 ) {
1326 *p_st = 0;
1327 }
1328 if ( __kmp_env_consistency_check ) {
1329 if ( pr->pushed_ws != ct_none ) {
1330 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1331 }
1332 }
1333 } else if ( pr->nomerge ) {
1334 kmp_int32 last;
1335 T start;
1336 UT limit, trip, init;
1337 ST incr;
1338 T chunk = pr->u.p.parm1;
1339
1340 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1341
1342 init = chunk * pr->u.p.count++;
1343 trip = pr->u.p.tc - 1;
1344
1345 if ( (status = (init <= trip)) == 0 ) {
1346 *p_lb = 0;
1347 *p_ub = 0;
1348 if ( p_st != 0 ) *p_st = 0;
1349 if ( __kmp_env_consistency_check ) {
1350 if ( pr->pushed_ws != ct_none ) {
1351 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1352 }
1353 }
1354 } else {
1355 start = pr->u.p.lb;
1356 limit = chunk + init - 1;
1357 incr = pr->u.p.st;
1358
1359 if ( (last = (limit >= trip)) != 0 ) {
1360 limit = trip;
1361 #if KMP_OS_WINDOWS
1362 pr->u.p.last_upper = pr->u.p.ub;
1363 #endif /* KMP_OS_WINDOWS */
1364 }
1365 if ( p_last ) {
1366 *p_last = last;
1367 }
1368 if ( p_st != 0 ) {
1369 *p_st = incr;
1370 }
1371 if ( incr == 1 ) {
1372 *p_lb = start + init;
1373 *p_ub = start + limit;
1374 } else {
1375 *p_lb = start + init * incr;
1376 *p_ub = start + limit * incr;
1377 }
1378
1379 if ( pr->ordered ) {
1380 pr->u.p.ordered_lower = init;
1381 pr->u.p.ordered_upper = limit;
1382 #ifdef KMP_DEBUG
1383 {
1384 const char * buff;
1385 // create format specifiers before the debug output
1386 buff = __kmp_str_format(
1387 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1388 traits_t< UT >::spec, traits_t< UT >::spec );
1389 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1390 __kmp_str_free( &buff );
1391 }
1392 #endif
1393 } // if
1394 } // if
1395 } else {
1396 pr->u.p.tc = 0;
1397
1398 *p_lb = pr->u.p.lb;
1399 *p_ub = pr->u.p.ub;
1400 #if KMP_OS_WINDOWS
1401 pr->u.p.last_upper = *p_ub;
1402 #endif /* KMP_OS_WINDOWS */
1403
1404 if ( p_st != 0 ) {
1405 *p_st = pr->u.p.st;
1406 }
1407 if ( p_last ) {
1408 *p_last = TRUE;
1409 }
1410 } // if
1411 #ifdef KMP_DEBUG
1412 {
1413 const char * buff;
1414 // create format specifiers before the debug output
1415 buff = __kmp_str_format(
1416 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1417 "p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
1418 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1419 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, status) );
1420 __kmp_str_free( &buff );
1421 }
1422 #endif
1423 return status;
1424 } else {
1425 kmp_int32 last = 0;
1426 dispatch_shared_info_template< UT > *sh;
1427 T start;
1428 ST incr;
1429 UT limit, trip, init;
1430
1431 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1432 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1433
1434 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1435 ( th->th.th_dispatch->th_dispatch_pr_current );
1436 KMP_DEBUG_ASSERT( pr );
1437 sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1438 ( th->th.th_dispatch->th_dispatch_sh_current );
1439 KMP_DEBUG_ASSERT( sh );
1440
1441 if ( pr->u.p.tc == 0 ) {
1442 // zero trip count
1443 status = 0;
1444 } else {
1445 switch (pr->schedule) {
1446 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1447 case kmp_sch_static_steal:
1448 {
1449 T chunk = pr->u.p.parm1;
1450
1451 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1452
1453 trip = pr->u.p.tc - 1;
1454
1455 if ( ___kmp_size_type > 4 ) {
1456 // Other threads do not look into the data of this thread,
1457 // so a volatile cast is not necessary.
1458 init = ( pr->u.p.count )++;
1459 status = ( init < (UT)pr->u.p.ub );
1460 } else {
1461 typedef union {
1462 struct {
1463 UT count;
1464 T ub;
1465 } p;
1466 kmp_int64 b;
1467 } union_i4;
1468 // All operations on 'count' or 'ub' must be combined atomically together.
1469 // stealing implemented only for 4-byte indexes
1470 {
1471 union_i4 vold, vnew;
1472 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1473 vnew = vold;
1474 vnew.p.count++;
1475 while( ! KMP_COMPARE_AND_STORE_ACQ64(
1476 ( volatile kmp_int64* )&pr->u.p.count,
1477 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1478 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1479 KMP_CPU_PAUSE();
1480 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1481 vnew = vold;
1482 vnew.p.count++;
1483 }
1484 vnew = vold;
1485 init = vnew.p.count;
1486 status = ( init < (UT)vnew.p.ub ) ;
1487 }
1488
1489 if( !status ) {
1490 kmp_info_t **other_threads = team->t.t_threads;
1491 int while_limit = 10;
1492 int while_index = 0;
1493
1494 // TODO: algorithm of searching for a victim
1495 // should be cleaned up and measured
1496 while ( ( !status ) && ( while_limit != ++while_index ) ) {
1497 union_i4 vold, vnew;
1498 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1499 T victimIdx = pr->u.p.parm4;
1500 T oldVictimIdx = victimIdx;
1501 dispatch_private_info_template< T > * victim;
1502
1503 do {
1504 if( !victimIdx ) {
1505 victimIdx = team->t.t_nproc - 1;
1506 } else {
1507 --victimIdx;
1508 }
1509 victim = reinterpret_cast< dispatch_private_info_template< T >* >
1510 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1511 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1512 // TODO: think about a proper place of this test
1513 if ( ( !victim ) ||
1514 ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1515 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1516 // TODO: delay would be nice
1517 continue;
1518 // the victim is not ready yet to participate in stealing
1519 // because the victim is still in kmp_init_dispatch
1520 }
1521 if ( oldVictimIdx == victimIdx ) {
1522 break;
1523 }
1524 pr->u.p.parm4 = victimIdx;
1525
1526 while( 1 ) {
1527 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1528 vnew = vold;
1529
1530 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1531 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1532 break;
1533 }
1534 vnew.p.ub -= (remaining >> 2);
1535 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
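// Worked example (illustrative values): if the victim's descriptor holds
// count = 10 and ub = 30, then remaining = 20 and the thief tries to move ub
// down to 30 - (20 >> 2) = 25; on a successful CAS the victim keeps chunks
// 10..24 and the thief takes over chunks 25..29.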
1536 #pragma warning( push )
1537 // disable warning on pointless comparison of unsigned with 0
1538 #pragma warning( disable: 186 )
1539 KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1540 #pragma warning( pop )
1541 // TODO: Should this be acquire or release?
1542 if ( KMP_COMPARE_AND_STORE_ACQ64(
1543 ( volatile kmp_int64 * )&victim->u.p.count,
1544 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1545 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1546 status = 1;
1547 while_index = 0;
1548 // now update own count and ub
1549 #if KMP_ARCH_X86
1550 // stealing executed on non-KMP_ARCH_X86 only
1551 // Atomic 64-bit write on ia32 is
1552 // unavailable, so we do this in steps.
1553 // This code is not tested.
1554 init = vold.p.count;
1555 pr->u.p.ub = 0;
1556 pr->u.p.count = init + 1;
1557 pr->u.p.ub = vnew.p.count;
1558 #else
1559 init = vnew.p.ub;
1560 vold.p.count = init + 1;
1561 // TODO: is it safe and enough?
1562 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1563 #endif // KMP_ARCH_X86
1564 break;
1565 } // if
1566 KMP_CPU_PAUSE();
1567 } // while (1)
1568 } // while
1569 } // if
1570 } // if
1571 if ( !status ) {
1572 *p_lb = 0;
1573 *p_ub = 0;
1574 if ( p_st != 0 ) *p_st = 0;
1575 } else {
1576 start = pr->u.p.parm2;
1577 init *= chunk;
1578 limit = chunk + init - 1;
1579 incr = pr->u.p.st;
1580
1581 KMP_DEBUG_ASSERT(init <= trip);
1582 if ( (last = (limit >= trip)) != 0 )
1583 limit = trip;
1584 if ( p_last ) {
1585 *p_last = last;
1586 }
1587 if ( p_st != 0 ) *p_st = incr;
1588
1589 if ( incr == 1 ) {
1590 *p_lb = start + init;
1591 *p_ub = start + limit;
1592 } else {
1593 *p_lb = start + init * incr;
1594 *p_ub = start + limit * incr;
1595 }
1596
1597 if ( pr->ordered ) {
1598 pr->u.p.ordered_lower = init;
1599 pr->u.p.ordered_upper = limit;
1600 #ifdef KMP_DEBUG
1601 {
1602 const char * buff;
1603 // create format specifiers before the debug output
1604 buff = __kmp_str_format(
1605 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1606 traits_t< UT >::spec, traits_t< UT >::spec );
1607 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1608 __kmp_str_free( &buff );
1609 }
1610 #endif
1611 } // if
1612 } // if
1613 break;
1614 } // case
1615 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1616 case kmp_sch_static_balanced:
1617 {
1618 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1619 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1620 pr->u.p.count = 1;
1621 *p_lb = pr->u.p.lb;
1622 *p_ub = pr->u.p.ub;
1623 last = pr->u.p.parm1;
1624 if ( p_last ) {
1625 *p_last = last;
1626 }
1627 if ( p_st )
1628 *p_st = pr->u.p.st;
1629 } else { /* no iterations to do */
1630 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1631 }
1632 if ( pr->ordered ) {
1633 #ifdef KMP_DEBUG
1634 {
1635 const char * buff;
1636 // create format specifiers before the debug output
1637 buff = __kmp_str_format(
1638 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1639 traits_t< UT >::spec, traits_t< UT >::spec );
1640 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1641 __kmp_str_free( &buff );
1642 }
1643 #endif
1644 } // if
1645 } // case
1646 break;
1647 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1648 case kmp_sch_static_chunked:
1649 {
1650 T parm1;
1651
1652 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1653 gtid ) );
1654 parm1 = pr->u.p.parm1;
1655
1656 trip = pr->u.p.tc - 1;
1657 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1658
1659 if ( (status = (init <= trip)) != 0 ) {
1660 start = pr->u.p.lb;
1661 incr = pr->u.p.st;
1662 limit = parm1 + init - 1;
1663
1664 if ( (last = (limit >= trip)) != 0 )
1665 limit = trip;
1666
1667 if ( p_last ) {
1668 *p_last = last;
1669 }
1670 if ( p_st != 0 ) *p_st = incr;
1671
1672 pr->u.p.count += team->t.t_nproc;
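// Worked example (illustrative values, assuming lb = 0 and st = 1): chunk parm1 = 5,
// nproc = 4, tid = 1: the first call computes init = 5 * (0 + 1) = 5 (iterations 5..9),
// the second init = 5 * (4 + 1) = 25 (iterations 25..29), i.e. thread t executes chunk
// numbers t, t + nproc, t + 2*nproc, ... in a round-robin fashion.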
1673
1674 if ( incr == 1 ) {
1675 *p_lb = start + init;
1676 *p_ub = start + limit;
1677 }
1678 else {
1679 *p_lb = start + init * incr;
1680 *p_ub = start + limit * incr;
1681 }
1682
1683 if ( pr->ordered ) {
1684 pr->u.p.ordered_lower = init;
1685 pr->u.p.ordered_upper = limit;
1686 #ifdef KMP_DEBUG
1687 {
1688 const char * buff;
1689 // create format specifiers before the debug output
1690 buff = __kmp_str_format(
1691 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1692 traits_t< UT >::spec, traits_t< UT >::spec );
1693 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1694 __kmp_str_free( &buff );
1695 }
1696 #endif
1697 } // if
1698 } // if
1699 } // case
1700 break;
1701
1702 case kmp_sch_dynamic_chunked:
1703 {
1704 T chunk = pr->u.p.parm1;
1705
1706 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1707 gtid ) );
1708
1709 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1710 trip = pr->u.p.tc - 1;
1711
1712 if ( (status = (init <= trip)) == 0 ) {
1713 *p_lb = 0;
1714 *p_ub = 0;
1715 if ( p_st != 0 ) *p_st = 0;
1716 } else {
1717 start = pr->u.p.lb;
1718 limit = chunk + init - 1;
1719 incr = pr->u.p.st;
1720
1721 if ( (last = (limit >= trip)) != 0 )
1722 limit = trip;
1723 if ( p_last ) {
1724 *p_last = last;
1725 }
1726 if ( p_st != 0 ) *p_st = incr;
1727
1728 if ( incr == 1 ) {
1729 *p_lb = start + init;
1730 *p_ub = start + limit;
1731 } else {
1732 *p_lb = start + init * incr;
1733 *p_ub = start + limit * incr;
1734 }
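// Worked example (illustrative values): chunk = 4, tc = 10, lb = 0, st = 1:
// successive winners of the shared counter get init = 0, 4, 8, ... so the chunks
// handed out are [0,3], [4,7] and [8,9] (the last one clipped to trip = 9);
// once init exceeds trip the caller gets status = 0 and the loop is finished.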
1735
1736 if ( pr->ordered ) {
1737 pr->u.p.ordered_lower = init;
1738 pr->u.p.ordered_upper = limit;
1739 #ifdef KMP_DEBUG
1740 {
1741 const char * buff;
1742 // create format specifiers before the debug output
1743 buff = __kmp_str_format(
1744 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1745 traits_t< UT >::spec, traits_t< UT >::spec );
1746 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1747 __kmp_str_free( &buff );
1748 }
1749 #endif
1750 } // if
1751 } // if
1752 } // case
1753 break;
1754
1755 case kmp_sch_guided_iterative_chunked:
1756 {
1757 T chunkspec = pr->u.p.parm1;
1758 KD_TRACE(100,
1759 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1760 trip = pr->u.p.tc;
1761 // Start atomic part of calculations
1762 while(1) {
1763 ST remaining; // signed, because can be < 0
1764 init = sh->u.s.iteration; // shared value
1765 remaining = trip - init;
1766 if ( remaining <= 0 ) { // AC: need to compare with 0 first
1767 // nothing to do, don't try atomic op
1768 status = 0;
1769 break;
1770 }
1771 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1772 // use dynamic-style schedule
1773 // atomically increment iterations, get old value
1774 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1775 remaining = trip - init;
1776 if (remaining <= 0) {
1777 status = 0; // all iterations got by other threads
1778 } else {
1779 // got some iterations to work on
1780 status = 1;
1781 if ( (T)remaining > chunkspec ) {
1782 limit = init + chunkspec - 1;
1783 } else {
1784 last = 1; // the last chunk
1785 limit = init + remaining - 1;
1786 } // if
1787 } // if
1788 break;
1789 } // if
1790 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1791 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1792 // CAS was successful, chunk obtained
1793 status = 1;
1794 --limit;
1795 break;
1796 } // if
1797 } // while
1798 if ( status != 0 ) {
1799 start = pr->u.p.lb;
1800 incr = pr->u.p.st;
1801 if ( p_st != NULL )
1802 *p_st = incr;
1803 if ( p_last != NULL )
1804 *p_last = last;
1805 *p_lb = start + init * incr;
1806 *p_ub = start + limit * incr;
1807 if ( pr->ordered ) {
1808 pr->u.p.ordered_lower = init;
1809 pr->u.p.ordered_upper = limit;
1810 #ifdef KMP_DEBUG
1811 {
1812 const char * buff;
1813 // create format specifiers before the debug output
1814 buff = __kmp_str_format(
1815 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1816 traits_t< UT >::spec, traits_t< UT >::spec );
1817 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1818 __kmp_str_free( &buff );
1819 }
1820 #endif
1821 } // if
1822 } else {
1823 *p_lb = 0;
1824 *p_ub = 0;
1825 if ( p_st != NULL )
1826 *p_st = 0;
1827 } // if
1828 } // case
1829 break;
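        /* Illustration (not compiled): the iterative guided loop above shrinks chunks
           geometrically.  parm3 caches a factor of roughly 1/(K*nproc) (K = 2 by
           default, per the comments above), so a successful CAS claims about
               remaining / (K*nproc)
           iterations.  With hypothetical trip = 800 and nproc = 4 the first grab spans
           about 100 iterations, the next about 87, and so on, until the remainder falls
           below parm2 and the code switches to plain chunks of chunkspec iterations. */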
1830
1831 case kmp_sch_guided_analytical_chunked:
1832 {
1833 T chunkspec = pr->u.p.parm1;
1834 UT chunkIdx;
1835 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1836 /* for storing original FPCW value for Windows* OS on
1837 IA-32 architecture 8-byte version */
1838 unsigned int oldFpcw;
1839 int fpcwSet = 0;
1840 #endif
1841 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1842 gtid ) );
1843
1844 trip = pr->u.p.tc;
1845
1846 KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1847 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1848
1849 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1850 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1851 if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1852 --trip;
1853 /* use dynamic-style scheduling */
1854 init = chunkIdx * chunkspec + pr->u.p.count;
1855 /* need to verify init > 0 in case of overflow in the above calculation */
1856 if ( (status = (init > 0 && init <= trip)) != 0 ) {
1857 limit = init + chunkspec -1;
1858
1859 if ( (last = (limit >= trip)) != 0 )
1860 limit = trip;
1861 }
1862 break;
1863 } else {
1864 /* use exponential-style scheduling */
1865                     /* The following check works around the lack of long double precision on Windows* OS,
1866                        which can otherwise leave init != 0 for chunkIdx == 0.
1867 */
1868 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1869 /* If we haven't already done so, save original
1870 FPCW and set precision to 64-bit, as Windows* OS
1871 on IA-32 architecture defaults to 53-bit */
1872 if ( !fpcwSet ) {
1873 oldFpcw = _control87(0,0x30000);
1874 fpcwSet = 0x30000;
1875 }
1876 #endif
1877 if ( chunkIdx ) {
1878 init = __kmp_dispatch_guided_remaining< T >(
1879 trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1880 KMP_DEBUG_ASSERT(init);
1881 init = trip - init;
1882 } else
1883 init = 0;
1884 limit = trip - __kmp_dispatch_guided_remaining< T >(
1885 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1886 KMP_ASSERT(init <= limit);
1887 if ( init < limit ) {
1888 KMP_DEBUG_ASSERT(limit <= trip);
1889 --limit;
1890 status = 1;
1891 break;
1892 } // if
1893 } // if
1894 } // while (1)
1895 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1896                 /* restore FPCW if necessary (only when it was actually saved above) */
1897                 if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1898                     _control87(oldFpcw,0x30000);
1899 #endif
1900 if ( status != 0 ) {
1901 start = pr->u.p.lb;
1902 incr = pr->u.p.st;
1903 if ( p_st != NULL )
1904 *p_st = incr;
1905 if ( p_last != NULL )
1906 *p_last = last;
1907 *p_lb = start + init * incr;
1908 *p_ub = start + limit * incr;
1909 if ( pr->ordered ) {
1910 pr->u.p.ordered_lower = init;
1911 pr->u.p.ordered_upper = limit;
1912 #ifdef KMP_DEBUG
1913 {
1914 const char * buff;
1915 // create format specifiers before the debug output
1916 buff = __kmp_str_format(
1917 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1918 traits_t< UT >::spec, traits_t< UT >::spec );
1919 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1920 __kmp_str_free( &buff );
1921 }
1922 #endif
1923 }
1924 } else {
1925 *p_lb = 0;
1926 *p_ub = 0;
1927 if ( p_st != NULL )
1928 *p_st = 0;
1929 }
1930 } // case
1931 break;
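        /* Illustration (not compiled): in the analytical variant each value of the
           shared counter chunkIdx names a precomputed slice of the iteration space,
               init  = trip - __kmp_dispatch_guided_remaining( trip, parm3, chunkIdx )
               limit = trip - __kmp_dispatch_guided_remaining( trip, parm3, chunkIdx+1 ) - 1
           so no CAS retry loop is needed; once chunkIdx reaches parm2 the remaining
           tail of the loop is handed out dynamically in chunks of chunkspec iterations. */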
1932
1933 case kmp_sch_trapezoidal:
1934 {
1935 UT index;
1936 T parm2 = pr->u.p.parm2;
1937 T parm3 = pr->u.p.parm3;
1938 T parm4 = pr->u.p.parm4;
1939 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
1940 gtid ) );
1941
1942 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
1943
1944 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
1945 trip = pr->u.p.tc - 1;
1946
1947 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
1948 *p_lb = 0;
1949 *p_ub = 0;
1950 if ( p_st != 0 ) *p_st = 0;
1951 } else {
1952 start = pr->u.p.lb;
1953 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
1954 incr = pr->u.p.st;
1955
1956 if ( (last = (limit >= trip)) != 0 )
1957 limit = trip;
1958
1959 if ( p_last != 0 ) {
1960 *p_last = last;
1961 }
1962 if ( p_st != 0 ) *p_st = incr;
1963
1964 if ( incr == 1 ) {
1965 *p_lb = start + init;
1966 *p_ub = start + limit;
1967 } else {
1968 *p_lb = start + init * incr;
1969 *p_ub = start + limit * incr;
1970 }
1971
1972 if ( pr->ordered ) {
1973 pr->u.p.ordered_lower = init;
1974 pr->u.p.ordered_upper = limit;
1975 #ifdef KMP_DEBUG
1976 {
1977 const char * buff;
1978 // create format specifiers before the debug output
1979 buff = __kmp_str_format(
1980 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1981 traits_t< UT >::spec, traits_t< UT >::spec );
1982 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1983 __kmp_str_free( &buff );
1984 }
1985 #endif
1986 } // if
1987 } // if
1988 } // case
1989 break;
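        /* Illustration (not compiled): the trapezoidal formula above yields linearly
           shrinking chunks.  Reading the parameters as parm2 = first chunk size,
           parm4 = per-chunk decrement and parm3 = number of chunks (an interpretation
           of the arithmetic, not a documented contract), chunk index k starts at
               init(k) = ( k * ( 2*parm2 - (k-1)*parm4 ) ) / 2
           e.g. parm2 = 10, parm4 = 2 gives the ranges [0,9], [10,17], [18,23], ...
           of sizes 10, 8, 6, ... until k reaches parm3 or init exceeds tc-1. */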
1990 } // switch
1991 } // if tc == 0;
1992
1993 if ( status == 0 ) {
1994 UT num_done;
1995
1996 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
1997 #ifdef KMP_DEBUG
1998 {
1999 const char * buff;
2000 // create format specifiers before the debug output
2001 buff = __kmp_str_format(
2002 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2003 traits_t< UT >::spec );
2004 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2005 __kmp_str_free( &buff );
2006 }
2007 #endif
2008
2009 if ( num_done == team->t.t_nproc-1 ) {
2010 /* NOTE: release this buffer to be reused */
2011
2012 KMP_MB(); /* Flush all pending memory write invalidates. */
2013
2014 sh->u.s.num_done = 0;
2015 sh->u.s.iteration = 0;
2016
2017 /* TODO replace with general release procedure? */
2018 if ( pr->ordered ) {
2019 sh->u.s.ordered_iteration = 0;
2020 }
2021
2022 KMP_MB(); /* Flush all pending memory write invalidates. */
2023
2024 sh -> buffer_index += KMP_MAX_DISP_BUF;
2025 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2026 gtid, sh->buffer_index) );
2027
2028 KMP_MB(); /* Flush all pending memory write invalidates. */
2029
2030 } // if
2031 if ( __kmp_env_consistency_check ) {
2032 if ( pr->pushed_ws != ct_none ) {
2033 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2034 }
2035 }
2036
2037 th -> th.th_dispatch -> th_deo_fcn = NULL;
2038 th -> th.th_dispatch -> th_dxo_fcn = NULL;
2039 th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2040 th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2041 } // if (status == 0)
2042#if KMP_OS_WINDOWS
2043 else if ( last ) {
2044 pr->u.p.last_upper = pr->u.p.ub;
2045 }
2046#endif /* KMP_OS_WINDOWS */
2047 } // if
2048
2049 #ifdef KMP_DEBUG
2050 {
2051 const char * buff;
2052 // create format specifiers before the debug output
2053 buff = __kmp_str_format(
2054 "__kmp_dispatch_next: T#%%d normal case: " \
2055 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2056 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2057 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2058 __kmp_str_free( &buff );
2059 }
2060 #endif
2061 return status;
2062}
2063
2064//-----------------------------------------------------------------------------------------
2065// Dispatch routines
2066// Transfer call to template< type T >
2067// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2068// T lb, T ub, ST st, ST chunk )
2069extern "C" {
2070
2071/*!
2072@ingroup WORK_SHARING
2073@{
2074@param loc Source location
2075@param gtid Global thread id
2076@param schedule Schedule type
2077@param lb Lower bound
2078@param ub Upper bound
2079@param st Step (or increment if you prefer)
2080@param chunk The chunk size to block with
2081
2082This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2083These functions are all identical apart from the types of the arguments.
2084*/
2085
2086void
2087__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2088 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2089{
2090 KMP_DEBUG_ASSERT( __kmp_init_serial );
2091 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2092}
2093/*!
2094See @ref __kmpc_dispatch_init_4
2095*/
2096void
2097__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2098 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2099{
2100 KMP_DEBUG_ASSERT( __kmp_init_serial );
2101 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2102}
2103
2104/*!
2105See @ref __kmpc_dispatch_init_4
2106*/
2107void
2108__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2109 kmp_int64 lb, kmp_int64 ub,
2110 kmp_int64 st, kmp_int64 chunk )
2111{
2112 KMP_DEBUG_ASSERT( __kmp_init_serial );
2113 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2114}
2115
2116/*!
2117See @ref __kmpc_dispatch_init_4
2118*/
2119void
2120__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2121 kmp_uint64 lb, kmp_uint64 ub,
2122 kmp_int64 st, kmp_int64 chunk )
2123{
2124 KMP_DEBUG_ASSERT( __kmp_init_serial );
2125 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2126}
2127
2128/*!
2129@param loc Source code location
2130@param gtid Global thread id
2131@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2132@param p_lb Pointer to the lower bound for the next chunk of work
2133@param p_ub Pointer to the upper bound for the next chunk of work
2134@param p_st Pointer to the stride for the next chunk of work
2135@return one if there is work to be done, zero otherwise
2136
2137Get the next dynamically allocated chunk of work for this thread.
2138 If there is no more work, then lb, ub and stride need not be modified.
2139*/
2140int
2141__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2142 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2143{
2144 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2145}
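/* Usage sketch (illustrative only, not part of the runtime): a compiler lowering a
   dynamically scheduled loop would drive these entry points roughly as follows, where
   `loc` stands for the compiler-generated source-location descriptor and N, chunk and
   gtid are placeholders:

       kmp_int32 lb, ub, st, last;
       __kmpc_dispatch_init_4( &loc, gtid, kmp_sch_dynamic_chunked, 0, N - 1, 1, chunk );
       while ( __kmpc_dispatch_next_4( &loc, gtid, &last, &lb, &ub, &st ) ) {
           for ( kmp_int32 i = lb; i <= ub; i += st ) {
               // loop body for iteration i
           }
       }

   lb, ub and st are only meaningful while __kmpc_dispatch_next_4 returns nonzero. */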
2146
2147/*!
2148See @ref __kmpc_dispatch_next_4
2149*/
2150int
2151__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2152 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2153{
2154 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2155}
2156
2157/*!
2158See @ref __kmpc_dispatch_next_4
2159*/
2160int
2161__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2162 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2163{
2164 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2165}
2166
2167/*!
2168See @ref __kmpc_dispatch_next_4
2169*/
2170int
2171__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2172 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2173{
2174 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2175}
2176
2177/*!
2178@param loc Source code location
2179@param gtid Global thread id
2180
2181Mark the end of a dynamic loop.
2182*/
2183void
2184__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2185{
2186 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2187}
2188
2189/*!
2190See @ref __kmpc_dispatch_fini_4
2191*/
2192void
2193__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2194{
2195 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2196}
2197
2198/*!
2199See @ref __kmpc_dispatch_fini_4
2200*/
2201void
2202__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2203{
2204 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2205}
2206
2207/*!
2208See @ref __kmpc_dispatch_fini_4
2209*/
2210void
2211__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2212{
2213 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2214}
2215/*! @} */
2216
2217//-----------------------------------------------------------------------------------------
2218 // Non-template routines from kmp_dispatch.cpp used in other sources
2219
2220kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2221 return value == checker;
2222}
2223
2224kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2225 return value != checker;
2226}
2227
2228kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2229 return value < checker;
2230}
2231
2232kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2233 return value >= checker;
2234}
2235
2236kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2237 return value <= checker;
2238}
2239kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2240 return value == checker;
2241}
2242
2243kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2244 return value != checker;
2245}
2246
2247kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2248 return value < checker;
2249}
2250
2251kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2252 return value >= checker;
2253}
2254
2255kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2256 return value <= checker;
2257}
2258
2259kmp_uint32
2260__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2261 kmp_uint32 checker,
2262 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2263 , void * obj // Higher-level synchronization object, or NULL.
2264 )
2265{
2266 // note: we may not belong to a team at this point
2267 register volatile kmp_uint32 * spin = spinner;
2268 register kmp_uint32 check = checker;
2269 register kmp_uint32 spins;
2270 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2271 register kmp_uint32 r;
2272
2273 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2274 KMP_INIT_YIELD( spins );
2275 // main wait spin loop
2276 while(!f(r = TCR_4(*spin), check)) {
2277 KMP_FSYNC_SPIN_PREPARE( obj );
2278 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2279 It causes problems with infinite recursion because of exit lock */
2280 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2281 __kmp_abort_thread(); */
2282
2283 __kmp_static_delay(TRUE);
2284
2285 /* if we have waited a bit, or are oversubscribed, yield */
2286 /* pause is in the following code */
2287 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2288 KMP_YIELD_SPIN( spins );
2289 }
2290 KMP_FSYNC_SPIN_ACQUIRED( obj );
2291 return r;
2292}
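/* Usage sketch (illustrative only): __kmp_wait_yield_4 pairs with the small predicates
   above; e.g. a caller with its own volatile kmp_uint32 flag could spin until the flag
   reaches an expected value with
       kmp_uint32 seen = __kmp_wait_yield_4( &flag, expected, __kmp_eq_4, NULL );
   the return value is the flag value that finally satisfied the predicate. */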
2293
2294kmp_uint64
2295__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2296 kmp_uint64 checker,
2297 kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2298 , void * obj // Higher-level synchronization object, or NULL.
2299 )
2300{
2301 // note: we may not belong to a team at this point
2302 register volatile kmp_uint64 * spin = spinner;
2303 register kmp_uint64 check = checker;
2304 register kmp_uint32 spins;
2305 register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2306 register kmp_uint64 r;
2307
2308 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2309 KMP_INIT_YIELD( spins );
2310 // main wait spin loop
2311 while(!f(r = *spin, check))
2312 {
2313 KMP_FSYNC_SPIN_PREPARE( obj );
2314 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2315 It causes problems with infinite recursion because of exit lock */
2316 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2317 __kmp_abort_thread(); */
2318
2319 __kmp_static_delay(TRUE);
2320
2321 // if we are oversubscribed,
2322         // or have waited a bit (and KMP_LIBRARY=throughput), then yield
2323 // pause is in the following code
2324 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2325 KMP_YIELD_SPIN( spins );
2326 }
2327 KMP_FSYNC_SPIN_ACQUIRED( obj );
2328 return r;
2329}
2330
2331} // extern "C"
2332
2333#ifdef KMP_GOMP_COMPAT
2334
2335void
2336__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2337 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2338 kmp_int32 chunk, int push_ws )
2339{
2340 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2341 push_ws );
2342}
2343
2344void
2345__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2346 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2347 kmp_int32 chunk, int push_ws )
2348{
2349 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2350 push_ws );
2351}
2352
2353void
2354__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2355 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2356 kmp_int64 chunk, int push_ws )
2357{
2358 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2359 push_ws );
2360}
2361
2362void
2363__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2364 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2365 kmp_int64 chunk, int push_ws )
2366{
2367 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2368 push_ws );
2369}
2370
2371void
2372__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2373{
2374 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2375}
2376
2377void
2378__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2379{
2380 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2381}
2382
2383void
2384__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2385{
2386 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2387}
2388
2389void
2390__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2391{
2392 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2393}
2394
2395#endif /* KMP_GOMP_COMPAT */
2396
2397/* ------------------------------------------------------------------------ */
2398/* ------------------------------------------------------------------------ */
2399