1/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3 * $Revision: 42674 $
4 * $Date: 2013-09-18 11:12:49 -0500 (Wed, 18 Sep 2013) $
5 */
6
7
8//===----------------------------------------------------------------------===//
9//
10// The LLVM Compiler Infrastructure
11//
12// This file is dual licensed under the MIT and the University of Illinois Open
13// Source Licenses. See LICENSE.txt for details.
14//
15//===----------------------------------------------------------------------===//
16
17
18/*
19 * Dynamic scheduling initialization and dispatch.
20 *
21 * NOTE: __kmp_nth is a constant inside any dispatch loop; however,
22 * it may change values between parallel regions. __kmp_max_nth
23 * is the largest value __kmp_nth may take, and 1 is the smallest.
24 *
25 */
26
27/* ------------------------------------------------------------------------ */
28/* ------------------------------------------------------------------------ */
29
30#include "kmp.h"
31#include "kmp_i18n.h"
32#include "kmp_itt.h"
33#include "kmp_str.h"
34#include "kmp_error.h"
35#if KMP_OS_WINDOWS && KMP_ARCH_X86
36 #include <float.h>
37#endif
38
39/* ------------------------------------------------------------------------ */
40/* ------------------------------------------------------------------------ */
41
42#ifdef KMP_STATIC_STEAL_ENABLED
43
44 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
45 template< typename T >
46 struct dispatch_private_infoXX_template {
47 typedef typename traits_t< T >::unsigned_t UT;
48 typedef typename traits_t< T >::signed_t ST;
49 UT count; // unsigned
50 T ub;
51 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
52 T lb;
53 ST st; // signed
54 UT tc; // unsigned
55 T static_steal_counter; // for static_steal only; maybe better to put after ub
56
57 /* parm[1-4] are used in different ways by different scheduling algorithms */
58
59 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
60 // a) parm3 is properly aligned and
61 // b) all parm1-4 are in the same cache line.
62 // Because parm1-4 are used together, performance seems to be better
63 // if they are in the same cache line (not measured, though).
64
65 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
66 T parm1;
67 T parm2;
68 T parm3;
69 T parm4;
70 };
71
72 UT ordered_lower; // unsigned
73 UT ordered_upper; // unsigned
74 #if KMP_OS_WINDOWS
75 T last_upper;
76 #endif /* KMP_OS_WINDOWS */
77 };
78
79#else /* KMP_STATIC_STEAL_ENABLED */
80
81 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
82 template< typename T >
83 struct dispatch_private_infoXX_template {
84 typedef typename traits_t< T >::unsigned_t UT;
85 typedef typename traits_t< T >::signed_t ST;
86 T lb;
87 T ub;
88 ST st; // signed
89 UT tc; // unsigned
90
91 T parm1;
92 T parm2;
93 T parm3;
94 T parm4;
95
96 UT count; // unsigned
97
98 UT ordered_lower; // unsigned
99 UT ordered_upper; // unsigned
100 #if KMP_OS_WINDOWS
101 T last_upper;
102 #endif /* KMP_OS_WINDOWS */
103 };
104
105#endif /* KMP_STATIC_STEAL_ENABLED */
106
107// replaces dispatch_private_info structure and dispatch_private_info_t type
108template< typename T >
109struct KMP_ALIGN_CACHE dispatch_private_info_template {
110 // duplicate alignment here, otherwise size of structure is not correct in our compiler
111 union KMP_ALIGN_CACHE private_info_tmpl {
112 dispatch_private_infoXX_template< T > p;
113 dispatch_private_info64_t p64;
114 } u;
115 enum sched_type schedule; /* scheduling algorithm */
116 kmp_uint32 ordered; /* ordered clause specified */
117 kmp_uint32 ordered_bumped;
118 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
119 dispatch_private_info * next; /* stack of buffers for nest of serial regions */
120 kmp_uint32 nomerge; /* don't merge iters if serialized */
121 kmp_uint32 type_size;
122 enum cons_type pushed_ws;
123};
124
125
126// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
127template< typename UT >
128struct dispatch_shared_infoXX_template {
129 /* chunk index under dynamic, number of idle threads under static-steal;
130 iteration index otherwise */
131 volatile UT iteration;
132 volatile UT num_done;
133 volatile UT ordered_iteration;
134 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
135};
136
137// replaces dispatch_shared_info structure and dispatch_shared_info_t type
138template< typename UT >
139struct dispatch_shared_info_template {
140 // we need union here to keep the structure size
141 union shared_info_tmpl {
142 dispatch_shared_infoXX_template< UT > s;
143 dispatch_shared_info64_t s64;
144 } u;
145 volatile kmp_uint32 buffer_index;
146};
147
148/* ------------------------------------------------------------------------ */
149/* ------------------------------------------------------------------------ */
150
151static void
152__kmp_static_delay( int arg )
153{
154 /* Work around weird code-gen bug that causes assert to trip */
155 #if KMP_ARCH_X86_64 && KMP_OS_LINUX
156 #else
157 KMP_ASSERT( arg >= 0 );
158 #endif
159}
160
161static void
162__kmp_static_yield( int arg )
163{
164 __kmp_yield( arg );
165}
166
167#undef USE_TEST_LOCKS
168
169// test_then_add template (general template should NOT be used)
170template< typename T >
171static __forceinline T
172test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
173
174template<>
175__forceinline kmp_int32
176test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
177{
178 kmp_int32 r;
179 r = KMP_TEST_THEN_ADD32( p, d );
180 return r;
181}
182
183template<>
184__forceinline kmp_int64
185test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
186{
187 kmp_int64 r;
188 r = KMP_TEST_THEN_ADD64( p, d );
189 return r;
190}
191
192// test_then_inc_acq template (general template should NOT be used)
193template< typename T >
194static __forceinline T
195test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
196
197template<>
198__forceinline kmp_int32
199test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
200{
201 kmp_int32 r;
202 r = KMP_TEST_THEN_INC_ACQ32( p );
203 return r;
204}
205
206template<>
207__forceinline kmp_int64
208test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
209{
210 kmp_int64 r;
211 r = KMP_TEST_THEN_INC_ACQ64( p );
212 return r;
213}
214
215// test_then_inc template (general template should NOT be used)
216template< typename T >
217static __forceinline T
218test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
219
220template<>
221__forceinline kmp_int32
222test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
223{
224 kmp_int32 r;
225 r = KMP_TEST_THEN_INC32( p );
226 return r;
227}
228
229template<>
230__forceinline kmp_int64
231test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
232{
233 kmp_int64 r;
234 r = KMP_TEST_THEN_INC64( p );
235 return r;
236}
237
238// compare_and_swap template (general template should NOT be used)
239template< typename T >
240static __forceinline kmp_int32
241compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
242
243template<>
244__forceinline kmp_int32
245compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
246{
247 return KMP_COMPARE_AND_STORE_REL32( p, c, s );
248}
249
250template<>
251__forceinline kmp_int32
252compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
253{
254 return KMP_COMPARE_AND_STORE_REL64( p, c, s );
255}
256
257/*
258 Spin wait loop that first does pause, then yield.
259 Waits until function returns non-zero when called with *spinner and check.
260 Does NOT put threads to sleep.
261#if USE_ITT_BUILD
262 Arguments:
263 obj -- is a higher-level synchronization object to report to ittnotify. It is used to report
264 locks consistently. For example, if the lock is acquired immediately, its address is
265 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
266 immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
267 address, not the address of the low-level spinner.
268#endif // USE_ITT_BUILD
269*/
270template< typename UT >
271// ToDo: make inline function (move to header file for icl)
272static UT // unsigned 4- or 8-byte type
273__kmp_wait_yield( volatile UT * spinner,
274 UT checker,
275 kmp_uint32 (* pred)( UT, UT )
276 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
277 )
278{
279 // note: we may not belong to a team at this point
280 register volatile UT * spin = spinner;
281 register UT check = checker;
282 register kmp_uint32 spins;
283 register kmp_uint32 (*f) ( UT, UT ) = pred;
284 register UT r;
285
286 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
287 KMP_INIT_YIELD( spins );
288 // main wait spin loop
289 while(!f(r = *spin, check))
290 {
291 KMP_FSYNC_SPIN_PREPARE( obj );
292 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
293 It causes problems with infinite recursion because of exit lock */
294 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
295 __kmp_abort_thread(); */
296
297 __kmp_static_delay(TRUE);
298
299 // if we are oversubscribed,
300 // or have waited a bit (and KMP_LIBRARY=throughput), then yield;
301 // the pause is in the following code
302 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
303 KMP_YIELD_SPIN( spins );
304 }
305 KMP_FSYNC_SPIN_ACQUIRED( obj );
306 return r;
307}
308
309template< typename UT >
310static kmp_uint32 __kmp_eq( UT value, UT checker) {
311 return value == checker;
312}
313
314template< typename UT >
315static kmp_uint32 __kmp_neq( UT value, UT checker) {
316 return value != checker;
317}
318
319template< typename UT >
320static kmp_uint32 __kmp_lt( UT value, UT checker) {
321 return value < checker;
322}
323
324template< typename UT >
325static kmp_uint32 __kmp_ge( UT value, UT checker) {
326 return value >= checker;
327}
328
329template< typename UT >
330static kmp_uint32 __kmp_le( UT value, UT checker) {
331 return value <= checker;
332}
333
334
335/* ------------------------------------------------------------------------ */
336/* ------------------------------------------------------------------------ */
337
338static void
339__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
340{
341 kmp_info_t *th;
342
343 KMP_DEBUG_ASSERT( gtid_ref );
344
345 if ( __kmp_env_consistency_check ) {
346 th = __kmp_threads[*gtid_ref];
347 if ( th -> th.th_root -> r.r_active
348 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
349 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
350 }
351 }
352}
353
354template< typename UT >
355static void
356__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
357{
358 typedef typename traits_t< UT >::signed_t ST;
359 dispatch_private_info_template< UT > * pr;
360
361 int gtid = *gtid_ref;
362// int cid = *cid_ref;
363 kmp_info_t *th = __kmp_threads[ gtid ];
364 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
365
366 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
367 if ( __kmp_env_consistency_check ) {
368 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
369 ( th -> th.th_dispatch -> th_dispatch_pr_current );
370 if ( pr -> pushed_ws != ct_none ) {
371 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
372 }
373 }
374
375 if ( ! th -> th.th_team -> t.t_serialized ) {
376 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
377 ( th -> th.th_dispatch -> th_dispatch_sh_current );
378 UT lower;
379
380 if ( ! __kmp_env_consistency_check ) {
381 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
382 ( th -> th.th_dispatch -> th_dispatch_pr_current );
383 }
384 lower = pr->u.p.ordered_lower;
385
386 #if ! defined( KMP_GOMP_COMPAT )
387 if ( __kmp_env_consistency_check ) {
388 if ( pr->ordered_bumped ) {
389 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
390 __kmp_error_construct2(
391 kmp_i18n_msg_CnsMultipleNesting,
392 ct_ordered_in_pdo, loc_ref,
393 & p->stack_data[ p->w_top ]
394 );
395 }
396 }
397 #endif /* !defined(KMP_GOMP_COMPAT) */
398
399 KMP_MB();
400 #ifdef KMP_DEBUG
401 {
402 const char * buff;
403 // create format specifiers before the debug output
404 buff = __kmp_str_format(
405 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
406 traits_t< UT >::spec, traits_t< UT >::spec );
407 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
408 __kmp_str_free( &buff );
409 }
410 #endif
411
412 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
413 USE_ITT_BUILD_ARG( NULL )
414 );
415 KMP_MB(); /* is this necessary? */
416 #ifdef KMP_DEBUG
417 {
418 const char * buff;
419 // create format specifiers before the debug output
420 buff = __kmp_str_format(
421 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
422 traits_t< UT >::spec, traits_t< UT >::spec );
423 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
424 __kmp_str_free( &buff );
425 }
426 #endif
427 }
428 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
429}
430
431static void
432__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
433{
434 kmp_info_t *th;
435
436 if ( __kmp_env_consistency_check ) {
437 th = __kmp_threads[*gtid_ref];
438 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
439 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
440 }
441 }
442}
443
444template< typename UT >
445static void
446__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
447{
448 typedef typename traits_t< UT >::signed_t ST;
449 dispatch_private_info_template< UT > * pr;
450
451 int gtid = *gtid_ref;
452// int cid = *cid_ref;
453 kmp_info_t *th = __kmp_threads[ gtid ];
454 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
455
456 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
457 if ( __kmp_env_consistency_check ) {
458 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
459 ( th -> th.th_dispatch -> th_dispatch_pr_current );
460 if ( pr -> pushed_ws != ct_none ) {
461 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
462 }
463 }
464
465 if ( ! th -> th.th_team -> t.t_serialized ) {
466 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
467 ( th -> th.th_dispatch -> th_dispatch_sh_current );
468
469 if ( ! __kmp_env_consistency_check ) {
470 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
471 ( th -> th.th_dispatch -> th_dispatch_pr_current );
472 }
473
474 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
475 #if ! defined( KMP_GOMP_COMPAT )
476 if ( __kmp_env_consistency_check ) {
477 if ( pr->ordered_bumped != 0 ) {
478 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
479 /* How to test it? - OM */
480 __kmp_error_construct2(
481 kmp_i18n_msg_CnsMultipleNesting,
482 ct_ordered_in_pdo, loc_ref,
483 & p->stack_data[ p->w_top ]
484 );
485 }
486 }
487 #endif /* !defined(KMP_GOMP_COMPAT) */
488
489 KMP_MB(); /* Flush all pending memory write invalidates. */
490
491 pr->ordered_bumped += 1;
492
493 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
494 gtid, pr->ordered_bumped ) );
495
496 KMP_MB(); /* Flush all pending memory write invalidates. */
497
498 /* TODO use general release procedure? */
499 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
500
501 KMP_MB(); /* Flush all pending memory write invalidates. */
502 }
503 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
504}
505
506/* Computes and returns x to the power of y, where y must be a non-negative integer */
507template< typename UT >
508static __forceinline long double
509__kmp_pow(long double x, UT y) {
510 long double s=1.0L;
511
512 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
513 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
514 while(y) {
515 if ( y & 1 )
516 s *= x;
517 x *= x;
518 y >>= 1;
519 }
520 return s;
521}
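// Editor's illustration (not in the original source): __kmp_pow uses binary exponentiation,
// consuming the bits of y. For x = 0.875 and y = 5 (binary 101) the loop multiplies s by x
// and then by x^4, giving 0.875 * 0.586181640625 ~= 0.5129 in O(log y) multiplications.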
522
523/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
524 (the total number of unassigned iterations in chunks with index greater than or equal to idx).
525 __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
526 (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
527*/
528template< typename T >
529static __inline typename traits_t< T >::unsigned_t
530__kmp_dispatch_guided_remaining(
531 T tc,
532 typename traits_t< T >::floating_t base,
533 typename traits_t< T >::unsigned_t idx
534) {
535 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
536 least for ICL 8.1, long double arithmetic may not really have
537 long double precision, even with /Qlong_double. Currently, we
538 workaround that in the caller code, by manipulating the FPCW for
539 Windows* OS on IA-32 architecture. The lack of precision is not
540 expected to be a correctness issue, though.
541 */
542 typedef typename traits_t< T >::unsigned_t UT;
543
544 long double x = tc * __kmp_pow< UT >(base, idx);
545 UT r = (UT) x;
546 if ( x == r )
547 return r;
548 return r + 1;
549}
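// Editor's illustration (assumed values, not in the original source): with tc = 1000,
// base = 0.875 (i.e. 1 - 0.5/nproc for nproc = 4) and idx = 2, x = 1000 * 0.765625 = 765.625,
// so the function returns 766 -- the iterations still unassigned after two guided chunks.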
550
551// Parameters of the guided-iterative algorithm:
552// p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
553// p3 = 1 / ( n * nproc ) // remaining iterations multiplier
554// by default n = 2. For example, with n = 3 the chunk distribution will be flatter.
555// With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
556static int guided_int_param = 2;
557static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
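// Editor's illustration (assumed values): with the default n = 2, nproc = 4 and chunk = 7,
// p2 = 2 * 4 * (7 + 1) = 64 and p3 = 0.5 / 4 = 0.125, i.e. each grab takes remaining/8
// iterations until fewer than 64 remain, after which plain dynamic chunks of 7 are used.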
558
559// UT - unsigned flavor of T, ST - signed flavor of T,
560// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
561template< typename T >
562static void
563__kmp_dispatch_init(
564 ident_t * loc,
565 int gtid,
566 enum sched_type schedule,
567 T lb,
568 T ub,
569 typename traits_t< T >::signed_t st,
570 typename traits_t< T >::signed_t chunk,
571 int push_ws
572) {
573 typedef typename traits_t< T >::unsigned_t UT;
574 typedef typename traits_t< T >::signed_t ST;
575 typedef typename traits_t< T >::floating_t DBL;
576 static const int ___kmp_size_type = sizeof( UT );
577
578 int active;
579 T tc;
580 kmp_info_t * th;
581 kmp_team_t * team;
582 kmp_uint32 my_buffer_index;
583 dispatch_private_info_template< T > * pr;
584 dispatch_shared_info_template< UT > volatile * sh;
585
586 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
587 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
588
589 if ( ! TCR_4( __kmp_init_parallel ) )
590 __kmp_parallel_initialize();
591
592 #ifdef KMP_DEBUG
593 {
594 const char * buff;
595 // create format specifiers before the debug output
596 buff = __kmp_str_format(
597 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
598 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
599 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
600 __kmp_str_free( &buff );
601 }
602 #endif
603 /* setup data */
604 th = __kmp_threads[ gtid ];
605 team = th -> th.th_team;
606 active = ! team -> t.t_serialized;
607 th->th.th_ident = loc;
608
609 if ( ! active ) {
610 pr = reinterpret_cast< dispatch_private_info_template< T >* >
611 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
612 } else {
613 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
614 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
615
616 my_buffer_index = th->th.th_dispatch->th_disp_index ++;
617
618 /* What happens when number of threads changes, need to resize buffer? */
619 pr = reinterpret_cast< dispatch_private_info_template< T > * >
620 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
621 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
622 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
623 }
624
625 /* Pick up the nomerge/ordered bits from the scheduling type */
626 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
627 pr->nomerge = TRUE;
628 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
629 } else {
630 pr->nomerge = FALSE;
631 }
632 pr->type_size = ___kmp_size_type; // remember the size of variables
633 if ( kmp_ord_lower & schedule ) {
634 pr->ordered = TRUE;
635 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
636 } else {
637 pr->ordered = FALSE;
638 }
639 if ( schedule == kmp_sch_static ) {
640 schedule = __kmp_static;
641 } else {
642 if ( schedule == kmp_sch_runtime ) {
643 #if OMP_30_ENABLED
644 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
645 schedule = team -> t.t_sched.r_sched_type;
646 // Detail the schedule if needed (global controls are differentiated appropriately)
647 if ( schedule == kmp_sch_guided_chunked ) {
648 schedule = __kmp_guided;
649 } else if ( schedule == kmp_sch_static ) {
650 schedule = __kmp_static;
651 }
652 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
653 chunk = team -> t.t_sched.chunk;
654 #else
655 kmp_r_sched_t r_sched = __kmp_get_schedule_global();
656 // Use the scheduling specified by OMP_SCHEDULE and/or KMP_SCHEDULE or default
657 schedule = r_sched.r_sched_type;
658 chunk = r_sched.chunk;
659 #endif
660
661 #ifdef KMP_DEBUG
662 {
663 const char * buff;
664 // create format specifiers before the debug output
665 buff = __kmp_str_format(
666 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
667 traits_t< ST >::spec );
668 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
669 __kmp_str_free( &buff );
670 }
671 #endif
672 } else {
673 if ( schedule == kmp_sch_guided_chunked ) {
674 schedule = __kmp_guided;
675 }
676 if ( chunk <= 0 ) {
677 chunk = KMP_DEFAULT_CHUNK;
678 }
679 }
680
681 #if OMP_30_ENABLED
682 if ( schedule == kmp_sch_auto ) {
683 // mapping and differentiation: in the __kmp_do_serial_initialize()
684 schedule = __kmp_auto;
685 #ifdef KMP_DEBUG
686 {
687 const char * buff;
688 // create format specifiers before the debug output
689 buff = __kmp_str_format(
690 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
691 traits_t< ST >::spec );
692 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
693 __kmp_str_free( &buff );
694 }
695 #endif
696 }
697 #endif // OMP_30_ENABLED
698
699 /* guided analytical not safe for too many threads */
700 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
701 schedule = kmp_sch_guided_iterative_chunked;
702 KMP_WARNING( DispatchManyThreads );
703 }
704 pr->u.p.parm1 = chunk;
705 }
706 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
707 "unknown scheduling type" );
708
709 pr->u.p.count = 0;
710
711 if ( __kmp_env_consistency_check ) {
712 if ( st == 0 ) {
713 __kmp_error_construct(
714 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
715 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
716 );
717 }
718 }
719
720 tc = ( ub - lb + st );
721 if ( st != 1 ) {
722 if ( st < 0 ) {
723 if ( lb < ub ) {
724 tc = 0; // zero-trip
725 } else { // lb >= ub
726 tc = (ST)tc / st; // convert to signed division
727 }
728 } else { // st > 0
729 if ( ub < lb ) {
730 tc = 0; // zero-trip
731 } else { // lb >= ub
732 tc /= st;
733 }
734 }
735 } else if ( ub < lb ) { // st == 1
736 tc = 0; // zero-trip
737 }
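// Editor's illustration (assumed values): for lb = 0, ub = 9, st = 2 the trip count is
// tc = (9 - 0 + 2) / 2 = 5, matching the iterations 0, 2, 4, 6, 8; a loop with st > 0 and
// ub < lb (or st < 0 and lb < ub) is treated as zero-trip above.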
738
739 pr->u.p.lb = lb;
740 pr->u.p.ub = ub;
741 pr->u.p.st = st;
742 pr->u.p.tc = tc;
743
744 #if KMP_OS_WINDOWS
745 pr->u.p.last_upper = ub + st;
746 #endif /* KMP_OS_WINDOWS */
747
748 /* NOTE: only the active parallel region(s) has active ordered sections */
749
750 if ( active ) {
751 if ( pr->ordered == 0 ) {
752 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
753 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
754 } else {
755 pr->ordered_bumped = 0;
756
757 pr->u.p.ordered_lower = 1;
758 pr->u.p.ordered_upper = 0;
759
760 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
761 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
762 }
763 }
764
765 if ( __kmp_env_consistency_check ) {
766 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
767 if ( push_ws ) {
768 __kmp_push_workshare( gtid, ws, loc );
769 pr->pushed_ws = ws;
770 } else {
771 __kmp_check_workshare( gtid, ws, loc );
772 pr->pushed_ws = ct_none;
773 }
774 }
775
776 switch ( schedule ) {
777 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
778 case kmp_sch_static_steal:
779 {
780 T nproc = team->t.t_nproc;
781 T ntc, init;
782
783 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
784
785 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
786 if ( nproc > 1 && ntc >= nproc ) {
787 T id = __kmp_tid_from_gtid(gtid);
788 T small_chunk, extras;
789
790 small_chunk = ntc / nproc;
791 extras = ntc % nproc;
792
793 init = id * small_chunk + ( id < extras ? id : extras );
794 pr->u.p.count = init;
795 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
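// Editor's illustration (assumed values): with ntc = 10 chunks and nproc = 4, small_chunk = 2
// and extras = 2, so threads 0..3 start with the chunk ranges [0,3), [3,6), [6,8) and [8,10);
// the first 'extras' threads own one extra chunk each, and leftovers may later be stolen.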
796
797 pr->u.p.parm2 = lb;
798 //pr->pfields.parm3 = 0; // it's not used in static_steal
799 pr->u.p.parm4 = id;
800 pr->u.p.st = st;
801 break;
802 } else {
803 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
804 gtid ) );
805 schedule = kmp_sch_static_balanced;
806 /* too few iterations: fall-through to kmp_sch_static_balanced */
807 } // if
808 /* FALL-THROUGH to static balanced */
809 } // case
810 #endif
811 case kmp_sch_static_balanced:
812 {
813 T nproc = team->t.t_nproc;
814 T init, limit;
815
816 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
817 gtid ) );
818
819 if ( nproc > 1 ) {
820 T id = __kmp_tid_from_gtid(gtid);
821
822 if ( tc < nproc ) {
823 if ( id < tc ) {
824 init = id;
825 limit = id;
826 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
827 } else {
828 pr->u.p.count = 1; /* means no more chunks to execute */
829 pr->u.p.parm1 = FALSE;
830 break;
831 }
832 } else {
833 T small_chunk = tc / nproc;
834 T extras = tc % nproc;
835 init = id * small_chunk + (id < extras ? id : extras);
836 limit = init + small_chunk - (id < extras ? 0 : 1);
837 pr->u.p.parm1 = (id == nproc - 1);
838 }
839 } else {
840 if ( tc > 0 ) {
841 init = 0;
842 limit = tc - 1;
843 pr->u.p.parm1 = TRUE;
844 } else {
845 // zero trip count
846 pr->u.p.count = 1; /* means no more chunks to execute */
847 pr->u.p.parm1 = FALSE;
848 break;
849 }
850 }
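// Editor's illustration (assumed values): for tc = 10 and nproc = 4 the else branch above
// gives small_chunk = 2 and extras = 2, so threads 0..3 receive the iteration ranges
// 0-2, 3-5, 6-7 and 8-9; parm1 (the *plastiter flag) is set only on the last thread.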
851 if ( st == 1 ) {
852 pr->u.p.lb = lb + init;
853 pr->u.p.ub = lb + limit;
854 } else {
855 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
856 pr->u.p.lb = lb + init * st;
857 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
858 if ( st > 0 ) {
859 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
860 } else {
861 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
862 }
863 }
864 if ( pr->ordered ) {
865 pr->u.p.ordered_lower = init;
866 pr->u.p.ordered_upper = limit;
867 }
868 break;
869 } // case
870 case kmp_sch_guided_iterative_chunked :
871 {
872 T nproc = team->t.t_nproc;
873 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
874
875 if ( nproc > 1 ) {
876 if ( (2L * chunk + 1 ) * nproc >= tc ) {
877 /* chunk size too large, switch to dynamic */
878 schedule = kmp_sch_dynamic_chunked;
879 } else {
880 // when remaining iters become less than parm2 - switch to dynamic
881 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
882 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
883 }
884 } else {
885 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
886 schedule = kmp_sch_static_greedy;
887 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
888 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
889 pr->u.p.parm1 = tc;
890 } // if
891 } // case
892 break;
893 case kmp_sch_guided_analytical_chunked:
894 {
895 T nproc = team->t.t_nproc;
896 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
897
898 if ( nproc > 1 ) {
899 if ( (2L * chunk + 1 ) * nproc >= tc ) {
900 /* chunk size too large, switch to dynamic */
901 schedule = kmp_sch_dynamic_chunked;
902 } else {
903 /* commonly used term: (2 nproc - 1)/(2 nproc) */
904 DBL x;
905
906 #if KMP_OS_WINDOWS && KMP_ARCH_X86
907 /* Linux* OS already has 64-bit computation by default for
908 long double, and on Windows* OS on Intel(R) 64,
909 /Qlong_double doesn't work. On Windows* OS
910 on IA-32 architecture, we need to set precision to
911 64-bit instead of the default 53-bit. Even though long
912 double doesn't work on Windows* OS on Intel(R) 64, the
913 resulting lack of precision is not expected to impact
914 the correctness of the algorithm, but this has not been
915 mathematically proven.
916 */
917 // save original FPCW and set precision to 64-bit, as
918 // Windows* OS on IA-32 architecture defaults to 53-bit
919 unsigned int oldFpcw = _control87(0,0);
920 _control87(_PC_64,_MCW_PC); // 0,0x30000
921 #endif
922 /* value used for comparison in solver for cross-over point */
923 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
924
925 /* crossover point--chunk indexes equal to or greater than
926 this point switch to dynamic-style scheduling */
927 UT cross;
928
929 /* commonly used term: (2 nproc - 1)/(2 nproc) */
930 x = (long double)1.0 - (long double)0.5 / nproc;
931
932 #ifdef KMP_DEBUG
933 { // test natural alignment
934 struct _test_a {
935 char a;
936 union {
937 char b;
938 DBL d;
939 };
940 } t;
941 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
942 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
943 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
944 }
945 #endif // KMP_DEBUG
946
947 /* save the term in thread private dispatch structure */
948 *(DBL*)&pr->u.p.parm3 = x;
949
950 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
951 {
952 UT left, right, mid;
953 long double p;
954
955 /* estimate initial upper and lower bound */
956
957 /* doesn't matter what value right is as long as it is positive, but
958 it affects performance of the solver
959 */
960 right = 229;
961 p = __kmp_pow< UT >(x,right);
962 if ( p > target ) {
963 do{
964 p *= p;
965 right <<= 1;
966 } while(p>target && right < (1<<27));
967 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
968 } else {
969 left = 0;
970 }
971
972 /* bisection root-finding method */
973 while ( left + 1 < right ) {
974 mid = (left + right) / 2;
975 if ( __kmp_pow< UT >(x,mid) > target ) {
976 left = mid;
977 } else {
978 right = mid;
979 }
980 } // while
981 cross = right;
982 }
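// Editor's illustration (assumed values): for nproc = 4, chunk = 10 and tc = 1000 we get
// x = 0.875 and target = 21 * 4 / 1000 = 0.084; since 0.875^18 ~= 0.090 > 0.084 and
// 0.875^19 ~= 0.079 <= 0.084, the bisection above settles on cross = 19.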
983 /* assert sanity of computed crossover point */
984 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
985
986 /* save the crossover point in thread private dispatch structure */
987 pr->u.p.parm2 = cross;
988
989 // C75803
990 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
991 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
992 #else
993 #define GUIDED_ANALYTICAL_WORKAROUND (x)
994 #endif
995 /* dynamic-style scheduling offset */
996 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
997 #if KMP_OS_WINDOWS && KMP_ARCH_X86
998 // restore FPCW
999 _control87(oldFpcw,_MCW_PC);
1000 #endif
1001 } // if
1002 } else {
1003 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1004 gtid ) );
1005 schedule = kmp_sch_static_greedy;
1006 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1007 pr->u.p.parm1 = tc;
1008 } // if
1009 } // case
1010 break;
1011 case kmp_sch_static_greedy:
1012 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1013 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1014 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1015 tc;
1016 break;
1017 case kmp_sch_static_chunked :
1018 case kmp_sch_dynamic_chunked :
1019 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1020 break;
1021 case kmp_sch_trapezoidal :
1022 {
1023 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1024
1025 T parm1, parm2, parm3, parm4;
1026 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1027
1028 parm1 = chunk;
1029
1030 /* F : size of the first cycle */
1031 parm2 = ( tc / (2 * team->t.t_nproc) );
1032
1033 if ( parm2 < 1 ) {
1034 parm2 = 1;
1035 }
1036
1037 /* L : size of the last cycle. Make sure the last cycle
1038 * is not larger than the first cycle.
1039 */
1040 if ( parm1 < 1 ) {
1041 parm1 = 1;
1042 } else if ( parm1 > parm2 ) {
1043 parm1 = parm2;
1044 }
1045
1046 /* N : number of cycles */
1047 parm3 = ( parm2 + parm1 );
1048 parm3 = ( 2 * tc + parm3 - 1) / parm3;
1049
1050 if ( parm3 < 2 ) {
1051 parm3 = 2;
1052 }
1053
1054 /* sigma : decreasing incr of the trapezoid */
1055 parm4 = ( parm3 - 1 );
1056 parm4 = ( parm2 - parm1 ) / parm4;
1057
1058 // pointless check, because parm4 >= 0 always
1059 //if ( parm4 < 0 ) {
1060 // parm4 = 0;
1061 //}
1062
1063 pr->u.p.parm1 = parm1;
1064 pr->u.p.parm2 = parm2;
1065 pr->u.p.parm3 = parm3;
1066 pr->u.p.parm4 = parm4;
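// Editor's illustration (assumed values): for tc = 1000, nproc = 4 and chunk = 10 this gives
// parm2 = 1000/8 = 125 (first chunk), parm1 = 10 (minimum/last chunk), parm3 = 15 cycles and
// parm4 = (125 - 10) / 14 = 8, so successive chunks shrink by 8: 125, 117, 109, ...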
1067 } // case
1068 break;
1069
1070 default:
1071 {
1072 __kmp_msg(
1073 kmp_ms_fatal, // Severity
1074 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1075 KMP_HNT( GetNewerLibrary ), // Hint
1076 __kmp_msg_null // Variadic argument list terminator
1077 );
1078 }
1079 break;
1080 } // switch
1081 pr->schedule = schedule;
1082 if ( active ) {
1083 /* The name of this buffer should be my_buffer_index when it's free to use it */
1084
1085 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1086 gtid, my_buffer_index, sh->buffer_index) );
1087 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1088 USE_ITT_BUILD_ARG( NULL )
1089 );
1090 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1091 // *always* 32-bit integers.
1092 KMP_MB(); /* is this necessary? */
1093 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1094 gtid, my_buffer_index, sh->buffer_index) );
1095
1096 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1097 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1098#if USE_ITT_BUILD
1099 if ( pr->ordered ) {
1100 __kmp_itt_ordered_init( gtid );
1101 }; // if
1102#endif /* USE_ITT_BUILD */
1103 }; // if
1104 #ifdef KMP_DEBUG
1105 {
1106 const char * buff;
1107 // create format specifiers before the debug output
1108 buff = __kmp_str_format(
1109 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1110 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1111 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1112 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1113 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1114 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1115 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1116 KD_TRACE(10, ( buff,
1117 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1118 pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1119 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1120 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1121 __kmp_str_free( &buff );
1122 }
1123 #endif
1124 #if ( KMP_STATIC_STEAL_ENABLED )
1125 if ( ___kmp_size_type < 8 ) {
1126 // It cannot be guaranteed that after execution of a loop with some other schedule kind
1127 // all the parm3 variables will contain the same value.
1128 // Even if all parm3 values were the same, there would still be a bad case, such as using 0 and 1
1129 // rather than a program-lifetime increment.
1130 // So a dedicated variable is required; the 'static_steal_counter' is used.
1131 if( schedule == kmp_sch_static_steal ) {
1132 // Other threads will inspect this variable when searching for a victim.
1133 // This is a flag showing that other threads may steal from this thread since then.
1134 volatile T * p = &pr->u.p.static_steal_counter;
1135 *p = *p + 1;
1136 }
1137 }
1138 #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING )
1139}
1140
1141/*
1142 * For ordered loops, either __kmp_dispatch_finish() should be called after
1143 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1144 * every chunk of iterations. If the ordered section(s) were not executed
1145 * for this iteration (or every iteration in this chunk), we need to set the
1146 * ordered iteration counters so that the next thread can proceed.
1147 */
1148template< typename UT >
1149static void
1150__kmp_dispatch_finish( int gtid, ident_t *loc )
1151{
1152 typedef typename traits_t< UT >::signed_t ST;
1153 kmp_info_t *th = __kmp_threads[ gtid ];
1154
1155 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1156 if ( ! th -> th.th_team -> t.t_serialized ) {
1157
1158 dispatch_private_info_template< UT > * pr =
1159 reinterpret_cast< dispatch_private_info_template< UT >* >
1160 ( th->th.th_dispatch->th_dispatch_pr_current );
1161 dispatch_shared_info_template< UT > volatile * sh =
1162 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1163 ( th->th.th_dispatch->th_dispatch_sh_current );
1164 KMP_DEBUG_ASSERT( pr );
1165 KMP_DEBUG_ASSERT( sh );
1166 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1167 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1168
1169 if ( pr->ordered_bumped ) {
1170 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1171 gtid ) );
1172 pr->ordered_bumped = 0;
1173 } else {
1174 UT lower = pr->u.p.ordered_lower;
1175
1176 #ifdef KMP_DEBUG
1177 {
1178 const char * buff;
1179 // create format specifiers before the debug output
1180 buff = __kmp_str_format(
1181 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1182 traits_t< UT >::spec, traits_t< UT >::spec );
1183 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1184 __kmp_str_free( &buff );
1185 }
1186 #endif
1187
1188 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1189 USE_ITT_BUILD_ARG(NULL)
1190 );
1191 KMP_MB(); /* is this necessary? */
1192 #ifdef KMP_DEBUG
1193 {
1194 const char * buff;
1195 // create format specifiers before the debug output
1196 buff = __kmp_str_format(
1197 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1198 traits_t< UT >::spec, traits_t< UT >::spec );
1199 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1200 __kmp_str_free( &buff );
1201 }
1202 #endif
1203
1204 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1205 } // if
1206 } // if
1207 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1208}
1209
1210#ifdef KMP_GOMP_COMPAT
1211
1212template< typename UT >
1213static void
1214__kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1215{
1216 typedef typename traits_t< UT >::signed_t ST;
1217 kmp_info_t *th = __kmp_threads[ gtid ];
1218
1219 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1220 if ( ! th -> th.th_team -> t.t_serialized ) {
1221// int cid;
1222 dispatch_private_info_template< UT > * pr =
1223 reinterpret_cast< dispatch_private_info_template< UT >* >
1224 ( th->th.th_dispatch->th_dispatch_pr_current );
1225 dispatch_shared_info_template< UT > volatile * sh =
1226 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1227 ( th->th.th_dispatch->th_dispatch_sh_current );
1228 KMP_DEBUG_ASSERT( pr );
1229 KMP_DEBUG_ASSERT( sh );
1230 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1231 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1232
1233// for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1234 UT lower = pr->u.p.ordered_lower;
1235 UT upper = pr->u.p.ordered_upper;
1236 UT inc = upper - lower + 1;
1237
1238 if ( pr->ordered_bumped == inc ) {
1239 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1240 gtid ) );
1241 pr->ordered_bumped = 0;
1242 } else {
1243 inc -= pr->ordered_bumped;
1244
1245 #ifdef KMP_DEBUG
1246 {
1247 const char * buff;
1248 // create format specifiers before the debug output
1249 buff = __kmp_str_format(
1250 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1251 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1252 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1253 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1254 __kmp_str_free( &buff );
1255 }
1256 #endif
1257
1258 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1259 USE_ITT_BUILD_ARG(NULL)
1260 );
1261
1262 KMP_MB(); /* is this necessary? */
1263 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1264 gtid ) );
1265 pr->ordered_bumped = 0;
1266//!!!!! TODO check if the inc should be unsigned, or signed???
1267 #ifdef KMP_DEBUG
1268 {
1269 const char * buff;
1270 // create format specifiers before the debug output
1271 buff = __kmp_str_format(
1272 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1273 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1274 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1275 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1276 __kmp_str_free( &buff );
1277 }
1278 #endif
1279
1280 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1281 }
1282// }
1283 }
1284 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1285}
1286
1287#endif /* KMP_GOMP_COMPAT */
1288
1289template< typename T >
1290static int
1291__kmp_dispatch_next(
1292 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1293) {
1294
1295 typedef typename traits_t< T >::unsigned_t UT;
1296 typedef typename traits_t< T >::signed_t ST;
1297 typedef typename traits_t< T >::floating_t DBL;
1298 static const int ___kmp_size_type = sizeof( UT );
1299
1300 int status;
1301 dispatch_private_info_template< T > * pr;
1302 kmp_info_t * th = __kmp_threads[ gtid ];
1303 kmp_team_t * team = th -> th.th_team;
1304
1305 #ifdef KMP_DEBUG
1306 {
1307 const char * buff;
1308 // create format specifiers before the debug output
1309 buff = __kmp_str_format(
1310 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1311 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1312 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1313 __kmp_str_free( &buff );
1314 }
1315 #endif
1316
1317 if ( team -> t.t_serialized ) {
1318 /* NOTE: serialize this dispatch because we are not at the active level */
1319 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1320 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1321 KMP_DEBUG_ASSERT( pr );
1322
1323 if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1324 *p_lb = 0;
1325 *p_ub = 0;
1326 if ( p_st != 0 ) {
1327 *p_st = 0;
1328 }
1329 if ( __kmp_env_consistency_check ) {
1330 if ( pr->pushed_ws != ct_none ) {
1331 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1332 }
1333 }
1334 } else if ( pr->nomerge ) {
1335 kmp_int32 last;
1336 T start;
1337 UT limit, trip, init;
1338 ST incr;
1339 T chunk = pr->u.p.parm1;
1340
1341 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1342
1343 init = chunk * pr->u.p.count++;
1344 trip = pr->u.p.tc - 1;
1345
1346 if ( (status = (init <= trip)) == 0 ) {
1347 *p_lb = 0;
1348 *p_ub = 0;
1349 if ( p_st != 0 ) *p_st = 0;
1350 if ( __kmp_env_consistency_check ) {
1351 if ( pr->pushed_ws != ct_none ) {
1352 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1353 }
1354 }
1355 } else {
1356 start = pr->u.p.lb;
1357 limit = chunk + init - 1;
1358 incr = pr->u.p.st;
1359
1360 if ( (last = (limit >= trip)) != 0 ) {
1361 limit = trip;
1362 #if KMP_OS_WINDOWS
1363 pr->u.p.last_upper = pr->u.p.ub;
1364 #endif /* KMP_OS_WINDOWS */
1365 }
1366 if ( p_last ) {
1367 *p_last = last;
1368 }
1369 if ( p_st != 0 ) {
1370 *p_st = incr;
1371 }
1372 if ( incr == 1 ) {
1373 *p_lb = start + init;
1374 *p_ub = start + limit;
1375 } else {
1376 *p_lb = start + init * incr;
1377 *p_ub = start + limit * incr;
1378 }
1379
1380 if ( pr->ordered ) {
1381 pr->u.p.ordered_lower = init;
1382 pr->u.p.ordered_upper = limit;
1383 #ifdef KMP_DEBUG
1384 {
1385 const char * buff;
1386 // create format specifiers before the debug output
1387 buff = __kmp_str_format(
1388 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1389 traits_t< UT >::spec, traits_t< UT >::spec );
1390 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1391 __kmp_str_free( &buff );
1392 }
1393 #endif
1394 } // if
1395 } // if
1396 } else {
1397 pr->u.p.tc = 0;
1398
1399 *p_lb = pr->u.p.lb;
1400 *p_ub = pr->u.p.ub;
1401 #if KMP_OS_WINDOWS
1402 pr->u.p.last_upper = *p_ub;
1403 #endif /* KMP_OS_WINDOWS */
1404
1405 if ( p_st != 0 ) {
1406 *p_st = pr->u.p.st;
1407 }
1408 if ( p_last ) {
1409 *p_last = TRUE;
1410 }
1411 } // if
1412 #ifdef KMP_DEBUG
1413 {
1414 const char * buff;
1415 // create format specifiers before the debug output
1416 buff = __kmp_str_format(
1417 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1418 "p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
1419 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1420 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, status) );
1421 __kmp_str_free( &buff );
1422 }
1423 #endif
1424 return status;
1425 } else {
1426 kmp_int32 last = 0;
1427 dispatch_shared_info_template< UT > *sh;
1428 T start;
1429 ST incr;
1430 UT limit, trip, init;
1431
1432 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1433 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1434
1435 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1436 ( th->th.th_dispatch->th_dispatch_pr_current );
1437 KMP_DEBUG_ASSERT( pr );
1438 sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1439 ( th->th.th_dispatch->th_dispatch_sh_current );
1440 KMP_DEBUG_ASSERT( sh );
1441
1442 if ( pr->u.p.tc == 0 ) {
1443 // zero trip count
1444 status = 0;
1445 } else {
1446 switch (pr->schedule) {
1447 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1448 case kmp_sch_static_steal:
1449 {
1450 T chunk = pr->u.p.parm1;
1451
1452 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1453
1454 trip = pr->u.p.tc - 1;
1455
1456 if ( ___kmp_size_type > 4 ) {
1457 // Other threads do not look into the data of this thread,
1458 // so a volatile cast is not necessary.
1459 init = ( pr->u.p.count )++;
1460 status = ( init < (UT)pr->u.p.ub );
1461 } else {
1462 typedef union {
1463 struct {
1464 UT count;
1465 T ub;
1466 } p;
1467 kmp_int64 b;
1468 } union_i4;
1469 // All operations on 'count' or 'ub' must be combined atomically together.
1470 // stealing implemented only for 4-byte indexes
1471 {
1472 union_i4 vold, vnew;
1473 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1474 vnew = vold;
1475 vnew.p.count++;
1476 while( ! KMP_COMPARE_AND_STORE_ACQ64(
1477 ( volatile kmp_int64* )&pr->u.p.count,
1478 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1479 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1480 KMP_CPU_PAUSE();
1481 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1482 vnew = vold;
1483 vnew.p.count++;
1484 }
1485 vnew = vold;
1486 init = vnew.p.count;
1487 status = ( init < (UT)vnew.p.ub ) ;
1488 }
1489
1490 if( !status ) {
1491 kmp_info_t **other_threads = team->t.t_threads;
1492 int while_limit = 10;
1493 int while_index = 0;
1494
1495 // TODO: algorithm of searching for a victim
1496 // should be cleaned up and measured
1497 while ( ( !status ) && ( while_limit != ++while_index ) ) {
1498 union_i4 vold, vnew;
1499 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1500 T victimIdx = pr->u.p.parm4;
1501 T oldVictimIdx = victimIdx;
1502 dispatch_private_info_template< T > * victim;
1503
1504 do {
1505 if( !victimIdx ) {
1506 victimIdx = team->t.t_nproc - 1;
1507 } else {
1508 --victimIdx;
1509 }
1510 victim = reinterpret_cast< dispatch_private_info_template< T >* >
1511 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1512 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1513 // TODO: think about a proper place of this test
1514 if ( ( !victim ) ||
1515 ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1516 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1517 // TODO: delay would be nice
1518 continue;
1519 // the victim is not ready yet to participate in stealing
1520 // because the victim is still in kmp_init_dispatch
1521 }
1522 if ( oldVictimIdx == victimIdx ) {
1523 break;
1524 }
1525 pr->u.p.parm4 = victimIdx;
1526
1527 while( 1 ) {
1528 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1529 vnew = vold;
1530
1531 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1532 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1533 break;
1534 }
1535 vnew.p.ub -= (remaining >> 2);
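// Editor's note (illustrative, assumed values): the adjustment above leaves the victim with
// roughly 3/4 of its remaining chunks; e.g. with count = 4 and ub = 12, remaining = 8 and ub
// drops to 10, so the thief takes the two chunks [10,12) if the CAS below succeeds.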
1536 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1537 #pragma warning( push )
1538 // disable warning on pointless comparison of unsigned with 0
1539 #pragma warning( disable: 186 )
1540 KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1541 #pragma warning( pop )
1542 // TODO: Should this be acquire or release?
1543 if ( KMP_COMPARE_AND_STORE_ACQ64(
1544 ( volatile kmp_int64 * )&victim->u.p.count,
1545 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1546 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1547 status = 1;
1548 while_index = 0;
1549 // now update own count and ub
1550 #if KMP_ARCH_X86
1551 // stealing executed on non-KMP_ARCH_X86 only
1552 // Atomic 64-bit write on ia32 is
1553 // unavailable, so we do this in steps.
1554 // This code is not tested.
1555 init = vold.p.count;
1556 pr->u.p.ub = 0;
1557 pr->u.p.count = init + 1;
1558 pr->u.p.ub = vnew.p.count;
1559 #else
1560 init = vnew.p.ub;
1561 vold.p.count = init + 1;
1562 // TODO: is it safe and enough?
1563 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1564 #endif // KMP_ARCH_X86
1565 break;
1566 } // if
1567 KMP_CPU_PAUSE();
1568 } // while (1)
1569 } // while
1570 } // if
1571 } // if
1572 if ( !status ) {
1573 *p_lb = 0;
1574 *p_ub = 0;
1575 if ( p_st != 0 ) *p_st = 0;
1576 } else {
1577 start = pr->u.p.parm2;
1578 init *= chunk;
1579 limit = chunk + init - 1;
1580 incr = pr->u.p.st;
1581
1582 KMP_DEBUG_ASSERT(init <= trip);
1583 if ( (last = (limit >= trip)) != 0 )
1584 limit = trip;
1585 if ( p_last ) {
1586 *p_last = last;
1587 }
1588 if ( p_st != 0 ) *p_st = incr;
1589
1590 if ( incr == 1 ) {
1591 *p_lb = start + init;
1592 *p_ub = start + limit;
1593 } else {
1594 *p_lb = start + init * incr;
1595 *p_ub = start + limit * incr;
1596 }
1597
1598 if ( pr->ordered ) {
1599 pr->u.p.ordered_lower = init;
1600 pr->u.p.ordered_upper = limit;
1601 #ifdef KMP_DEBUG
1602 {
1603 const char * buff;
1604 // create format specifiers before the debug output
1605 buff = __kmp_str_format(
1606 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1607 traits_t< UT >::spec, traits_t< UT >::spec );
1608 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1609 __kmp_str_free( &buff );
1610 }
1611 #endif
1612 } // if
1613 } // if
1614 break;
1615 } // case
1616 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1617 case kmp_sch_static_balanced:
1618 {
1619 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1620 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1621 pr->u.p.count = 1;
1622 *p_lb = pr->u.p.lb;
1623 *p_ub = pr->u.p.ub;
1624 last = pr->u.p.parm1;
1625 if ( p_last ) {
1626 *p_last = last;
1627 }
1628 if ( p_st )
1629 *p_st = pr->u.p.st;
1630 } else { /* no iterations to do */
1631 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1632 }
1633 if ( pr->ordered ) {
1634 #ifdef KMP_DEBUG
1635 {
1636 const char * buff;
1637 // create format specifiers before the debug output
1638 buff = __kmp_str_format(
1639 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1640 traits_t< UT >::spec, traits_t< UT >::spec );
1641 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1642 __kmp_str_free( &buff );
1643 }
1644 #endif
1645 } // if
1646 } // case
1647 break;
1648 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1649 case kmp_sch_static_chunked:
1650 {
1651 T parm1;
1652
1653 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1654 gtid ) );
1655 parm1 = pr->u.p.parm1;
1656
1657 trip = pr->u.p.tc - 1;
1658 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1659
1660 if ( (status = (init <= trip)) != 0 ) {
1661 start = pr->u.p.lb;
1662 incr = pr->u.p.st;
1663 limit = parm1 + init - 1;
1664
1665 if ( (last = (limit >= trip)) != 0 )
1666 limit = trip;
1667
1668 if ( p_last ) {
1669 *p_last = last;
1670 }
1671 if ( p_st != 0 ) *p_st = incr;
1672
1673 pr->u.p.count += team->t.t_nproc;
1674
1675 if ( incr == 1 ) {
1676 *p_lb = start + init;
1677 *p_ub = start + limit;
1678 }
1679 else {
1680 *p_lb = start + init * incr;
1681 *p_ub = start + limit * incr;
1682 }
1683
1684 if ( pr->ordered ) {
1685 pr->u.p.ordered_lower = init;
1686 pr->u.p.ordered_upper = limit;
1687 #ifdef KMP_DEBUG
1688 {
1689 const char * buff;
1690 // create format specifiers before the debug output
1691 buff = __kmp_str_format(
1692 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1693 traits_t< UT >::spec, traits_t< UT >::spec );
1694 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1695 __kmp_str_free( &buff );
1696 }
1697 #endif
1698 } // if
1699 } // if
1700 } // case
1701 break;
1702
1703 case kmp_sch_dynamic_chunked:
1704 {
1705 T chunk = pr->u.p.parm1;
1706
1707 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1708 gtid ) );
1709
1710 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1711 trip = pr->u.p.tc - 1;
1712
1713 if ( (status = (init <= trip)) == 0 ) {
1714 *p_lb = 0;
1715 *p_ub = 0;
1716 if ( p_st != 0 ) *p_st = 0;
1717 } else {
1718 start = pr->u.p.lb;
1719 limit = chunk + init - 1;
1720 incr = pr->u.p.st;
1721
1722 if ( (last = (limit >= trip)) != 0 )
1723 limit = trip;
1724 if ( p_last ) {
1725 *p_last = last;
1726 }
1727 if ( p_st != 0 ) *p_st = incr;
1728
1729 if ( incr == 1 ) {
1730 *p_lb = start + init;
1731 *p_ub = start + limit;
1732 } else {
1733 *p_lb = start + init * incr;
1734 *p_ub = start + limit * incr;
1735 }
1736
1737 if ( pr->ordered ) {
1738 pr->u.p.ordered_lower = init;
1739 pr->u.p.ordered_upper = limit;
1740 #ifdef KMP_DEBUG
1741 {
1742 const char * buff;
1743 // create format specifiers before the debug output
1744 buff = __kmp_str_format(
1745 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1746 traits_t< UT >::spec, traits_t< UT >::spec );
1747 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1748 __kmp_str_free( &buff );
1749 }
1750 #endif
1751 } // if
1752 } // if
1753 } // case
1754 break;
1755
1756 case kmp_sch_guided_iterative_chunked:
1757 {
1758 T chunkspec = pr->u.p.parm1;
1759 KD_TRACE(100,
1760 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1761 trip = pr->u.p.tc;
1762 // Start atomic part of calculations
1763 while(1) {
1764 ST remaining; // signed, because can be < 0
1765 init = sh->u.s.iteration; // shared value
1766 remaining = trip - init;
1767 if ( remaining <= 0 ) { // AC: need to compare with 0 first
1768 // nothing to do, don't try atomic op
1769 status = 0;
1770 break;
1771 }
1772 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1773                         // use dynamic-style schedule
1774                         // atomically increment iterations, get old value
1775 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1776 remaining = trip - init;
1777 if (remaining <= 0) {
1778 status = 0; // all iterations got by other threads
1779 } else {
1780 // got some iterations to work on
1781 status = 1;
1782 if ( (T)remaining > chunkspec ) {
1783 limit = init + chunkspec - 1;
1784 } else {
1785 last = 1; // the last chunk
1786 limit = init + remaining - 1;
1787 } // if
1788 } // if
1789 break;
1790 } // if
1791 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1792 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1793 // CAS was successful, chunk obtained
1794 status = 1;
1795 --limit;
1796 break;
1797 } // if
1798 } // while
1799 if ( status != 0 ) {
1800 start = pr->u.p.lb;
1801 incr = pr->u.p.st;
1802 if ( p_st != NULL )
1803 *p_st = incr;
1804 if ( p_last != NULL )
1805 *p_last = last;
1806 *p_lb = start + init * incr;
1807 *p_ub = start + limit * incr;
1808 if ( pr->ordered ) {
1809 pr->u.p.ordered_lower = init;
1810 pr->u.p.ordered_upper = limit;
1811 #ifdef KMP_DEBUG
1812 {
1813 const char * buff;
1814 // create format specifiers before the debug output
1815 buff = __kmp_str_format(
1816 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1817 traits_t< UT >::spec, traits_t< UT >::spec );
1818 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1819 __kmp_str_free( &buff );
1820 }
1821 #endif
1822 } // if
1823 } else {
1824 *p_lb = 0;
1825 *p_ub = 0;
1826 if ( p_st != NULL )
1827 *p_st = 0;
1828 } // if
1829 } // case
1830 break;
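        /*
         * Added commentary on the guided_iterative case above.  While many
         * iterations remain, a thread proposes taking a fraction of them,
         * limit = init + remaining * parm3, via compare_and_swap on the shared
         * iteration counter; parm3 is assumed (it is set up in __kmp_dispatch_init,
         * not shown here) to hold roughly 1/(K*nproc) with K = 2 by default, so a
         * successful grab takes about remaining/(2*nproc) iterations.  Once
         * remaining falls below parm2 (about K*nproc*(chunk+1)), the code falls
         * back to plain dynamic scheduling with chunks of chunkspec iterations.
         */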
1831
1832 case kmp_sch_guided_analytical_chunked:
1833 {
1834 T chunkspec = pr->u.p.parm1;
1835 UT chunkIdx;
1836 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1837 /* for storing original FPCW value for Windows* OS on
1838 IA-32 architecture 8-byte version */
1839 unsigned int oldFpcw;
1840             unsigned int fpcwSet = 0;
1841         #endif
1842 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1843 gtid ) );
1844
1845 trip = pr->u.p.tc;
1846
1847 KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1848 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1849
1850 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1851 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1852 if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1853 --trip;
1854 /* use dynamic-style scheduling */
1855 init = chunkIdx * chunkspec + pr->u.p.count;
1856 /* need to verify init > 0 in case of overflow in the above calculation */
1857 if ( (status = (init > 0 && init <= trip)) != 0 ) {
1858 limit = init + chunkspec -1;
1859
1860 if ( (last = (limit >= trip)) != 0 )
1861 limit = trip;
1862 }
1863 break;
1864 } else {
1865 /* use exponential-style scheduling */
1866                     /* The following check is to work around the lack of long double precision on Windows* OS.
1867 This check works around the possible effect that init != 0 for chunkIdx == 0.
1868 */
1869 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1870 /* If we haven't already done so, save original
1871 FPCW and set precision to 64-bit, as Windows* OS
1872 on IA-32 architecture defaults to 53-bit */
1873 if ( !fpcwSet ) {
1874                         oldFpcw = _control87(0,0);
1875 _control87(_PC_64,_MCW_PC);
1876                         fpcwSet = 0x30000;
1877 }
1878 #endif
1879 if ( chunkIdx ) {
1880 init = __kmp_dispatch_guided_remaining< T >(
1881 trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1882 KMP_DEBUG_ASSERT(init);
1883 init = trip - init;
1884 } else
1885 init = 0;
1886 limit = trip - __kmp_dispatch_guided_remaining< T >(
1887 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1888 KMP_ASSERT(init <= limit);
1889 if ( init < limit ) {
1890 KMP_DEBUG_ASSERT(limit <= trip);
1891 --limit;
1892 status = 1;
1893 break;
1894 } // if
1895 } // if
1896 } // while (1)
1897 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1898                 /* restore FPCW if necessary
1899 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1900 */
1901 if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1902 _control87(oldFpcw,_MCW_PC);
1903             #endif
1904 if ( status != 0 ) {
1905 start = pr->u.p.lb;
1906 incr = pr->u.p.st;
1907 if ( p_st != NULL )
1908 *p_st = incr;
1909 if ( p_last != NULL )
1910 *p_last = last;
1911 *p_lb = start + init * incr;
1912 *p_ub = start + limit * incr;
1913 if ( pr->ordered ) {
1914 pr->u.p.ordered_lower = init;
1915 pr->u.p.ordered_upper = limit;
1916 #ifdef KMP_DEBUG
1917 {
1918 const char * buff;
1919 // create format specifiers before the debug output
1920 buff = __kmp_str_format(
1921 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1922 traits_t< UT >::spec, traits_t< UT >::spec );
1923 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1924 __kmp_str_free( &buff );
1925 }
1926 #endif
1927 }
1928 } else {
1929 *p_lb = 0;
1930 *p_ub = 0;
1931 if ( p_st != NULL )
1932 *p_st = 0;
1933 }
1934 } // case
1935 break;
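        /*
         * Added commentary on the x87 control-word handling in the analytical case
         * above: 0x30000 is assumed to be the _MCW_PC precision-control mask, so
         * fpcwSet serves both as a "precision was changed" flag and as the mask for
         * deciding whether the saved control word needs restoring; oldFpcw & fpcwSet
         * is zero when the previous setting was already _PC_64.
         */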
1936
1937 case kmp_sch_trapezoidal:
1938 {
1939 UT index;
1940 T parm2 = pr->u.p.parm2;
1941 T parm3 = pr->u.p.parm3;
1942 T parm4 = pr->u.p.parm4;
1943 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
1944 gtid ) );
1945
1946 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
1947
1948 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
1949 trip = pr->u.p.tc - 1;
1950
1951 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
1952 *p_lb = 0;
1953 *p_ub = 0;
1954 if ( p_st != 0 ) *p_st = 0;
1955 } else {
1956 start = pr->u.p.lb;
1957 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
1958 incr = pr->u.p.st;
1959
1960 if ( (last = (limit >= trip)) != 0 )
1961 limit = trip;
1962
1963 if ( p_last != 0 ) {
1964 *p_last = last;
1965 }
1966 if ( p_st != 0 ) *p_st = incr;
1967
1968 if ( incr == 1 ) {
1969 *p_lb = start + init;
1970 *p_ub = start + limit;
1971 } else {
1972 *p_lb = start + init * incr;
1973 *p_ub = start + limit * incr;
1974 }
1975
1976 if ( pr->ordered ) {
1977 pr->u.p.ordered_lower = init;
1978 pr->u.p.ordered_upper = limit;
1979 #ifdef KMP_DEBUG
1980 {
1981 const char * buff;
1982 // create format specifiers before the debug output
1983 buff = __kmp_str_format(
1984 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1985 traits_t< UT >::spec, traits_t< UT >::spec );
1986 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1987 __kmp_str_free( &buff );
1988 }
1989 #endif
1990 } // if
1991 } // if
1992 } // case
1993 break;
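        /*
         * Added commentary on the trapezoidal arithmetic above; the parm meanings
         * are inferred from this use (they are set up in __kmp_dispatch_init, not
         * shown here): chunk i is assumed to have size parm2 - i*parm4, shrinking
         * linearly over parm3 chunks, so the first iteration of chunk "index" is
         * the arithmetic-series prefix sum
         *     init = sum_{i=0..index-1} (parm2 - i*parm4)
         *          = ( index * (2*parm2 - (index-1)*parm4) ) / 2
         * and limit is the same sum taken one chunk further, minus one.
         */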
1994 } // switch
1995     } // if (tc == 0)
1996
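    /*
     * Added commentary: a thread that finds no more chunks (status == 0)
     * atomically bumps the shared num_done counter.  The last of the
     * team->t.t_nproc threads to do so resets the shared buffer state and
     * advances buffer_index by KMP_MAX_DISP_BUF so this dispatch buffer can be
     * recycled by a later loop; every finishing thread then detaches its ordered
     * hooks and its current private/shared buffer pointers.
     */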
1997 if ( status == 0 ) {
1998 UT num_done;
1999
2000 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2001 #ifdef KMP_DEBUG
2002 {
2003 const char * buff;
2004 // create format specifiers before the debug output
2005 buff = __kmp_str_format(
2006 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2007 traits_t< UT >::spec );
2008 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2009 __kmp_str_free( &buff );
2010 }
2011 #endif
2012
2013 if ( num_done == team->t.t_nproc-1 ) {
2014 /* NOTE: release this buffer to be reused */
2015
2016 KMP_MB(); /* Flush all pending memory write invalidates. */
2017
2018 sh->u.s.num_done = 0;
2019 sh->u.s.iteration = 0;
2020
2021 /* TODO replace with general release procedure? */
2022 if ( pr->ordered ) {
2023 sh->u.s.ordered_iteration = 0;
2024 }
2025
2026 KMP_MB(); /* Flush all pending memory write invalidates. */
2027
2028 sh -> buffer_index += KMP_MAX_DISP_BUF;
2029 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2030 gtid, sh->buffer_index) );
2031
2032 KMP_MB(); /* Flush all pending memory write invalidates. */
2033
2034 } // if
2035 if ( __kmp_env_consistency_check ) {
2036 if ( pr->pushed_ws != ct_none ) {
2037 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2038 }
2039 }
2040
2041 th -> th.th_dispatch -> th_deo_fcn = NULL;
2042 th -> th.th_dispatch -> th_dxo_fcn = NULL;
2043 th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2044 th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2045 } // if (status == 0)
2046#if KMP_OS_WINDOWS
2047 else if ( last ) {
2048 pr->u.p.last_upper = pr->u.p.ub;
2049 }
2050#endif /* KMP_OS_WINDOWS */
2051 } // if
2052
2053 #ifdef KMP_DEBUG
2054 {
2055 const char * buff;
2056 // create format specifiers before the debug output
2057 buff = __kmp_str_format(
2058 "__kmp_dispatch_next: T#%%d normal case: " \
2059 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2060 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2061 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2062 __kmp_str_free( &buff );
2063 }
2064 #endif
2065 return status;
2066}
2067
2068//-----------------------------------------------------------------------------------------
2069// Dispatch routines
2070// Transfer call to template< type T >
2071// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2072// T lb, T ub, ST st, ST chunk )
2073extern "C" {
2074
2075/*!
2076@ingroup WORK_SHARING
2077@{
2078@param loc Source location
2079@param gtid Global thread id
2080@param schedule Schedule type
2081@param lb Lower bound
2082@param ub Upper bound
2083@param st Step (or increment if you prefer)
2084@param chunk The chunk size to block with
2085
2086This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2087These functions are all identical apart from the types of the arguments.
2088*/
2089
2090void
2091__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2092 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2093{
2094 KMP_DEBUG_ASSERT( __kmp_init_serial );
2095 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2096}
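/*
 * Usage sketch (illustration only, not part of the runtime): a compiler lowering
 * "#pragma omp for schedule(dynamic, 4)" over i = 0..n-1 with unit stride might
 * initialize the dispatcher as below; loc, gtid, n and the chunk of 4 are
 * assumptions of the example.
 *
 *     __kmpc_dispatch_init_4( &loc, gtid, kmp_sch_dynamic_chunked,
 *                             0, n - 1, 1, 4 );
 */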
2097/*!
2098See @ref __kmpc_dispatch_init_4
2099*/
2100void
2101__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2102 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2103{
2104 KMP_DEBUG_ASSERT( __kmp_init_serial );
2105 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2106}
2107
2108/*!
2109See @ref __kmpc_dispatch_init_4
2110*/
2111void
2112__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2113 kmp_int64 lb, kmp_int64 ub,
2114 kmp_int64 st, kmp_int64 chunk )
2115{
2116 KMP_DEBUG_ASSERT( __kmp_init_serial );
2117 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2118}
2119
2120/*!
2121See @ref __kmpc_dispatch_init_4
2122*/
2123void
2124__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2125 kmp_uint64 lb, kmp_uint64 ub,
2126 kmp_int64 st, kmp_int64 chunk )
2127{
2128 KMP_DEBUG_ASSERT( __kmp_init_serial );
2129 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2130}
2131
2132/*!
2133@param loc Source code location
2134@param gtid Global thread id
2135@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2136@param p_lb Pointer to the lower bound for the next chunk of work
2137@param p_ub Pointer to the upper bound for the next chunk of work
2138@param p_st Pointer to the stride for the next chunk of work
2139@return one if there is work to be done, zero otherwise
2140
2141Get the next dynamically allocated chunk of work for this thread.
2142If there is no more work, then lb, ub and stride need not be modified.
2143*/
2144int
2145__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2146 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2147{
2148 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2149}
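/*
 * Usage sketch (illustration only), continuing the __kmpc_dispatch_init_4 example
 * above: each thread keeps requesting chunks until the runtime reports that no
 * work is left; "body" is a placeholder for the user's loop body.
 *
 *     kmp_int32 lb, ub, st, last;
 *     while ( __kmpc_dispatch_next_4( &loc, gtid, &last, &lb, &ub, &st ) ) {
 *         for ( kmp_int32 i = lb; i <= ub; i += st )
 *             body( i );
 *     }
 */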
2150
2151/*!
2152See @ref __kmpc_dispatch_next_4
2153*/
2154int
2155__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2156 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2157{
2158 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2159}
2160
2161/*!
2162See @ref __kmpc_dispatch_next_4
2163*/
2164int
2165__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2166 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2167{
2168 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2169}
2170
2171/*!
2172See @ref __kmpc_dispatch_next_4
2173*/
2174int
2175__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2176 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2177{
2178 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2179}
2180
2181/*!
2182@param loc Source code location
2183@param gtid Global thread id
2184
2185Mark the end of a dynamic loop.
2186*/
2187void
2188__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2189{
2190 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2191}
2192
2193/*!
2194See @ref __kmpc_dispatch_fini_4
2195*/
2196void
2197__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2198{
2199 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2200}
2201
2202/*!
2203See @ref __kmpc_dispatch_fini_4
2204*/
2205void
2206__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2207{
2208 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2209}
2210
2211/*!
2212See @ref __kmpc_dispatch_fini_4
2213*/
2214void
2215__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2216{
2217 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2218}
2219/*! @} */
2220
2221//-----------------------------------------------------------------------------------------
2222// Non-template routines from kmp_dispatch.cpp used in other sources
2223
2224kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2225 return value == checker;
2226}
2227
2228kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2229 return value != checker;
2230}
2231
2232kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2233 return value < checker;
2234}
2235
2236kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2237 return value >= checker;
2238}
2239
2240kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2241 return value <= checker;
2242}
2243kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2244 return value == checker;
2245}
2246
2247kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2248 return value != checker;
2249}
2250
2251kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2252 return value < checker;
2253}
2254
2255kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2256 return value >= checker;
2257}
2258
2259kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2260 return value <= checker;
2261}
2262
2263kmp_uint32
2264__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2265 kmp_uint32 checker,
2266 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2267 , void * obj // Higher-level synchronization object, or NULL.
2268 )
2269{
2270 // note: we may not belong to a team at this point
2271 register volatile kmp_uint32 * spin = spinner;
2272 register kmp_uint32 check = checker;
2273 register kmp_uint32 spins;
2274 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2275 register kmp_uint32 r;
2276
2277 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2278 KMP_INIT_YIELD( spins );
2279 // main wait spin loop
2280 while(!f(r = TCR_4(*spin), check)) {
2281 KMP_FSYNC_SPIN_PREPARE( obj );
2282 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2283 It causes problems with infinite recursion because of exit lock */
2284 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2285 __kmp_abort_thread(); */
2286
2287 __kmp_static_delay(TRUE);
2288
2289 /* if we have waited a bit, or are oversubscribed, yield */
2290 /* pause is in the following code */
2291 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2292 KMP_YIELD_SPIN( spins );
2293 }
2294 KMP_FSYNC_SPIN_ACQUIRED( obj );
2295 return r;
2296}
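/*
 * Usage sketch (illustration only): spin, yielding as appropriate, until a shared
 * 32-bit location satisfies one of the predicates defined above; "flag" is a
 * hypothetical variable introduced for the example.
 *
 *     volatile kmp_uint32 flag = 0;   // set to 1 elsewhere by another thread
 *     __kmp_wait_yield_4( &flag, 1, __kmp_eq_4, NULL );  // returns once flag == 1
 */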
2297
2298kmp_uint64
2299__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2300 kmp_uint64 checker,
2301 kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2302 , void * obj // Higher-level synchronization object, or NULL.
2303 )
2304{
2305 // note: we may not belong to a team at this point
2306 register volatile kmp_uint64 * spin = spinner;
2307 register kmp_uint64 check = checker;
2308 register kmp_uint32 spins;
2309 register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2310 register kmp_uint64 r;
2311
2312 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2313 KMP_INIT_YIELD( spins );
2314 // main wait spin loop
2315 while(!f(r = *spin, check))
2316 {
2317 KMP_FSYNC_SPIN_PREPARE( obj );
2318 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2319 It causes problems with infinite recursion because of exit lock */
2320 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2321 __kmp_abort_thread(); */
2322
2323 __kmp_static_delay(TRUE);
2324
2325 // if we are oversubscribed,
2326                 // or have waited a bit (and KMP_LIBRARY=throughput), then yield
2327 // pause is in the following code
2328 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2329 KMP_YIELD_SPIN( spins );
2330 }
2331 KMP_FSYNC_SPIN_ACQUIRED( obj );
2332 return r;
2333}
2334
2335} // extern "C"
2336
2337#ifdef KMP_GOMP_COMPAT
2338
2339void
2340__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2341 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2342 kmp_int32 chunk, int push_ws )
2343{
2344 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2345 push_ws );
2346}
2347
2348void
2349__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2350 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2351 kmp_int32 chunk, int push_ws )
2352{
2353 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2354 push_ws );
2355}
2356
2357void
2358__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2359 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2360 kmp_int64 chunk, int push_ws )
2361{
2362 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2363 push_ws );
2364}
2365
2366void
2367__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2368 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2369 kmp_int64 chunk, int push_ws )
2370{
2371 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2372 push_ws );
2373}
2374
2375void
2376__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2377{
2378 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2379}
2380
2381void
2382__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2383{
2384 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2385}
2386
2387void
2388__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2389{
2390 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2391}
2392
2393void
2394__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2395{
2396 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2397}
2398
2399#endif /* KMP_GOMP_COMPAT */
2400
2401/* ------------------------------------------------------------------------ */
2402/* ------------------------------------------------------------------------ */
2403