1/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3 * $Revision: 42624 $
4 * $Date: 2013-08-27 10:53:11 -0500 (Tue, 27 Aug 2013) $
5 */
6
7
8//===----------------------------------------------------------------------===//
9//
10// The LLVM Compiler Infrastructure
11//
12// This file is dual licensed under the MIT and the University of Illinois Open
13// Source Licenses. See LICENSE.txt for details.
14//
15//===----------------------------------------------------------------------===//
16
17
18/*
19 * Dynamic scheduling initialization and dispatch.
20 *
21 * NOTE: __kmp_nth is a constant inside any dispatch loop; however,
22 * it may change between parallel regions. __kmp_max_nth
23 * is the largest value __kmp_nth may take, and 1 is the smallest.
24 *
25 */
26
27/* ------------------------------------------------------------------------ */
28/* ------------------------------------------------------------------------ */
29
30#include "kmp.h"
31#include "kmp_i18n.h"
32#include "kmp_itt.h"
33#include "kmp_str.h"
34#include "kmp_error.h"
35#if KMP_OS_WINDOWS && KMP_ARCH_X86
36 #include <float.h>
37#endif
38
39/* ------------------------------------------------------------------------ */
40/* ------------------------------------------------------------------------ */
41
42#ifdef KMP_STATIC_STEAL_ENABLED
43
44 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
45 template< typename T >
46 struct dispatch_private_infoXX_template {
47 typedef typename traits_t< T >::unsigned_t UT;
48 typedef typename traits_t< T >::signed_t ST;
49 UT count; // unsigned
50 T ub;
51 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
52 T lb;
53 ST st; // signed
54 UT tc; // unsigned
55 T static_steal_counter; // for static_steal only; maybe better to put after ub
56
57 /* parm[1-4] are used in different ways by different scheduling algorithms */
58
59 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
60 // a) parm3 is properly aligned and
61 // b) all parm1-4 are in the same cache line.
62 // Because parm1-4 are used together, performance seems to be better
63 // if they are in the same line (not measured though).
64
65 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
66 T parm1;
67 T parm2;
68 T parm3;
69 T parm4;
70 };
71
72 UT ordered_lower; // unsigned
73 UT ordered_upper; // unsigned
74 #if KMP_OS_WINDOWS
75 T last_upper;
76 #endif /* KMP_OS_WINDOWS */
77 };
78
79#else /* KMP_STATIC_STEAL_ENABLED */
80
81 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
82 template< typename T >
83 struct dispatch_private_infoXX_template {
84 typedef typename traits_t< T >::unsigned_t UT;
85 typedef typename traits_t< T >::signed_t ST;
86 T lb;
87 T ub;
88 ST st; // signed
89 UT tc; // unsigned
90
91 T parm1;
92 T parm2;
93 T parm3;
94 T parm4;
95
96 UT count; // unsigned
97
98 UT ordered_lower; // unsigned
99 UT ordered_upper; // unsigned
100 #if KMP_OS_WINDOWS
101 T last_upper;
102 #endif /* KMP_OS_WINDOWS */
103 };
104
105#endif /* KMP_STATIC_STEAL_ENABLED */
106
107// replaces dispatch_private_info structure and dispatch_private_info_t type
108template< typename T >
109struct KMP_ALIGN_CACHE dispatch_private_info_template {
110 // duplicate the alignment here, otherwise the size of the structure is not correct in our compiler
111 union KMP_ALIGN_CACHE private_info_tmpl {
112 dispatch_private_infoXX_template< T > p;
113 dispatch_private_info64_t p64;
114 } u;
115 enum sched_type schedule; /* scheduling algorithm */
116 kmp_uint32 ordered; /* ordered clause specified */
117 kmp_uint32 ordered_bumped;
118 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making order
119 dispatch_private_info * next; /* stack of buffers for nest of serial regions */
120 kmp_uint32 nomerge; /* don't merge iters if serialized */
121 kmp_uint32 type_size;
122 enum cons_type pushed_ws;
123};
124
125
126// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
127template< typename UT >
128struct dispatch_shared_infoXX_template {
129 /* chunk index under dynamic, number of idle threads under static-steal;
130 iteration index otherwise */
131 volatile UT iteration;
132 volatile UT num_done;
133 volatile UT ordered_iteration;
134 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
135};
136
137// replaces dispatch_shared_info structure and dispatch_shared_info_t type
138template< typename UT >
139struct dispatch_shared_info_template {
140 // we need union here to keep the structure size
141 union shared_info_tmpl {
142 dispatch_shared_infoXX_template< UT > s;
143 dispatch_shared_info64_t s64;
144 } u;
145 volatile kmp_uint32 buffer_index;
146};
147
148/* ------------------------------------------------------------------------ */
149/* ------------------------------------------------------------------------ */
150
151static void
152__kmp_static_delay( int arg )
153{
154 /* Work around weird code-gen bug that causes assert to trip */
155 #if KMP_ARCH_X86_64 && KMP_OS_LINUX
156 #else
157 KMP_ASSERT( arg >= 0 );
158 #endif
159}
160
161static void
162__kmp_static_yield( int arg )
163{
164 __kmp_yield( arg );
165}
166
167#undef USE_TEST_LOCKS
168
169// test_then_add template (general template should NOT be used)
170template< typename T >
171static __forceinline T
172test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
173
174template<>
175__forceinline kmp_int32
176test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
177{
178 kmp_int32 r;
179 r = KMP_TEST_THEN_ADD32( p, d );
180 return r;
181}
182
183template<>
184__forceinline kmp_int64
185test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
186{
187 kmp_int64 r;
188 r = KMP_TEST_THEN_ADD64( p, d );
189 return r;
190}
191
192// test_then_inc_acq template (general template should NOT be used)
193template< typename T >
194static __forceinline T
195test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
196
197template<>
198__forceinline kmp_int32
199test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
200{
201 kmp_int32 r;
202 r = KMP_TEST_THEN_INC_ACQ32( p );
203 return r;
204}
205
206template<>
207__forceinline kmp_int64
208test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
209{
210 kmp_int64 r;
211 r = KMP_TEST_THEN_INC_ACQ64( p );
212 return r;
213}
214
215// test_then_inc template (general template should NOT be used)
216template< typename T >
217static __forceinline T
218test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
219
220template<>
221__forceinline kmp_int32
222test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
223{
224 kmp_int32 r;
225 r = KMP_TEST_THEN_INC32( p );
226 return r;
227}
228
229template<>
230__forceinline kmp_int64
231test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
232{
233 kmp_int64 r;
234 r = KMP_TEST_THEN_INC64( p );
235 return r;
236}
237
238// compare_and_swap template (general template should NOT be used)
239template< typename T >
240static __forceinline kmp_int32
241compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
242
243template<>
244__forceinline kmp_int32
245compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
246{
247 return KMP_COMPARE_AND_STORE_REL32( p, c, s );
248}
249
250template<>
251__forceinline kmp_int32
252compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
253{
254 return KMP_COMPARE_AND_STORE_REL64( p, c, s );
255}
256
257/*
258 Spin wait loop that first does pause, then yield.
259 Waits until function returns non-zero when called with *spinner and check.
260 Does NOT put threads to sleep.
261#if USE_ITT_BUILD
262 Arguments:
263 obj -- the higher-level synchronization object to report to ittnotify. It is used to report
264 locks consistently. For example, if the lock is acquired immediately, its address is
265 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
266 immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
267 address, not the address of the low-level spinner.
268#endif // USE_ITT_BUILD
269*/
270template< typename UT >
271// ToDo: make inline function (move to header file for icl)
272static UT // unsigned 4- or 8-byte type
273__kmp_wait_yield( volatile UT * spinner,
274 UT checker,
275 kmp_uint32 (* pred)( UT, UT )
276 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
277 )
278{
279 // note: we may not belong to a team at this point
280 register volatile UT * spin = spinner;
281 register UT check = checker;
282 register kmp_uint32 spins;
283 register kmp_uint32 (*f) ( UT, UT ) = pred;
284 register UT r;
285
286 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
287 KMP_INIT_YIELD( spins );
288 // main wait spin loop
289 while(!f(r = *spin, check))
290 {
291 KMP_FSYNC_SPIN_PREPARE( obj );
292 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
293 It causes problems with infinite recursion because of exit lock */
294 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
295 __kmp_abort_thread(); */
296
297 __kmp_static_delay(TRUE);
298
299 // If we are oversubscribed,
300 // or have waited a bit (and KMP_LIBRARY=throughput), then yield;
301 // the pause is in the following code.
302 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
303 KMP_YIELD_SPIN( spins );
304 }
305 KMP_FSYNC_SPIN_ACQUIRED( obj );
306 return r;
307}
308
309template< typename UT >
310static kmp_uint32 __kmp_eq( UT value, UT checker) {
311 return value == checker;
312}
313
314template< typename UT >
315static kmp_uint32 __kmp_neq( UT value, UT checker) {
316 return value != checker;
317}
318
319template< typename UT >
320static kmp_uint32 __kmp_lt( UT value, UT checker) {
321 return value < checker;
322}
323
324template< typename UT >
325static kmp_uint32 __kmp_ge( UT value, UT checker) {
326 return value >= checker;
327}
328
329template< typename UT >
330static kmp_uint32 __kmp_le( UT value, UT checker) {
331 return value <= checker;
332}
333
334
335/* ------------------------------------------------------------------------ */
336/* ------------------------------------------------------------------------ */
337
338static void
339__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
340{
341 kmp_info_t *th;
342
343 KMP_DEBUG_ASSERT( gtid_ref );
344
345 if ( __kmp_env_consistency_check ) {
346 th = __kmp_threads[*gtid_ref];
347 if ( th -> th.th_root -> r.r_active
348 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
349 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
350 }
351 }
352}
353
354template< typename UT >
355static void
356__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
357{
358 typedef typename traits_t< UT >::signed_t ST;
359 dispatch_private_info_template< UT > * pr;
360
361 int gtid = *gtid_ref;
362// int cid = *cid_ref;
363 kmp_info_t *th = __kmp_threads[ gtid ];
364 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
365
366 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
367 if ( __kmp_env_consistency_check ) {
368 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
369 ( th -> th.th_dispatch -> th_dispatch_pr_current );
370 if ( pr -> pushed_ws != ct_none ) {
371 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
372 }
373 }
374
375 if ( ! th -> th.th_team -> t.t_serialized ) {
376 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
377 ( th -> th.th_dispatch -> th_dispatch_sh_current );
378 UT lower;
379
380 if ( ! __kmp_env_consistency_check ) {
381 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
382 ( th -> th.th_dispatch -> th_dispatch_pr_current );
383 }
384 lower = pr->u.p.ordered_lower;
385
386 #if ! defined( KMP_GOMP_COMPAT )
387 if ( __kmp_env_consistency_check ) {
388 if ( pr->ordered_bumped ) {
389 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
390 __kmp_error_construct2(
391 kmp_i18n_msg_CnsMultipleNesting,
392 ct_ordered_in_pdo, loc_ref,
393 & p->stack_data[ p->w_top ]
394 );
395 }
396 }
397 #endif /* !defined(KMP_GOMP_COMPAT) */
398
399 KMP_MB();
400 #ifdef KMP_DEBUG
401 {
402 const char * buff;
403 // create format specifiers before the debug output
404 buff = __kmp_str_format(
405 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
406 traits_t< UT >::spec, traits_t< UT >::spec );
407 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
408 __kmp_str_free( &buff );
409 }
410 #endif
411
412 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
413 USE_ITT_BUILD_ARG( NULL )
414 );
415 KMP_MB(); /* is this necessary? */
416 #ifdef KMP_DEBUG
417 {
418 const char * buff;
419 // create format specifiers before the debug output
420 buff = __kmp_str_format(
421 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
422 traits_t< UT >::spec, traits_t< UT >::spec );
423 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
424 __kmp_str_free( &buff );
425 }
426 #endif
427 }
428 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
429}
430
431static void
432__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
433{
434 kmp_info_t *th;
435
436 if ( __kmp_env_consistency_check ) {
437 th = __kmp_threads[*gtid_ref];
438 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
439 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
440 }
441 }
442}
443
444template< typename UT >
445static void
446__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
447{
448 typedef typename traits_t< UT >::signed_t ST;
449 dispatch_private_info_template< UT > * pr;
450
451 int gtid = *gtid_ref;
452// int cid = *cid_ref;
453 kmp_info_t *th = __kmp_threads[ gtid ];
454 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
455
456 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
457 if ( __kmp_env_consistency_check ) {
458 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
459 ( th -> th.th_dispatch -> th_dispatch_pr_current );
460 if ( pr -> pushed_ws != ct_none ) {
461 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
462 }
463 }
464
465 if ( ! th -> th.th_team -> t.t_serialized ) {
466 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
467 ( th -> th.th_dispatch -> th_dispatch_sh_current );
468
469 if ( ! __kmp_env_consistency_check ) {
470 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
471 ( th -> th.th_dispatch -> th_dispatch_pr_current );
472 }
473
474 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
475 #if ! defined( KMP_GOMP_COMPAT )
476 if ( __kmp_env_consistency_check ) {
477 if ( pr->ordered_bumped != 0 ) {
478 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
479 /* How to test it? - OM */
480 __kmp_error_construct2(
481 kmp_i18n_msg_CnsMultipleNesting,
482 ct_ordered_in_pdo, loc_ref,
483 & p->stack_data[ p->w_top ]
484 );
485 }
486 }
487 #endif /* !defined(KMP_GOMP_COMPAT) */
488
489 KMP_MB(); /* Flush all pending memory write invalidates. */
490
491 pr->ordered_bumped += 1;
492
493 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
494 gtid, pr->ordered_bumped ) );
495
496 KMP_MB(); /* Flush all pending memory write invalidates. */
497
498 /* TODO use general release procedure? */
499 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
500
501 KMP_MB(); /* Flush all pending memory write invalidates. */
502 }
503 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
504}
505
506/* Computes and returns x to the power of y, where y must be a non-negative integer */
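// Worked example (illustrative values, not taken from the code): __kmp_pow(0.875L, 5)
// uses exponentiation by squaring on the bits of y = 5 (binary 101): s = 0.875 after
// bit 0, x becomes 0.875^2 = 0.765625 and then 0.875^4 ~= 0.586182, and the final
// multiply gives s = 0.875 * 0.586182 ~= 0.5129, i.e. 0.875^5, in O(log y) multiplies.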
507template< typename UT >
508static __forceinline long double
509__kmp_pow(long double x, UT y) {
510 long double s=1.0L;
511
512 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
513 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
514 while(y) {
515 if ( y & 1 )
516 s *= x;
517 x *= x;
518 y >>= 1;
519 }
520 return s;
521}
522
523/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
524 (the total number of unassigned iterations in chunks with index greater than or equal to idx).
525 __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
526 (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
527*/
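// Worked example (illustrative values): with tc = 1000, base = 0.875 and idx = 8,
// 1000 * 0.875^8 ~= 343.6, so the function reports 344 iterations still unassigned
// after the first 8 chunks (the result is rounded up unless x is exactly integral).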
528template< typename T >
529static __inline typename traits_t< T >::unsigned_t
530__kmp_dispatch_guided_remaining(
531 T tc,
532 typename traits_t< T >::floating_t base,
533 typename traits_t< T >::unsigned_t idx
534) {
535 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
536 least for ICL 8.1, long double arithmetic may not really have
537 long double precision, even with /Qlong_double. Currently, we
538 workaround that in the caller code, by manipulating the FPCW for
539 Windows* OS on IA-32 architecture. The lack of precision is not
540 expected to be a correctness issue, though.
541 */
542 typedef typename traits_t< T >::unsigned_t UT;
543
544 long double x = tc * __kmp_pow< UT >(base, idx);
545 UT r = (UT) x;
546 if ( x == r )
547 return r;
548 return r + 1;
549}
550
551// Parameters of the guided-iterative algorithm:
552// p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
553// p3 = 1 / ( n * nproc ) // remaining iterations multiplier
554 // By default n = 2. For example, with n = 3 the chunk distribution will be flatter.
555 // With n = 1 the first chunk is the same as for the static schedule, i.e. trip / nproc.
556static int guided_int_param = 2;
557static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
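// Worked example (illustrative values): with the default n = 2, nproc = 4 and chunk = 7,
// p2 = 2 * 4 * (7 + 1) = 64 and p3 = 1 / (2 * 4) = 0.125, i.e. each thread grabs about
// 1/8 of the remaining iterations per request until fewer than 64 iterations remain,
// at which point the schedule switches to plain dynamic with chunk size 7.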
558
559// UT - unsigned flavor of T, ST - signed flavor of T,
560// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
561template< typename T >
562static void
563__kmp_dispatch_init(
564 ident_t * loc,
565 int gtid,
566 enum sched_type schedule,
567 T lb,
568 T ub,
569 typename traits_t< T >::signed_t st,
570 typename traits_t< T >::signed_t chunk,
571 int push_ws
572) {
573 typedef typename traits_t< T >::unsigned_t UT;
574 typedef typename traits_t< T >::signed_t ST;
575 typedef typename traits_t< T >::floating_t DBL;
576 static const int ___kmp_size_type = sizeof( UT );
577
578 int active;
579 T tc;
580 kmp_info_t * th;
581 kmp_team_t * team;
582 kmp_uint32 my_buffer_index;
583 dispatch_private_info_template< T > * pr;
584 dispatch_shared_info_template< UT > volatile * sh;
585
586 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
587 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
588
589 if ( ! TCR_4( __kmp_init_parallel ) )
590 __kmp_parallel_initialize();
591
592 #ifdef KMP_DEBUG
593 {
594 const char * buff;
595 // create format specifiers before the debug output
596 buff = __kmp_str_format(
597 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
598 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
599 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
600 __kmp_str_free( &buff );
601 }
602 #endif
603 /* setup data */
604 th = __kmp_threads[ gtid ];
605 team = th -> th.th_team;
606 active = ! team -> t.t_serialized;
607 th->th.th_ident = loc;
608
609 if ( ! active ) {
610 pr = reinterpret_cast< dispatch_private_info_template< T >* >
611 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
612 } else {
613 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
614 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
615
616 my_buffer_index = th->th.th_dispatch->th_disp_index ++;
617
618 /* What happens when the number of threads changes? Do we need to resize the buffer? */
619 pr = reinterpret_cast< dispatch_private_info_template< T > * >
620 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
621 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
622 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
623 }
624
625 /* Pick up the nomerge/ordered bits from the scheduling type */
626 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
627 pr->nomerge = TRUE;
628 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
629 } else {
630 pr->nomerge = FALSE;
631 }
632 pr->type_size = ___kmp_size_type; // remember the size of variables
633 if ( kmp_ord_lower & schedule ) {
634 pr->ordered = TRUE;
635 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
636 } else {
637 pr->ordered = FALSE;
638 }
639 if ( schedule == kmp_sch_static ) {
640 schedule = __kmp_static;
641 } else {
642 if ( schedule == kmp_sch_runtime ) {
643 #if OMP_30_ENABLED
644 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
645 schedule = team -> t.t_sched.r_sched_type;
646 // Detail the schedule if needed (global controls are differentiated appropriately)
647 if ( schedule == kmp_sch_guided_chunked ) {
648 schedule = __kmp_guided;
649 } else if ( schedule == kmp_sch_static ) {
650 schedule = __kmp_static;
651 }
652 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
653 chunk = team -> t.t_sched.chunk;
654 #else
655 kmp_r_sched_t r_sched = __kmp_get_schedule_global();
656 // Use the scheduling specified by OMP_SCHEDULE and/or KMP_SCHEDULE or default
657 schedule = r_sched.r_sched_type;
658 chunk = r_sched.chunk;
659 #endif
660
661 #ifdef KMP_DEBUG
662 {
663 const char * buff;
664 // create format specifiers before the debug output
665 buff = __kmp_str_format(
666 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
667 traits_t< ST >::spec );
668 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
669 __kmp_str_free( &buff );
670 }
671 #endif
672 } else {
673 if ( schedule == kmp_sch_guided_chunked ) {
674 schedule = __kmp_guided;
675 }
676 if ( chunk <= 0 ) {
677 chunk = KMP_DEFAULT_CHUNK;
678 }
679 }
680
681 #if OMP_30_ENABLED
682 if ( schedule == kmp_sch_auto ) {
683 // mapping and differentiation: in the __kmp_do_serial_initialize()
684 schedule = __kmp_auto;
685 #ifdef KMP_DEBUG
686 {
687 const char * buff;
688 // create format specifiers before the debug output
689 buff = __kmp_str_format(
690 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
691 traits_t< ST >::spec );
692 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
693 __kmp_str_free( &buff );
694 }
695 #endif
696 }
697 #endif // OMP_30_ENABLED
698
699 /* guided analytical not safe for too many threads */
700 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
701 schedule = kmp_sch_guided_iterative_chunked;
702 KMP_WARNING( DispatchManyThreads );
703 }
704 pr->u.p.parm1 = chunk;
705 }
706 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
707 "unknown scheduling type" );
708
709 pr->u.p.count = 0;
710
711 if ( __kmp_env_consistency_check ) {
712 if ( st == 0 ) {
713 __kmp_error_construct(
714 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
715 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
716 );
717 }
718 }
719
720 tc = ( ub - lb + st );
721 if ( st != 1 ) {
722 if ( st < 0 ) {
723 if ( lb < ub ) {
724 tc = 0; // zero-trip
725 } else { // lb >= ub
726 tc = (ST)tc / st; // convert to signed division
727 }
728 } else { // st > 0
729 if ( ub < lb ) {
730 tc = 0; // zero-trip
731 } else { // lb >= ub
732 tc /= st;
733 }
734 }
735 } else if ( ub < lb ) { // st == 1
736 tc = 0; // zero-trip
737 }
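// Worked example (illustrative values): lb = 0, ub = 9, st = 3 gives
// tc = (9 - 0 + 3) / 3 = 4 iterations (0, 3, 6, 9); with st = 1 the same bounds
// give tc = 9 - 0 + 1 = 10; if the bounds run against the stride (e.g. st = -1
// with lb < ub) the loop is zero-trip and tc = 0.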
738
739 pr->u.p.lb = lb;
740 pr->u.p.ub = ub;
741 pr->u.p.st = st;
742 pr->u.p.tc = tc;
743
744 #if KMP_OS_WINDOWS
745 pr->u.p.last_upper = ub + st;
746 #endif /* KMP_OS_WINDOWS */
747
748 /* NOTE: only the active parallel region(s) have active ordered sections */
749
750 if ( active ) {
751 if ( pr->ordered == 0 ) {
752 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
753 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
754 } else {
755 pr->ordered_bumped = 0;
756
757 pr->u.p.ordered_lower = 1;
758 pr->u.p.ordered_upper = 0;
759
760 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
761 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
762 }
763 }
764
765 if ( __kmp_env_consistency_check ) {
766 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
767 if ( push_ws ) {
768 __kmp_push_workshare( gtid, ws, loc );
769 pr->pushed_ws = ws;
770 } else {
771 __kmp_check_workshare( gtid, ws, loc );
772 pr->pushed_ws = ct_none;
773 }
774 }
775
776 switch ( schedule ) {
777 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
778 case kmp_sch_static_steal:
779 {
780 T nproc = team->t.t_nproc;
781 T ntc, init;
782
783 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
784
785 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
786 if ( nproc > 1 && ntc >= nproc ) {
787 T id = __kmp_tid_from_gtid(gtid);
788 T small_chunk, extras;
789
790 small_chunk = ntc / nproc;
791 extras = ntc % nproc;
792
793 init = id * small_chunk + ( id < extras ? id : extras );
794 pr->u.p.count = init;
795 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
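// Worked example (illustrative values): ntc = 10 chunks, nproc = 4 gives
// small_chunk = 2, extras = 2, so threads 0..3 initially own the chunk ranges
// [0,3), [3,6), [6,8) and [8,10); 'count' is the next chunk to execute and
// 'ub' is the (exclusive) end of the range that other threads may later steal from.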
796
797 pr->u.p.parm2 = lb;
798 //pr->pfields.parm3 = 0; // it's not used in static_steal
799 pr->u.p.parm4 = id;
800 pr->u.p.st = st;
801 break;
802 } else {
803 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
804 gtid ) );
805 schedule = kmp_sch_static_balanced;
806 /* too few iterations: fall-through to kmp_sch_static_balanced */
807 } // if
808 /* FALL-THROUGH to static balanced */
809 } // case
810 #endif
811 case kmp_sch_static_balanced:
812 {
813 T nproc = team->t.t_nproc;
814 T init, limit;
815
816 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
817 gtid ) );
818
819 if ( nproc > 1 ) {
820 T id = __kmp_tid_from_gtid(gtid);
821
822 if ( tc < nproc ) {
823 if ( id < tc ) {
824 init = id;
825 limit = id;
826 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
827 } else {
828 pr->u.p.count = 1; /* means no more chunks to execute */
829 pr->u.p.parm1 = FALSE;
830 break;
831 }
832 } else {
833 T small_chunk = tc / nproc;
834 T extras = tc % nproc;
835 init = id * small_chunk + (id < extras ? id : extras);
836 limit = init + small_chunk - (id < extras ? 0 : 1);
837 pr->u.p.parm1 = (id == nproc - 1);
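// Worked example (illustrative values): tc = 10, nproc = 4 gives small_chunk = 2,
// extras = 2, so threads 0..3 get the iteration index ranges [0,2], [3,5], [6,7]
// and [8,9]; only thread nproc-1 = 3 reports lastprivate via parm1.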
838 }
839 } else {
840 if ( tc > 0 ) {
841 init = 0;
842 limit = tc - 1;
843 pr->u.p.parm1 = TRUE;
844 } else {
845 // zero trip count
846 pr->u.p.count = 1; /* means no more chunks to execute */
847 pr->u.p.parm1 = FALSE;
848 break;
849 }
850 }
851 if ( st == 1 ) {
852 pr->u.p.lb = lb + init;
853 pr->u.p.ub = lb + limit;
854 } else {
855 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
856 pr->u.p.lb = lb + init * st;
857 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
858 if ( st > 0 ) {
859 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
860 } else {
861 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
862 }
863 }
864 if ( pr->ordered ) {
865 pr->u.p.ordered_lower = init;
866 pr->u.p.ordered_upper = limit;
867 }
868 break;
869 } // case
870 case kmp_sch_guided_iterative_chunked :
871 {
872 T nproc = team->t.t_nproc;
873 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
874
875 if ( nproc > 1 ) {
876 if ( (2L * chunk + 1 ) * nproc >= tc ) {
877 /* chunk size too large, switch to dynamic */
878 schedule = kmp_sch_dynamic_chunked;
879 } else {
880 // when remaining iters become less than parm2 - switch to dynamic
881 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
882 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
883 }
884 } else {
885 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
886 schedule = kmp_sch_static_greedy;
887 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
888 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
889 pr->u.p.parm1 = tc;
890 } // if
891 } // case
892 break;
893 case kmp_sch_guided_analytical_chunked:
894 {
895 T nproc = team->t.t_nproc;
896 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
897
898 if ( nproc > 1 ) {
899 if ( (2L * chunk + 1 ) * nproc >= tc ) {
900 /* chunk size too large, switch to dynamic */
901 schedule = kmp_sch_dynamic_chunked;
902 } else {
903 /* commonly used term: (2 nproc - 1)/(2 nproc) */
904 DBL x;
905
906 #if KMP_OS_WINDOWS && KMP_ARCH_X86
907 /* Linux* OS already has 64-bit computation by default for
908 long double, and on Windows* OS on Intel(R) 64,
909 /Qlong_double doesn't work. On Windows* OS
910 on IA-32 architecture, we need to set precision to
911 64-bit instead of the default 53-bit. Even though long
912 double doesn't work on Windows* OS on Intel(R) 64, the
913 resulting lack of precision is not expected to impact
914 the correctness of the algorithm, but this has not been
915 mathematically proven.
916 */
917 // save original FPCW and set precision to 64-bit, as
918 // Windows* OS on IA-32 architecture defaults to 53-bit
919 unsigned int oldFpcw = _control87(0,0x30000);
920 #endif
921 /* value used for comparison in solver for cross-over point */
922 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
923
924 /* crossover point--chunk indexes equal to or greater than
925 this point switch to dynamic-style scheduling */
926 UT cross;
927
928 /* commonly used term: (2 nproc - 1)/(2 nproc) */
929 x = (long double)1.0 - (long double)0.5 / nproc;
930
931 #ifdef KMP_DEBUG
932 { // test natural alignment
933 struct _test_a {
934 char a;
935 union {
936 char b;
937 DBL d;
938 };
939 } t;
940 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
941 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
942 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
943 }
944 #endif // KMP_DEBUG
945
946 /* save the term in thread private dispatch structure */
947 *(DBL*)&pr->u.p.parm3 = x;
948
949 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
950 {
951 UT left, right, mid;
952 long double p;
953
954 /* estimate initial upper and lower bound */
955
956 /* doesn't matter what value right is as long as it is positive, but
957 it affects performance of the solver
958 */
959 right = 229;
960 p = __kmp_pow< UT >(x,right);
961 if ( p > target ) {
962 do{
963 p *= p;
964 right <<= 1;
965 } while(p>target && right < (1<<27));
966 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
967 } else {
968 left = 0;
969 }
970
971 /* bisection root-finding method */
972 while ( left + 1 < right ) {
973 mid = (left + right) / 2;
974 if ( __kmp_pow< UT >(x,mid) > target ) {
975 left = mid;
976 } else {
977 right = mid;
978 }
979 } // while
980 cross = right;
981 }
982 /* assert sanity of computed crossover point */
983 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
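// Worked example (illustrative values): nproc = 4, chunk = 1, tc = 1000 give
// x = 1 - 0.5/4 = 0.875 and target = (2*1 + 1) * 4 / 1000 = 0.012; since
// 0.875^33 ~= 0.0122 > 0.012 >= 0.875^34 ~= 0.0107, the bisection yields cross = 34,
// so chunk indexes >= 34 are handed out dynamic-style.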
984
985 /* save the crossover point in thread private dispatch structure */
986 pr->u.p.parm2 = cross;
987
988 // C75803
989 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
990 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
991 #else
992 #define GUIDED_ANALYTICAL_WORKAROUND (x)
993 #endif
994 /* dynamic-style scheduling offset */
995 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
996 #if KMP_OS_WINDOWS && KMP_ARCH_X86
997 // restore FPCW
998 _control87(oldFpcw,0x30000);
999 #endif
1000 } // if
1001 } else {
1002 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1003 gtid ) );
1004 schedule = kmp_sch_static_greedy;
1005 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1006 pr->u.p.parm1 = tc;
1007 } // if
1008 } // case
1009 break;
1010 case kmp_sch_static_greedy:
1011 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1012 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1013 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1014 tc;
1015 break;
1016 case kmp_sch_static_chunked :
1017 case kmp_sch_dynamic_chunked :
1018 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1019 break;
1020 case kmp_sch_trapezoidal :
1021 {
1022 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1023
1024 T parm1, parm2, parm3, parm4;
1025 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1026
1027 parm1 = chunk;
1028
1029 /* F : size of the first cycle */
1030 parm2 = ( tc / (2 * team->t.t_nproc) );
1031
1032 if ( parm2 < 1 ) {
1033 parm2 = 1;
1034 }
1035
1036 /* L : size of the last cycle. Make sure the last cycle
1037 * is not larger than the first cycle.
1038 */
1039 if ( parm1 < 1 ) {
1040 parm1 = 1;
1041 } else if ( parm1 > parm2 ) {
1042 parm1 = parm2;
1043 }
1044
1045 /* N : number of cycles */
1046 parm3 = ( parm2 + parm1 );
1047 parm3 = ( 2 * tc + parm3 - 1) / parm3;
1048
1049 if ( parm3 < 2 ) {
1050 parm3 = 2;
1051 }
1052
1053 /* sigma : decreasing incr of the trapezoid */
1054 parm4 = ( parm3 - 1 );
1055 parm4 = ( parm2 - parm1 ) / parm4;
1056
1057 // pointless check, because parm4 >= 0 always
1058 //if ( parm4 < 0 ) {
1059 // parm4 = 0;
1060 //}
1061
1062 pr->u.p.parm1 = parm1;
1063 pr->u.p.parm2 = parm2;
1064 pr->u.p.parm3 = parm3;
1065 pr->u.p.parm4 = parm4;
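// Worked example (illustrative values): tc = 1000, nproc = 4, chunk = 50 give
// parm2 = 1000 / 8 = 125 (first chunk), parm1 = 50 (minimum/last chunk),
// parm3 = (2*1000 + 174) / 175 = 12 chunks in total, and parm4 = (125 - 50) / 11 = 6,
// so successive chunks shrink roughly as 125, 119, 113, ... down toward 50.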
1066 } // case
1067 break;
1068
1069 default:
1070 {
1071 __kmp_msg(
1072 kmp_ms_fatal, // Severity
1073 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1074 KMP_HNT( GetNewerLibrary ), // Hint
1075 __kmp_msg_null // Variadic argument list terminator
1076 );
1077 }
1078 break;
1079 } // switch
1080 pr->schedule = schedule;
1081 if ( active ) {
1082 /* This buffer is free for us to use once sh->buffer_index reaches my_buffer_index */
1083
1084 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1085 gtid, my_buffer_index, sh->buffer_index) );
1086 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1087 USE_ITT_BUILD_ARG( NULL )
1088 );
1089 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1090 // *always* 32-bit integers.
1091 KMP_MB(); /* is this necessary? */
1092 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1093 gtid, my_buffer_index, sh->buffer_index) );
1094
1095 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1096 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1097#if USE_ITT_BUILD
1098 if ( pr->ordered ) {
1099 __kmp_itt_ordered_init( gtid );
1100 }; // if
1101#endif /* USE_ITT_BUILD */
1102 }; // if
1103 #ifdef KMP_DEBUG
1104 {
1105 const char * buff;
1106 // create format specifiers before the debug output
1107 buff = __kmp_str_format(
1108 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1109 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1110 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1111 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1112 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1113 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1114 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1115 KD_TRACE(10, ( buff,
1116 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1117 pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1118 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1119 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1120 __kmp_str_free( &buff );
1121 }
1122 #endif
1123 #if ( KMP_STATIC_STEAL_ENABLED )
1124 if ( ___kmp_size_type < 8 ) {
1125 // It cannot be guaranteed that after execution of a loop with some other schedule kind
1126 // all the parm3 variables will contain the same value.
1127 // Even if all parm3 values were the same, there would still be a bad case, e.g. using 0 and 1
1128 // rather than a program life-time increment.
1129 // So a dedicated variable is required; 'static_steal_counter' is used.
1130 if( schedule == kmp_sch_static_steal ) {
1131 // Other threads will inspect this variable when searching for a victim.
1132 // This is a flag showing that other threads may steal from this thread since then.
1133 volatile T * p = &pr->u.p.static_steal_counter;
1134 *p = *p + 1;
1135 }
1136 }
1137 #endif // ( KMP_STATIC_STEAL_ENABLED )
1138}
1139
1140/*
1141 * For ordered loops, either __kmp_dispatch_finish() should be called after
1142 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1143 * every chunk of iterations. If the ordered section(s) were not executed
1144 * for this iteration (or every iteration in this chunk), we need to set the
1145 * ordered iteration counters so that the next thread can proceed.
1146 */
1147template< typename UT >
1148static void
1149__kmp_dispatch_finish( int gtid, ident_t *loc )
1150{
1151 typedef typename traits_t< UT >::signed_t ST;
1152 kmp_info_t *th = __kmp_threads[ gtid ];
1153
1154 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1155 if ( ! th -> th.th_team -> t.t_serialized ) {
1156
1157 dispatch_private_info_template< UT > * pr =
1158 reinterpret_cast< dispatch_private_info_template< UT >* >
1159 ( th->th.th_dispatch->th_dispatch_pr_current );
1160 dispatch_shared_info_template< UT > volatile * sh =
1161 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1162 ( th->th.th_dispatch->th_dispatch_sh_current );
1163 KMP_DEBUG_ASSERT( pr );
1164 KMP_DEBUG_ASSERT( sh );
1165 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1166 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1167
1168 if ( pr->ordered_bumped ) {
1169 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1170 gtid ) );
1171 pr->ordered_bumped = 0;
1172 } else {
1173 UT lower = pr->u.p.ordered_lower;
1174
1175 #ifdef KMP_DEBUG
1176 {
1177 const char * buff;
1178 // create format specifiers before the debug output
1179 buff = __kmp_str_format(
1180 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1181 traits_t< UT >::spec, traits_t< UT >::spec );
1182 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1183 __kmp_str_free( &buff );
1184 }
1185 #endif
1186
1187 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1188 USE_ITT_BUILD_ARG(NULL)
1189 );
1190 KMP_MB(); /* is this necessary? */
1191 #ifdef KMP_DEBUG
1192 {
1193 const char * buff;
1194 // create format specifiers before the debug output
1195 buff = __kmp_str_format(
1196 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1197 traits_t< UT >::spec, traits_t< UT >::spec );
1198 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1199 __kmp_str_free( &buff );
1200 }
1201 #endif
1202
1203 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1204 } // if
1205 } // if
1206 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1207}
1208
1209#ifdef KMP_GOMP_COMPAT
1210
1211template< typename UT >
1212static void
1213__kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1214{
1215 typedef typename traits_t< UT >::signed_t ST;
1216 kmp_info_t *th = __kmp_threads[ gtid ];
1217
1218 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1219 if ( ! th -> th.th_team -> t.t_serialized ) {
1220// int cid;
1221 dispatch_private_info_template< UT > * pr =
1222 reinterpret_cast< dispatch_private_info_template< UT >* >
1223 ( th->th.th_dispatch->th_dispatch_pr_current );
1224 dispatch_shared_info_template< UT > volatile * sh =
1225 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1226 ( th->th.th_dispatch->th_dispatch_sh_current );
1227 KMP_DEBUG_ASSERT( pr );
1228 KMP_DEBUG_ASSERT( sh );
1229 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1230 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1231
1232// for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1233 UT lower = pr->u.p.ordered_lower;
1234 UT upper = pr->u.p.ordered_upper;
1235 UT inc = upper - lower + 1;
1236
1237 if ( pr->ordered_bumped == inc ) {
1238 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1239 gtid ) );
1240 pr->ordered_bumped = 0;
1241 } else {
1242 inc -= pr->ordered_bumped;
1243
1244 #ifdef KMP_DEBUG
1245 {
1246 const char * buff;
1247 // create format specifiers before the debug output
1248 buff = __kmp_str_format(
1249 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1250 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1251 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1252 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1253 __kmp_str_free( &buff );
1254 }
1255 #endif
1256
1257 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1258 USE_ITT_BUILD_ARG(NULL)
1259 );
1260
1261 KMP_MB(); /* is this necessary? */
1262 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1263 gtid ) );
1264 pr->ordered_bumped = 0;
1265//!!!!! TODO check if the inc should be unsigned, or signed???
1266 #ifdef KMP_DEBUG
1267 {
1268 const char * buff;
1269 // create format specifiers before the debug output
1270 buff = __kmp_str_format(
1271 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1272 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1273 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1274 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1275 __kmp_str_free( &buff );
1276 }
1277 #endif
1278
1279 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1280 }
1281// }
1282 }
1283 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1284}
1285
1286#endif /* KMP_GOMP_COMPAT */
1287
1288template< typename T >
1289static int
1290__kmp_dispatch_next(
1291 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1292) {
1293
1294 typedef typename traits_t< T >::unsigned_t UT;
1295 typedef typename traits_t< T >::signed_t ST;
1296 typedef typename traits_t< T >::floating_t DBL;
1297 static const int ___kmp_size_type = sizeof( UT );
1298
1299 int status;
1300 dispatch_private_info_template< T > * pr;
1301 kmp_info_t * th = __kmp_threads[ gtid ];
1302 kmp_team_t * team = th -> th.th_team;
1303
1304 #ifdef KMP_DEBUG
1305 {
1306 const char * buff;
1307 // create format specifiers before the debug output
1308 buff = __kmp_str_format(
1309 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1310 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1311 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1312 __kmp_str_free( &buff );
1313 }
1314 #endif
1315
1316 if ( team -> t.t_serialized ) {
1317 /* NOTE: serialize this dispatch because we are not at the active level */
1318 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1319 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1320 KMP_DEBUG_ASSERT( pr );
1321
1322 if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1323 *p_lb = 0;
1324 *p_ub = 0;
1325 if ( p_st != 0 ) {
1326 *p_st = 0;
1327 }
1328 if ( __kmp_env_consistency_check ) {
1329 if ( pr->pushed_ws != ct_none ) {
1330 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1331 }
1332 }
1333 } else if ( pr->nomerge ) {
1334 kmp_int32 last;
1335 T start;
1336 UT limit, trip, init;
1337 ST incr;
1338 T chunk = pr->u.p.parm1;
1339
1340 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1341
1342 init = chunk * pr->u.p.count++;
1343 trip = pr->u.p.tc - 1;
1344
1345 if ( (status = (init <= trip)) == 0 ) {
1346 *p_lb = 0;
1347 *p_ub = 0;
1348 if ( p_st != 0 ) *p_st = 0;
1349 if ( __kmp_env_consistency_check ) {
1350 if ( pr->pushed_ws != ct_none ) {
1351 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1352 }
1353 }
1354 } else {
1355 start = pr->u.p.lb;
1356 limit = chunk + init - 1;
1357 incr = pr->u.p.st;
1358
1359 if ( (last = (limit >= trip)) != 0 ) {
1360 limit = trip;
1361 #if KMP_OS_WINDOWS
1362 pr->u.p.last_upper = pr->u.p.ub;
1363 #endif /* KMP_OS_WINDOWS */
1364 }
1365 if ( p_last ) {
1366 *p_last = last;
1367 }
1368 if ( p_st != 0 ) {
1369 *p_st = incr;
1370 }
1371 if ( incr == 1 ) {
1372 *p_lb = start + init;
1373 *p_ub = start + limit;
1374 } else {
1375 *p_lb = start + init * incr;
1376 *p_ub = start + limit * incr;
1377 }
1378
1379 if ( pr->ordered ) {
1380 pr->u.p.ordered_lower = init;
1381 pr->u.p.ordered_upper = limit;
1382 #ifdef KMP_DEBUG
1383 {
1384 const char * buff;
1385 // create format specifiers before the debug output
1386 buff = __kmp_str_format(
1387 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1388 traits_t< UT >::spec, traits_t< UT >::spec );
1389 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1390 __kmp_str_free( &buff );
1391 }
1392 #endif
1393 } // if
1394 } // if
1395 } else {
1396 pr->u.p.tc = 0;
1397
1398 *p_lb = pr->u.p.lb;
1399 *p_ub = pr->u.p.ub;
1400 #if KMP_OS_WINDOWS
1401 pr->u.p.last_upper = *p_ub;
1402 #endif /* KMP_OS_WINDOWS */
1403
1404 if ( p_st != 0 ) {
1405 *p_st = pr->u.p.st;
1406 }
1407 if ( p_last ) {
1408 *p_last = TRUE;
1409 }
1410 } // if
1411 #ifdef KMP_DEBUG
1412 {
1413 const char * buff;
1414 // create format specifiers before the debug output
1415 buff = __kmp_str_format(
1416 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1417 "p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
1418 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1419 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, status) );
1420 __kmp_str_free( &buff );
1421 }
1422 #endif
1423 return status;
1424 } else {
1425 kmp_int32 last = 0;
1426 dispatch_shared_info_template< UT > *sh;
1427 T start;
1428 ST incr;
1429 UT limit, trip, init;
1430
1431 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1432 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1433
1434 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1435 ( th->th.th_dispatch->th_dispatch_pr_current );
1436 KMP_DEBUG_ASSERT( pr );
1437 sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1438 ( th->th.th_dispatch->th_dispatch_sh_current );
1439 KMP_DEBUG_ASSERT( sh );
1440
1441 if ( pr->u.p.tc == 0 ) {
1442 // zero trip count
1443 status = 0;
1444 } else {
1445 switch (pr->schedule) {
1446 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1447 case kmp_sch_static_steal:
1448 {
1449 T chunk = pr->u.p.parm1;
1450
1451 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1452
1453 trip = pr->u.p.tc - 1;
1454
1455 if ( ___kmp_size_type > 4 ) {
1456 // Other threads do not look into the data of this thread,
1457 // so a volatile cast is not necessary.
1458 init = ( pr->u.p.count )++;
1459 status = ( init < (UT)pr->u.p.ub );
1460 } else {
1461 typedef union {
1462 struct {
1463 UT count;
1464 T ub;
1465 } p;
1466 kmp_int64 b;
1467 } union_i4;
1468 // All operations on 'count' or 'ub' must be combined atomically together.
1469 // stealing implemented only for 4-byte indexes
1470 {
1471 union_i4 vold, vnew;
1472 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1473 vnew = vold;
1474 vnew.p.count++;
1475 while( ! KMP_COMPARE_AND_STORE_ACQ64(
1476 ( volatile kmp_int64* )&pr->u.p.count,
1477 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1478 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1479 KMP_CPU_PAUSE();
1480 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1481 vnew = vold;
1482 vnew.p.count++;
1483 }
1484 vnew = vold;
1485 init = vnew.p.count;
1486 status = ( init < (UT)vnew.p.ub ) ;
1487 }
1488
1489 if( !status ) {
1490 kmp_info_t **other_threads = team->t.t_threads;
1491 int while_limit = 10;
1492 int while_index = 0;
1493
1494 // TODO: algorithm of searching for a victim
1495 // should be cleaned up and measured
1496 while ( ( !status ) && ( while_limit != ++while_index ) ) {
1497 union_i4 vold, vnew;
1498 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1499 T victimIdx = pr->u.p.parm4;
1500 T oldVictimIdx = victimIdx;
1501 dispatch_private_info_template< T > * victim;
1502
1503 do {
1504 if( !victimIdx ) {
1505 victimIdx = team->t.t_nproc - 1;
1506 } else {
1507 --victimIdx;
1508 }
1509 victim = reinterpret_cast< dispatch_private_info_template< T >* >
1510 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1511 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1512 // TODO: think about a proper place of this test
1513 if ( ( !victim ) ||
1514 ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1515 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1516 // TODO: delay would be nice
1517 continue;
1518 // the victim is not ready yet to participate in stealing
1519 // because the victim is still in kmp_init_dispatch
1520 }
1521 if ( oldVictimIdx == victimIdx ) {
1522 break;
1523 }
1524 pr->u.p.parm4 = victimIdx;
1525
1526 while( 1 ) {
1527 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1528 vnew = vold;
1529
1530 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1531 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1532 break;
1533 }
1534 vnew.p.ub -= (remaining >> 2);
1535 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
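// Worked example (illustrative values): if the victim's descriptor holds
// count = 10 and ub = 30, then remaining = 20 and the thief tries to move ub
// down to 30 - (20 >> 2) = 25; on a successful CAS the victim keeps chunks
// 10..24 and the thief takes over chunks 25..29.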
1536 #pragma warning( push )
1537 // disable warning on pointless comparison of unsigned with 0
1538 #pragma warning( disable: 186 )
1539 KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1540 #pragma warning( pop )
1541 // TODO: Should this be acquire or release?
1542 if ( KMP_COMPARE_AND_STORE_ACQ64(
1543 ( volatile kmp_int64 * )&victim->u.p.count,
1544 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1545 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1546 status = 1;
1547 while_index = 0;
1548 // now update own count and ub
1549 #if KMP_ARCH_X86
1550 // stealing executed on non-KMP_ARCH_X86 only
1551 // Atomic 64-bit write on ia32 is
1552 // unavailable, so we do this in steps.
1553 // This code is not tested.
1554 init = vold.p.count;
1555 pr->u.p.ub = 0;
1556 pr->u.p.count = init + 1;
1557 pr->u.p.ub = vnew.p.count;
1558 #else
1559 init = vnew.p.ub;
1560 vold.p.count = init + 1;
1561 // TODO: is it safe and enough?
1562 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1563 #endif // KMP_ARCH_X86
1564 break;
1565 } // if
1566 KMP_CPU_PAUSE();
1567 } // while (1)
1568 } // while
1569 } // if
1570 } // if
1571 if ( !status ) {
1572 *p_lb = 0;
1573 *p_ub = 0;
1574 if ( p_st != 0 ) *p_st = 0;
1575 } else {
1576 start = pr->u.p.parm2;
1577 init *= chunk;
1578 limit = chunk + init - 1;
1579 incr = pr->u.p.st;
1580
1581 KMP_DEBUG_ASSERT(init <= trip);
1582 if ( (last = (limit >= trip)) != 0 )
1583 limit = trip;
1584 if ( p_last ) {
1585 *p_last = last;
1586 }
1587 if ( p_st != 0 ) *p_st = incr;
1588
1589 if ( incr == 1 ) {
1590 *p_lb = start + init;
1591 *p_ub = start + limit;
1592 } else {
1593 *p_lb = start + init * incr;
1594 *p_ub = start + limit * incr;
1595 }
1596
1597 if ( pr->ordered ) {
1598 pr->u.p.ordered_lower = init;
1599 pr->u.p.ordered_upper = limit;
1600 #ifdef KMP_DEBUG
1601 {
1602 const char * buff;
1603 // create format specifiers before the debug output
1604 buff = __kmp_str_format(
1605 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1606 traits_t< UT >::spec, traits_t< UT >::spec );
1607 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1608 __kmp_str_free( &buff );
1609 }
1610 #endif
1611 } // if
1612 } // if
1613 break;
1614 } // case
1615 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1616 case kmp_sch_static_balanced:
1617 {
1618 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1619 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1620 pr->u.p.count = 1;
1621 *p_lb = pr->u.p.lb;
1622 *p_ub = pr->u.p.ub;
1623 last = pr->u.p.parm1;
1624 if ( p_last ) {
1625 *p_last = last;
1626 }
1627 if ( p_st )
1628 *p_st = pr->u.p.st;
1629 } else { /* no iterations to do */
1630 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1631 }
1632 if ( pr->ordered ) {
1633 #ifdef KMP_DEBUG
1634 {
1635 const char * buff;
1636 // create format specifiers before the debug output
1637 buff = __kmp_str_format(
1638 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1639 traits_t< UT >::spec, traits_t< UT >::spec );
1640 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1641 __kmp_str_free( &buff );
1642 }
1643 #endif
1644 } // if
1645 } // case
1646 break;
1647 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1648 case kmp_sch_static_chunked:
1649 {
1650 T parm1;
1651
1652 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1653 gtid ) );
1654 parm1 = pr->u.p.parm1;
1655
1656 trip = pr->u.p.tc - 1;
1657 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1658
1659 if ( (status = (init <= trip)) != 0 ) {
1660 start = pr->u.p.lb;
1661 incr = pr->u.p.st;
1662 limit = parm1 + init - 1;
1663
1664 if ( (last = (limit >= trip)) != 0 )
1665 limit = trip;
1666
1667 if ( p_last ) {
1668 *p_last = last;
1669 }
1670 if ( p_st != 0 ) *p_st = incr;
1671
1672 pr->u.p.count += team->t.t_nproc;
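// Worked example (illustrative values, assuming lb = 0 and st = 1): chunk parm1 = 5,
// nproc = 4, tid = 1: the first call computes init = 5 * (0 + 1) = 5 (iterations 5..9),
// the second init = 5 * (4 + 1) = 25 (iterations 25..29), i.e. thread t executes chunk
// numbers t, t + nproc, t + 2*nproc, ... in a round-robin fashion.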
1673
1674 if ( incr == 1 ) {
1675 *p_lb = start + init;
1676 *p_ub = start + limit;
1677 }
1678 else {
1679 *p_lb = start + init * incr;
1680 *p_ub = start + limit * incr;
1681 }
1682
1683 if ( pr->ordered ) {
1684 pr->u.p.ordered_lower = init;
1685 pr->u.p.ordered_upper = limit;
1686 #ifdef KMP_DEBUG
1687 {
1688 const char * buff;
1689 // create format specifiers before the debug output
1690 buff = __kmp_str_format(
1691 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1692 traits_t< UT >::spec, traits_t< UT >::spec );
1693 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1694 __kmp_str_free( &buff );
1695 }
1696 #endif
1697 } // if
1698 } // if
1699 } // case
1700 break;
1701
1702 case kmp_sch_dynamic_chunked:
1703 {
1704 T chunk = pr->u.p.parm1;
1705
1706 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1707 gtid ) );
1708
1709 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1710 trip = pr->u.p.tc - 1;
1711
1712 if ( (status = (init <= trip)) == 0 ) {
1713 *p_lb = 0;
1714 *p_ub = 0;
1715 if ( p_st != 0 ) *p_st = 0;
1716 } else {
1717 start = pr->u.p.lb;
1718 limit = chunk + init - 1;
1719 incr = pr->u.p.st;
1720
1721 if ( (last = (limit >= trip)) != 0 )
1722 limit = trip;
1723 if ( p_last ) {
1724 *p_last = last;
1725 }
1726 if ( p_st != 0 ) *p_st = incr;
1727
1728 if ( incr == 1 ) {
1729 *p_lb = start + init;
1730 *p_ub = start + limit;
1731 } else {
1732 *p_lb = start + init * incr;
1733 *p_ub = start + limit * incr;
1734 }
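// Worked example (illustrative values): chunk = 4, tc = 10, lb = 0, st = 1:
// successive winners of the shared counter get init = 0, 4, 8, ... so the chunks
// handed out are [0,3], [4,7] and [8,9] (the last one clipped to trip = 9);
// once init exceeds trip the caller gets status = 0 and the loop is finished.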
1735
1736 if ( pr->ordered ) {
1737 pr->u.p.ordered_lower = init;
1738 pr->u.p.ordered_upper = limit;
1739 #ifdef KMP_DEBUG
1740 {
1741 const char * buff;
1742 // create format specifiers before the debug output
1743 buff = __kmp_str_format(
1744 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1745 traits_t< UT >::spec, traits_t< UT >::spec );
1746 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1747 __kmp_str_free( &buff );
1748 }
1749 #endif
1750 } // if
1751 } // if
1752 } // case
1753 break;
1754
1755 case kmp_sch_guided_iterative_chunked:
1756 {
1757 T chunkspec = pr->u.p.parm1;
1758 KD_TRACE(100,
1759 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1760 trip = pr->u.p.tc;
1761 // Start atomic part of calculations
1762 while(1) {
1763 ST remaining; // signed, because can be < 0
1764 init = sh->u.s.iteration; // shared value
1765 remaining = trip - init;
1766 if ( remaining <= 0 ) { // AC: need to compare with 0 first
1767 // nothing to do, don't try atomic op
1768 status = 0;
1769 break;
1770 }
1771 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1772 // use dynamic-style schedule
1773 // atomically increment iterations, get old value
1774 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1775 remaining = trip - init;
1776 if (remaining <= 0) {
1777 status = 0; // all iterations got by other threads
1778 } else {
1779 // got some iterations to work on
1780 status = 1;
1781 if ( (T)remaining > chunkspec ) {
1782 limit = init + chunkspec - 1;
1783 } else {
1784 last = 1; // the last chunk
1785 limit = init + remaining - 1;
1786 } // if
1787 } // if
1788 break;
1789 } // if
1790 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1791 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1792 // CAS was successful, chunk obtained
1793 status = 1;
1794 --limit;
1795 break;
1796 } // if
1797 } // while
1798 if ( status != 0 ) {
1799 start = pr->u.p.lb;
1800 incr = pr->u.p.st;
1801 if ( p_st != NULL )
1802 *p_st = incr;
1803 if ( p_last != NULL )
1804 *p_last = last;
1805 *p_lb = start + init * incr;
1806 *p_ub = start + limit * incr;
1807 if ( pr->ordered ) {
1808 pr->u.p.ordered_lower = init;
1809 pr->u.p.ordered_upper = limit;
1810 #ifdef KMP_DEBUG
1811 {
1812 const char * buff;
1813 // create format specifiers before the debug output
1814 buff = __kmp_str_format(
1815 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1816 traits_t< UT >::spec, traits_t< UT >::spec );
1817 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1818 __kmp_str_free( &buff );
1819 }
1820 #endif
1821 } // if
1822 } else {
1823 *p_lb = 0;
1824 *p_ub = 0;
1825 if ( p_st != NULL )
1826 *p_st = 0;
1827 } // if
1828 } // case
1829 break;
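        /* Illustration (not compiled): the iterative guided loop above shrinks chunks
           geometrically.  parm3 caches a factor of roughly 1/(K*nproc) (K = 2 by
           default, per the comments above), so a successful CAS claims about
               remaining / (K*nproc)
           iterations.  With hypothetical trip = 800 and nproc = 4 the first grab spans
           about 100 iterations, the next about 87, and so on, until the remainder falls
           below parm2 and the code switches to plain chunks of chunkspec iterations. */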
1830
1831 case kmp_sch_guided_analytical_chunked:
1832 {
1833 T chunkspec = pr->u.p.parm1;
1834 UT chunkIdx;
1835 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1836 /* for storing original FPCW value for Windows* OS on
1837 IA-32 architecture 8-byte version */
1838 unsigned int oldFpcw;
1839 int fpcwSet = 0;
1840 #endif
1841 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1842 gtid ) );
1843
1844 trip = pr->u.p.tc;
1845
1846 KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1847 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1848
1849 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1850 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1851 if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1852 --trip;
1853 /* use dynamic-style scheduling */
1854 init = chunkIdx * chunkspec + pr->u.p.count;
1855 /* need to verify init > 0 in case of overflow in the above calculation */
1856 if ( (status = (init > 0 && init <= trip)) != 0 ) {
1857 limit = init + chunkspec -1;
1858
1859 if ( (last = (limit >= trip)) != 0 )
1860 limit = trip;
1861 }
1862 break;
1863 } else {
1864 /* use exponential-style scheduling */
1865                     /* The following check works around the lack of long double precision on Windows* OS,
1866                        which can otherwise leave init != 0 for chunkIdx == 0.
1867 */
1868 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1869 /* If we haven't already done so, save original
1870 FPCW and set precision to 64-bit, as Windows* OS
1871 on IA-32 architecture defaults to 53-bit */
1872 if ( !fpcwSet ) {
1873 oldFpcw = _control87(0,0x30000);
1874 fpcwSet = 0x30000;
1875 }
1876 #endif
1877 if ( chunkIdx ) {
1878 init = __kmp_dispatch_guided_remaining< T >(
1879 trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1880 KMP_DEBUG_ASSERT(init);
1881 init = trip - init;
1882 } else
1883 init = 0;
1884 limit = trip - __kmp_dispatch_guided_remaining< T >(
1885 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1886 KMP_ASSERT(init <= limit);
1887 if ( init < limit ) {
1888 KMP_DEBUG_ASSERT(limit <= trip);
1889 --limit;
1890 status = 1;
1891 break;
1892 } // if
1893 } // if
1894 } // while (1)
1895 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1896                 /* restore FPCW if necessary (only when it was actually saved above) */
1897                 if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1898                     _control87(oldFpcw,0x30000);
1899 #endif
1900 if ( status != 0 ) {
1901 start = pr->u.p.lb;
1902 incr = pr->u.p.st;
1903 if ( p_st != NULL )
1904 *p_st = incr;
1905 if ( p_last != NULL )
1906 *p_last = last;
1907 *p_lb = start + init * incr;
1908 *p_ub = start + limit * incr;
1909 if ( pr->ordered ) {
1910 pr->u.p.ordered_lower = init;
1911 pr->u.p.ordered_upper = limit;
1912 #ifdef KMP_DEBUG
1913 {
1914 const char * buff;
1915 // create format specifiers before the debug output
1916 buff = __kmp_str_format(
1917 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1918 traits_t< UT >::spec, traits_t< UT >::spec );
1919 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1920 __kmp_str_free( &buff );
1921 }
1922 #endif
1923 }
1924 } else {
1925 *p_lb = 0;
1926 *p_ub = 0;
1927 if ( p_st != NULL )
1928 *p_st = 0;
1929 }
1930 } // case
1931 break;
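        /* Illustration (not compiled): in the analytical variant each value of the
           shared counter chunkIdx names a precomputed slice of the iteration space,
               init  = trip - __kmp_dispatch_guided_remaining( trip, parm3, chunkIdx )
               limit = trip - __kmp_dispatch_guided_remaining( trip, parm3, chunkIdx+1 ) - 1
           so no CAS retry loop is needed; once chunkIdx reaches parm2 the remaining
           tail of the loop is handed out dynamically in chunks of chunkspec iterations. */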
1932
1933 case kmp_sch_trapezoidal:
1934 {
1935 UT index;
1936 T parm2 = pr->u.p.parm2;
1937 T parm3 = pr->u.p.parm3;
1938 T parm4 = pr->u.p.parm4;
1939 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
1940 gtid ) );
1941
1942 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
1943
1944 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
1945 trip = pr->u.p.tc - 1;
1946
1947 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
1948 *p_lb = 0;
1949 *p_ub = 0;
1950 if ( p_st != 0 ) *p_st = 0;
1951 } else {
1952 start = pr->u.p.lb;
1953 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
1954 incr = pr->u.p.st;
1955
1956 if ( (last = (limit >= trip)) != 0 )
1957 limit = trip;
1958
1959 if ( p_last != 0 ) {
1960 *p_last = last;
1961 }
1962 if ( p_st != 0 ) *p_st = incr;
1963
1964 if ( incr == 1 ) {
1965 *p_lb = start + init;
1966 *p_ub = start + limit;
1967 } else {
1968 *p_lb = start + init * incr;
1969 *p_ub = start + limit * incr;
1970 }
1971
1972 if ( pr->ordered ) {
1973 pr->u.p.ordered_lower = init;
1974 pr->u.p.ordered_upper = limit;
1975 #ifdef KMP_DEBUG
1976 {
1977 const char * buff;
1978 // create format specifiers before the debug output
1979 buff = __kmp_str_format(
1980 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1981 traits_t< UT >::spec, traits_t< UT >::spec );
1982 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1983 __kmp_str_free( &buff );
1984 }
1985 #endif
1986 } // if
1987 } // if
1988 } // case
1989 break;
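        /* Illustration (not compiled): the trapezoidal formula above yields linearly
           shrinking chunks.  Reading the parameters as parm2 = first chunk size,
           parm4 = per-chunk decrement and parm3 = number of chunks (an interpretation
           of the arithmetic, not a documented contract), chunk index k starts at
               init(k) = ( k * ( 2*parm2 - (k-1)*parm4 ) ) / 2
           e.g. parm2 = 10, parm4 = 2 gives the ranges [0,9], [10,17], [18,23], ...
           of sizes 10, 8, 6, ... until k reaches parm3 or init exceeds tc-1. */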
1990 } // switch
1991 } // if tc == 0;
1992
1993 if ( status == 0 ) {
1994 UT num_done;
1995
1996 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
1997 #ifdef KMP_DEBUG
1998 {
1999 const char * buff;
2000 // create format specifiers before the debug output
2001 buff = __kmp_str_format(
2002 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2003 traits_t< UT >::spec );
2004 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2005 __kmp_str_free( &buff );
2006 }
2007 #endif
2008
2009 if ( num_done == team->t.t_nproc-1 ) {
2010 /* NOTE: release this buffer to be reused */
2011
2012 KMP_MB(); /* Flush all pending memory write invalidates. */
2013
2014 sh->u.s.num_done = 0;
2015 sh->u.s.iteration = 0;
2016
2017 /* TODO replace with general release procedure? */
2018 if ( pr->ordered ) {
2019 sh->u.s.ordered_iteration = 0;
2020 }
2021
2022 KMP_MB(); /* Flush all pending memory write invalidates. */
2023
2024 sh -> buffer_index += KMP_MAX_DISP_BUF;
2025 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2026 gtid, sh->buffer_index) );
2027
2028 KMP_MB(); /* Flush all pending memory write invalidates. */
2029
2030 } // if
2031 if ( __kmp_env_consistency_check ) {
2032 if ( pr->pushed_ws != ct_none ) {
2033 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2034 }
2035 }
2036
2037 th -> th.th_dispatch -> th_deo_fcn = NULL;
2038 th -> th.th_dispatch -> th_dxo_fcn = NULL;
2039 th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2040 th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2041 } // if (status == 0)
2042#if KMP_OS_WINDOWS
2043 else if ( last ) {
2044 pr->u.p.last_upper = pr->u.p.ub;
2045 }
2046#endif /* KMP_OS_WINDOWS */
2047 } // if
2048
2049 #ifdef KMP_DEBUG
2050 {
2051 const char * buff;
2052 // create format specifiers before the debug output
2053 buff = __kmp_str_format(
2054 "__kmp_dispatch_next: T#%%d normal case: " \
2055 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2056 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2057 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2058 __kmp_str_free( &buff );
2059 }
2060 #endif
2061 return status;
2062}
2063
2064//-----------------------------------------------------------------------------------------
2065// Dispatch routines
2066// Transfer call to template< type T >
2067// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2068// T lb, T ub, ST st, ST chunk )
2069extern "C" {
2070
2071/*!
2072@ingroup WORK_SHARING
2073@{
2074@param loc Source location
2075@param gtid Global thread id
2076@param schedule Schedule type
2077@param lb Lower bound
2078@param ub Upper bound
2079@param st Step (or increment if you prefer)
2080@param chunk The chunk size to block with
2081
2082This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2083These functions are all identical apart from the types of the arguments.
2084*/
2085
2086void
2087__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2088 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2089{
2090 KMP_DEBUG_ASSERT( __kmp_init_serial );
2091 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2092}
2093/*!
2094See @ref __kmpc_dispatch_init_4
2095*/
2096void
2097__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2098 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2099{
2100 KMP_DEBUG_ASSERT( __kmp_init_serial );
2101 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2102}
2103
2104/*!
2105See @ref __kmpc_dispatch_init_4
2106*/
2107void
2108__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2109 kmp_int64 lb, kmp_int64 ub,
2110 kmp_int64 st, kmp_int64 chunk )
2111{
2112 KMP_DEBUG_ASSERT( __kmp_init_serial );
2113 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2114}
2115
2116/*!
2117See @ref __kmpc_dispatch_init_4
2118*/
2119void
2120__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2121 kmp_uint64 lb, kmp_uint64 ub,
2122 kmp_int64 st, kmp_int64 chunk )
2123{
2124 KMP_DEBUG_ASSERT( __kmp_init_serial );
2125 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2126}
2127
2128/*!
2129@param loc Source code location
2130@param gtid Global thread id
2131@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2132@param p_lb Pointer to the lower bound for the next chunk of work
2133@param p_ub Pointer to the upper bound for the next chunk of work
2134@param p_st Pointer to the stride for the next chunk of work
2135@return one if there is work to be done, zero otherwise
2136
2137Get the next dynamically allocated chunk of work for this thread.
2138 If there is no more work, then lb, ub and stride need not be modified.
2139*/
2140int
2141__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2142 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2143{
2144 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2145}
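/* Usage sketch (illustrative only, not part of the runtime): a compiler lowering a
   dynamically scheduled loop would drive these entry points roughly as follows, where
   `loc` stands for the compiler-generated source-location descriptor and N, chunk and
   gtid are placeholders:

       kmp_int32 lb, ub, st, last;
       __kmpc_dispatch_init_4( &loc, gtid, kmp_sch_dynamic_chunked, 0, N - 1, 1, chunk );
       while ( __kmpc_dispatch_next_4( &loc, gtid, &last, &lb, &ub, &st ) ) {
           for ( kmp_int32 i = lb; i <= ub; i += st ) {
               // loop body for iteration i
           }
       }

   lb, ub and st are only meaningful while __kmpc_dispatch_next_4 returns nonzero. */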
2146
2147/*!
2148See @ref __kmpc_dispatch_next_4
2149*/
2150int
2151__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2152 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2153{
2154 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2155}
2156
2157/*!
2158See @ref __kmpc_dispatch_next_4
2159*/
2160int
2161__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2162 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2163{
2164 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2165}
2166
2167/*!
2168See @ref __kmpc_dispatch_next_4
2169*/
2170int
2171__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2172 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2173{
2174 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2175}
2176
2177/*!
2178@param loc Source code location
2179@param gtid Global thread id
2180
2181Mark the end of a dynamic loop.
2182*/
2183void
2184__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2185{
2186 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2187}
2188
2189/*!
2190See @ref __kmpc_dispatch_fini_4
2191*/
2192void
2193__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2194{
2195 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2196}
2197
2198/*!
2199See @ref __kmpc_dispatch_fini_4
2200*/
2201void
2202__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2203{
2204 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2205}
2206
2207/*!
2208See @ref __kmpc_dispatch_fini_4
2209*/
2210void
2211__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2212{
2213 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2214}
2215/*! @} */
2216
2217//-----------------------------------------------------------------------------------------
2218 // Non-template routines from kmp_dispatch.cpp used in other sources
2219
2220kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2221 return value == checker;
2222}
2223
2224kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2225 return value != checker;
2226}
2227
2228kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2229 return value < checker;
2230}
2231
2232kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2233 return value >= checker;
2234}
2235
2236kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2237 return value <= checker;
2238}
2239kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2240 return value == checker;
2241}
2242
2243kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2244 return value != checker;
2245}
2246
2247kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2248 return value < checker;
2249}
2250
2251kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2252 return value >= checker;
2253}
2254
2255kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2256 return value <= checker;
2257}
2258
2259kmp_uint32
2260__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2261 kmp_uint32 checker,
2262 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2263 , void * obj // Higher-level synchronization object, or NULL.
2264 )
2265{
2266 // note: we may not belong to a team at this point
2267 register volatile kmp_uint32 * spin = spinner;
2268 register kmp_uint32 check = checker;
2269 register kmp_uint32 spins;
2270 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2271 register kmp_uint32 r;
2272
2273 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2274 KMP_INIT_YIELD( spins );
2275 // main wait spin loop
2276 while(!f(r = TCR_4(*spin), check)) {
2277 KMP_FSYNC_SPIN_PREPARE( obj );
2278 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2279 It causes problems with infinite recursion because of exit lock */
2280 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2281 __kmp_abort_thread(); */
2282
2283 __kmp_static_delay(TRUE);
2284
2285 /* if we have waited a bit, or are oversubscribed, yield */
2286 /* pause is in the following code */
2287 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2288 KMP_YIELD_SPIN( spins );
2289 }
2290 KMP_FSYNC_SPIN_ACQUIRED( obj );
2291 return r;
2292}
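/* Usage sketch (illustrative only): __kmp_wait_yield_4 pairs with the small predicates
   above; e.g. a caller with its own volatile kmp_uint32 flag could spin until the flag
   reaches an expected value with
       kmp_uint32 seen = __kmp_wait_yield_4( &flag, expected, __kmp_eq_4, NULL );
   the return value is the flag value that finally satisfied the predicate. */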
2293
2294kmp_uint64
2295__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2296 kmp_uint64 checker,
2297 kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2298 , void * obj // Higher-level synchronization object, or NULL.
2299 )
2300{
2301 // note: we may not belong to a team at this point
2302 register volatile kmp_uint64 * spin = spinner;
2303 register kmp_uint64 check = checker;
2304 register kmp_uint32 spins;
2305 register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2306 register kmp_uint64 r;
2307
2308 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2309 KMP_INIT_YIELD( spins );
2310 // main wait spin loop
2311 while(!f(r = *spin, check))
2312 {
2313 KMP_FSYNC_SPIN_PREPARE( obj );
2314 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2315 It causes problems with infinite recursion because of exit lock */
2316 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2317 __kmp_abort_thread(); */
2318
2319 __kmp_static_delay(TRUE);
2320
2321 // if we are oversubscribed,
2322         // or have waited a bit (and KMP_LIBRARY=throughput), then yield
2323 // pause is in the following code
2324 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2325 KMP_YIELD_SPIN( spins );
2326 }
2327 KMP_FSYNC_SPIN_ACQUIRED( obj );
2328 return r;
2329}
2330
2331} // extern "C"
2332
2333#ifdef KMP_GOMP_COMPAT
2334
2335void
2336__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2337 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2338 kmp_int32 chunk, int push_ws )
2339{
2340 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2341 push_ws );
2342}
2343
2344void
2345__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2346 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2347 kmp_int32 chunk, int push_ws )
2348{
2349 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2350 push_ws );
2351}
2352
2353void
2354__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2355 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2356 kmp_int64 chunk, int push_ws )
2357{
2358 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2359 push_ws );
2360}
2361
2362void
2363__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2364 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2365 kmp_int64 chunk, int push_ws )
2366{
2367 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2368 push_ws );
2369}
2370
2371void
2372__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2373{
2374 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2375}
2376
2377void
2378__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2379{
2380 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2381}
2382
2383void
2384__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2385{
2386 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2387}
2388
2389void
2390__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2391{
2392 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2393}
2394
2395#endif /* KMP_GOMP_COMPAT */
2396
2397/* ------------------------------------------------------------------------ */
2398/* ------------------------------------------------------------------------ */
2399