1/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3 * $Revision: 42674 $
4 * $Date: 2013-09-18 11:12:49 -0500 (Wed, 18 Sep 2013) $
5 */
6
7
8//===----------------------------------------------------------------------===//
9//
10// The LLVM Compiler Infrastructure
11//
12// This file is dual licensed under the MIT and the University of Illinois Open
13// Source Licenses. See LICENSE.txt for details.
14//
15//===----------------------------------------------------------------------===//
16
17
18/*
19 * Dynamic scheduling initialization and dispatch.
20 *
21 * NOTE: __kmp_nth is a constant inside any dispatch loop; however,
22 * it may change values between parallel regions. __kmp_max_nth
23 * is the largest value __kmp_nth may take, and 1 is the smallest.
24 *
25 */
26
27/* ------------------------------------------------------------------------ */
28/* ------------------------------------------------------------------------ */
29
30#include "kmp.h"
31#include "kmp_i18n.h"
32#include "kmp_itt.h"
33#include "kmp_str.h"
34#include "kmp_error.h"
35#if KMP_OS_WINDOWS && KMP_ARCH_X86
36 #include <float.h>
37#endif
38
39/* ------------------------------------------------------------------------ */
40/* ------------------------------------------------------------------------ */
41
42#ifdef KMP_STATIC_STEAL_ENABLED
43
44 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
45 template< typename T >
46 struct dispatch_private_infoXX_template {
47 typedef typename traits_t< T >::unsigned_t UT;
48 typedef typename traits_t< T >::signed_t ST;
49 UT count; // unsigned
50 T ub;
51 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
52 T lb;
53 ST st; // signed
54 UT tc; // unsigned
55 T static_steal_counter; // for static_steal only; maybe better to put after ub
56
57 /* parm[1-4] are used in different ways by different scheduling algorithms */
58
59 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
60 // a) parm3 is properly aligned and
61 // b) all parm1-4 are in the same cache line.
62 // Because parm1-4 are used together, performance seems to be better
63 // if they are in the same cache line (not measured, though).
64
65 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
66 T parm1;
67 T parm2;
68 T parm3;
69 T parm4;
70 };
71
72 UT ordered_lower; // unsigned
73 UT ordered_upper; // unsigned
74 #if KMP_OS_WINDOWS
75 T last_upper;
76 #endif /* KMP_OS_WINDOWS */
77 };
78
79#else /* KMP_STATIC_STEAL_ENABLED */
80
81 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
82 template< typename T >
83 struct dispatch_private_infoXX_template {
84 typedef typename traits_t< T >::unsigned_t UT;
85 typedef typename traits_t< T >::signed_t ST;
86 T lb;
87 T ub;
88 ST st; // signed
89 UT tc; // unsigned
90
91 T parm1;
92 T parm2;
93 T parm3;
94 T parm4;
95
96 UT count; // unsigned
97
98 UT ordered_lower; // unsigned
99 UT ordered_upper; // unsigned
100 #if KMP_OS_WINDOWS
101 T last_upper;
102 #endif /* KMP_OS_WINDOWS */
103 };
104
105#endif /* KMP_STATIC_STEAL_ENABLED */
106
107// replaces dispatch_private_info structure and dispatch_private_info_t type
108template< typename T >
109struct KMP_ALIGN_CACHE dispatch_private_info_template {
110 // duplicate alignment here, otherwise size of structure is not correct in our compiler
111 union KMP_ALIGN_CACHE private_info_tmpl {
112 dispatch_private_infoXX_template< T > p;
113 dispatch_private_info64_t p64;
114 } u;
115 enum sched_type schedule; /* scheduling algorithm */
116 kmp_uint32 ordered; /* ordered clause specified */
117 kmp_uint32 ordered_bumped;
118 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
119 dispatch_private_info * next; /* stack of buffers for nest of serial regions */
120 kmp_uint32 nomerge; /* don't merge iters if serialized */
121 kmp_uint32 type_size;
122 enum cons_type pushed_ws;
123};
124
125
126// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
127template< typename UT >
128struct dispatch_shared_infoXX_template {
129 /* chunk index under dynamic, number of idle threads under static-steal;
130 iteration index otherwise */
131 volatile UT iteration;
132 volatile UT num_done;
133 volatile UT ordered_iteration;
134 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
135};
136
137// replaces dispatch_shared_info structure and dispatch_shared_info_t type
138template< typename UT >
139struct dispatch_shared_info_template {
140 // we need union here to keep the structure size
141 union shared_info_tmpl {
142 dispatch_shared_infoXX_template< UT > s;
143 dispatch_shared_info64_t s64;
144 } u;
145 volatile kmp_uint32 buffer_index;
146};
147
148/* ------------------------------------------------------------------------ */
149/* ------------------------------------------------------------------------ */
150
151static void
152__kmp_static_delay( int arg )
153{
154 /* Work around weird code-gen bug that causes assert to trip */
155 #if KMP_ARCH_X86_64 && KMP_OS_LINUX
156 #else
157 KMP_ASSERT( arg >= 0 );
158 #endif
159}
160
161static void
162__kmp_static_yield( int arg )
163{
164 __kmp_yield( arg );
165}
166
167#undef USE_TEST_LOCKS
168
169// test_then_add template (general template should NOT be used)
170template< typename T >
171static __forceinline T
172test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
173
174template<>
175__forceinline kmp_int32
176test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
177{
178 kmp_int32 r;
179 r = KMP_TEST_THEN_ADD32( p, d );
180 return r;
181}
182
183template<>
184__forceinline kmp_int64
185test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
186{
187 kmp_int64 r;
188 r = KMP_TEST_THEN_ADD64( p, d );
189 return r;
190}
191
192// test_then_inc_acq template (general template should NOT be used)
193template< typename T >
194static __forceinline T
195test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
196
197template<>
198__forceinline kmp_int32
199test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
200{
201 kmp_int32 r;
202 r = KMP_TEST_THEN_INC_ACQ32( p );
203 return r;
204}
205
206template<>
207__forceinline kmp_int64
208test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
209{
210 kmp_int64 r;
211 r = KMP_TEST_THEN_INC_ACQ64( p );
212 return r;
213}
214
215// test_then_inc template (general template should NOT be used)
216template< typename T >
217static __forceinline T
218test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
219
220template<>
221__forceinline kmp_int32
222test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
223{
224 kmp_int32 r;
225 r = KMP_TEST_THEN_INC32( p );
226 return r;
227}
228
229template<>
230__forceinline kmp_int64
231test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
232{
233 kmp_int64 r;
234 r = KMP_TEST_THEN_INC64( p );
235 return r;
236}
237
238// compare_and_swap template (general template should NOT be used)
239template< typename T >
240static __forceinline kmp_int32
241compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
242
243template<>
244__forceinline kmp_int32
245compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
246{
247 return KMP_COMPARE_AND_STORE_REL32( p, c, s );
248}
249
250template<>
251__forceinline kmp_int32
252compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
253{
254 return KMP_COMPARE_AND_STORE_REL64( p, c, s );
255}
256
257/*
258 Spin wait loop that first does pause, then yield.
259 Waits until function returns non-zero when called with *spinner and check.
260 Does NOT put threads to sleep.
261#if USE_ITT_BUILD
262 Arguments:
263 obj -- is a higher-level synchronization object to report to ittnotify. It is used to report
264 locks consistently. For example, if the lock is acquired immediately, its address is
265 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
266 immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
267 address, not the address of the low-level spinner.
268#endif // USE_ITT_BUILD
269*/
270template< typename UT >
271// ToDo: make inline function (move to header file for icl)
272static UT // unsigned 4- or 8-byte type
273__kmp_wait_yield( volatile UT * spinner,
274 UT checker,
275 kmp_uint32 (* pred)( UT, UT )
276 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
277 )
278{
279 // note: we may not belong to a team at this point
280 register volatile UT * spin = spinner;
281 register UT check = checker;
282 register kmp_uint32 spins;
283 register kmp_uint32 (*f) ( UT, UT ) = pred;
284 register UT r;
285
286 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
287 KMP_INIT_YIELD( spins );
288 // main wait spin loop
289 while(!f(r = *spin, check))
290 {
291 KMP_FSYNC_SPIN_PREPARE( obj );
292 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
293 It causes problems with infinite recursion because of exit lock */
294 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
295 __kmp_abort_thread(); */
296
297 __kmp_static_delay(TRUE);
298
299 // if we are oversubscribed,
300 // or have waited a bit (and KMP_LIBRARY=throughput), then yield;
301 // the pause is in the following code
302 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
303 KMP_YIELD_SPIN( spins );
304 }
305 KMP_FSYNC_SPIN_ACQUIRED( obj );
306 return r;
307}
308
309template< typename UT >
310static kmp_uint32 __kmp_eq( UT value, UT checker) {
311 return value == checker;
312}
313
314template< typename UT >
315static kmp_uint32 __kmp_neq( UT value, UT checker) {
316 return value != checker;
317}
318
319template< typename UT >
320static kmp_uint32 __kmp_lt( UT value, UT checker) {
321 return value < checker;
322}
323
324template< typename UT >
325static kmp_uint32 __kmp_ge( UT value, UT checker) {
326 return value >= checker;
327}
328
329template< typename UT >
330static kmp_uint32 __kmp_le( UT value, UT checker) {
331 return value <= checker;
332}
333
334
335/* ------------------------------------------------------------------------ */
336/* ------------------------------------------------------------------------ */
337
338static void
339__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
340{
341 kmp_info_t *th;
342
343 KMP_DEBUG_ASSERT( gtid_ref );
344
345 if ( __kmp_env_consistency_check ) {
346 th = __kmp_threads[*gtid_ref];
347 if ( th -> th.th_root -> r.r_active
348 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
349 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
350 }
351 }
352}
353
354template< typename UT >
355static void
356__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
357{
358 typedef typename traits_t< UT >::signed_t ST;
359 dispatch_private_info_template< UT > * pr;
360
361 int gtid = *gtid_ref;
362// int cid = *cid_ref;
363 kmp_info_t *th = __kmp_threads[ gtid ];
364 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
365
366 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
367 if ( __kmp_env_consistency_check ) {
368 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
369 ( th -> th.th_dispatch -> th_dispatch_pr_current );
370 if ( pr -> pushed_ws != ct_none ) {
371 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
372 }
373 }
374
375 if ( ! th -> th.th_team -> t.t_serialized ) {
376 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
377 ( th -> th.th_dispatch -> th_dispatch_sh_current );
378 UT lower;
379
380 if ( ! __kmp_env_consistency_check ) {
381 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
382 ( th -> th.th_dispatch -> th_dispatch_pr_current );
383 }
384 lower = pr->u.p.ordered_lower;
385
386 #if ! defined( KMP_GOMP_COMPAT )
387 if ( __kmp_env_consistency_check ) {
388 if ( pr->ordered_bumped ) {
389 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
390 __kmp_error_construct2(
391 kmp_i18n_msg_CnsMultipleNesting,
392 ct_ordered_in_pdo, loc_ref,
393 & p->stack_data[ p->w_top ]
394 );
395 }
396 }
397 #endif /* !defined(KMP_GOMP_COMPAT) */
398
399 KMP_MB();
400 #ifdef KMP_DEBUG
401 {
402 const char * buff;
403 // create format specifiers before the debug output
404 buff = __kmp_str_format(
405 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
406 traits_t< UT >::spec, traits_t< UT >::spec );
407 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
408 __kmp_str_free( &buff );
409 }
410 #endif
411
412 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
413 USE_ITT_BUILD_ARG( NULL )
414 );
415 KMP_MB(); /* is this necessary? */
416 #ifdef KMP_DEBUG
417 {
418 const char * buff;
419 // create format specifiers before the debug output
420 buff = __kmp_str_format(
421 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
422 traits_t< UT >::spec, traits_t< UT >::spec );
423 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
424 __kmp_str_free( &buff );
425 }
426 #endif
427 }
428 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
429}
430
431static void
432__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
433{
434 kmp_info_t *th;
435
436 if ( __kmp_env_consistency_check ) {
437 th = __kmp_threads[*gtid_ref];
438 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
439 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
440 }
441 }
442}
443
444template< typename UT >
445static void
446__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
447{
448 typedef typename traits_t< UT >::signed_t ST;
449 dispatch_private_info_template< UT > * pr;
450
451 int gtid = *gtid_ref;
452// int cid = *cid_ref;
453 kmp_info_t *th = __kmp_threads[ gtid ];
454 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
455
456 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
457 if ( __kmp_env_consistency_check ) {
458 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
459 ( th -> th.th_dispatch -> th_dispatch_pr_current );
460 if ( pr -> pushed_ws != ct_none ) {
461 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
462 }
463 }
464
465 if ( ! th -> th.th_team -> t.t_serialized ) {
466 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
467 ( th -> th.th_dispatch -> th_dispatch_sh_current );
468
469 if ( ! __kmp_env_consistency_check ) {
470 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
471 ( th -> th.th_dispatch -> th_dispatch_pr_current );
472 }
473
474 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
475 #if ! defined( KMP_GOMP_COMPAT )
476 if ( __kmp_env_consistency_check ) {
477 if ( pr->ordered_bumped != 0 ) {
478 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
479 /* How to test it? - OM */
480 __kmp_error_construct2(
481 kmp_i18n_msg_CnsMultipleNesting,
482 ct_ordered_in_pdo, loc_ref,
483 & p->stack_data[ p->w_top ]
484 );
485 }
486 }
487 #endif /* !defined(KMP_GOMP_COMPAT) */
488
489 KMP_MB(); /* Flush all pending memory write invalidates. */
490
491 pr->ordered_bumped += 1;
492
493 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
494 gtid, pr->ordered_bumped ) );
495
496 KMP_MB(); /* Flush all pending memory write invalidates. */
497
498 /* TODO use general release procedure? */
499 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
500
501 KMP_MB(); /* Flush all pending memory write invalidates. */
502 }
503 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
504}
505
506/* Computes and returns x to the power of y, where y must be a non-negative integer */
507template< typename UT >
508static __forceinline long double
509__kmp_pow(long double x, UT y) {
510 long double s=1.0L;
511
512 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
513 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
514 while(y) {
515 if ( y & 1 )
516 s *= x;
517 x *= x;
518 y >>= 1;
519 }
520 return s;
521}
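// Editor's illustration (not in the original source): __kmp_pow uses binary exponentiation,
// consuming the bits of y. For x = 0.875 and y = 5 (binary 101) the loop multiplies s by x
// and then by x^4, giving 0.875 * 0.586181640625 ~= 0.5129 in O(log y) multiplications.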
522
523/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
524 (the total number of unassigned iterations in chunks with index greater than or equal to idx).
525 __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
526 (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
527*/
528template< typename T >
529static __inline typename traits_t< T >::unsigned_t
530__kmp_dispatch_guided_remaining(
531 T tc,
532 typename traits_t< T >::floating_t base,
533 typename traits_t< T >::unsigned_t idx
534) {
535 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
536 least for ICL 8.1, long double arithmetic may not really have
537 long double precision, even with /Qlong_double. Currently, we
538 workaround that in the caller code, by manipulating the FPCW for
539 Windows* OS on IA-32 architecture. The lack of precision is not
540 expected to be a correctness issue, though.
541 */
542 typedef typename traits_t< T >::unsigned_t UT;
543
544 long double x = tc * __kmp_pow< UT >(base, idx);
545 UT r = (UT) x;
546 if ( x == r )
547 return r;
548 return r + 1;
549}
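// Editor's illustration (assumed values, not in the original source): with tc = 1000,
// base = 0.875 (i.e. 1 - 0.5/nproc for nproc = 4) and idx = 2, x = 1000 * 0.765625 = 765.625,
// so the function returns 766 -- the iterations still unassigned after two guided chunks.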
550
551// Parameters of the guided-iterative algorithm:
552// p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
553// p3 = 1 / ( n * nproc ) // remaining iterations multiplier
554// by default n = 2. For example, with n = 3 the chunk distribution will be flatter.
555// With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
556static int guided_int_param = 2;
557static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
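// Editor's illustration (assumed values): with the default n = 2, nproc = 4 and chunk = 7,
// p2 = 2 * 4 * (7 + 1) = 64 and p3 = 0.5 / 4 = 0.125, i.e. each grab takes remaining/8
// iterations until fewer than 64 remain, after which plain dynamic chunks of 7 are used.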
558
559// UT - unsigned flavor of T, ST - signed flavor of T,
560// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
561template< typename T >
562static void
563__kmp_dispatch_init(
564 ident_t * loc,
565 int gtid,
566 enum sched_type schedule,
567 T lb,
568 T ub,
569 typename traits_t< T >::signed_t st,
570 typename traits_t< T >::signed_t chunk,
571 int push_ws
572) {
573 typedef typename traits_t< T >::unsigned_t UT;
574 typedef typename traits_t< T >::signed_t ST;
575 typedef typename traits_t< T >::floating_t DBL;
576 static const int ___kmp_size_type = sizeof( UT );
577
578 int active;
579 T tc;
580 kmp_info_t * th;
581 kmp_team_t * team;
582 kmp_uint32 my_buffer_index;
583 dispatch_private_info_template< T > * pr;
584 dispatch_shared_info_template< UT > volatile * sh;
585
586 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
587 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
588
589 if ( ! TCR_4( __kmp_init_parallel ) )
590 __kmp_parallel_initialize();
591
592 #ifdef KMP_DEBUG
593 {
594 const char * buff;
595 // create format specifiers before the debug output
596 buff = __kmp_str_format(
597 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
598 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
599 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
600 __kmp_str_free( &buff );
601 }
602 #endif
603 /* setup data */
604 th = __kmp_threads[ gtid ];
605 team = th -> th.th_team;
606 active = ! team -> t.t_serialized;
607 th->th.th_ident = loc;
608
609 if ( ! active ) {
610 pr = reinterpret_cast< dispatch_private_info_template< T >* >
611 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
612 } else {
613 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
614 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
615
616 my_buffer_index = th->th.th_dispatch->th_disp_index ++;
617
618 /* What happens when number of threads changes, need to resize buffer? */
619 pr = reinterpret_cast< dispatch_private_info_template< T > * >
620 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
621 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
622 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
623 }
624
625 /* Pick up the nomerge/ordered bits from the scheduling type */
626 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
627 pr->nomerge = TRUE;
628 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
629 } else {
630 pr->nomerge = FALSE;
631 }
632 pr->type_size = ___kmp_size_type; // remember the size of variables
633 if ( kmp_ord_lower & schedule ) {
634 pr->ordered = TRUE;
635 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
636 } else {
637 pr->ordered = FALSE;
638 }
639 if ( schedule == kmp_sch_static ) {
640 schedule = __kmp_static;
641 } else {
642 if ( schedule == kmp_sch_runtime ) {
643 #if OMP_30_ENABLED
644 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
645 schedule = team -> t.t_sched.r_sched_type;
646 // Detail the schedule if needed (global controls are differentiated appropriately)
647 if ( schedule == kmp_sch_guided_chunked ) {
648 schedule = __kmp_guided;
649 } else if ( schedule == kmp_sch_static ) {
650 schedule = __kmp_static;
651 }
652 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
653 chunk = team -> t.t_sched.chunk;
654 #else
655 kmp_r_sched_t r_sched = __kmp_get_schedule_global();
656 // Use the scheduling specified by OMP_SCHEDULE and/or KMP_SCHEDULE or default
657 schedule = r_sched.r_sched_type;
658 chunk = r_sched.chunk;
659 #endif
660
661 #ifdef KMP_DEBUG
662 {
663 const char * buff;
664 // create format specifiers before the debug output
665 buff = __kmp_str_format(
666 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
667 traits_t< ST >::spec );
668 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
669 __kmp_str_free( &buff );
670 }
671 #endif
672 } else {
673 if ( schedule == kmp_sch_guided_chunked ) {
674 schedule = __kmp_guided;
675 }
676 if ( chunk <= 0 ) {
677 chunk = KMP_DEFAULT_CHUNK;
678 }
679 }
680
681 #if OMP_30_ENABLED
682 if ( schedule == kmp_sch_auto ) {
683 // mapping and differentiation: in the __kmp_do_serial_initialize()
684 schedule = __kmp_auto;
685 #ifdef KMP_DEBUG
686 {
687 const char * buff;
688 // create format specifiers before the debug output
689 buff = __kmp_str_format(
690 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
691 traits_t< ST >::spec );
692 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
693 __kmp_str_free( &buff );
694 }
695 #endif
696 }
697 #endif // OMP_30_ENABLED
698
699 /* guided analytical not safe for too many threads */
700 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
701 schedule = kmp_sch_guided_iterative_chunked;
702 KMP_WARNING( DispatchManyThreads );
703 }
704 pr->u.p.parm1 = chunk;
705 }
706 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
707 "unknown scheduling type" );
708
709 pr->u.p.count = 0;
710
711 if ( __kmp_env_consistency_check ) {
712 if ( st == 0 ) {
713 __kmp_error_construct(
714 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
715 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
716 );
717 }
718 }
719
720 tc = ( ub - lb + st );
721 if ( st != 1 ) {
722 if ( st < 0 ) {
723 if ( lb < ub ) {
724 tc = 0; // zero-trip
725 } else { // lb >= ub
726 tc = (ST)tc / st; // convert to signed division
727 }
728 } else { // st > 0
729 if ( ub < lb ) {
730 tc = 0; // zero-trip
731 } else { // lb >= ub
732 tc /= st;
733 }
734 }
735 } else if ( ub < lb ) { // st == 1
736 tc = 0; // zero-trip
737 }
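// Editor's illustration (assumed values): for lb = 0, ub = 9, st = 2 the trip count is
// tc = (9 - 0 + 2) / 2 = 5, matching the iterations 0, 2, 4, 6, 8; a loop with st > 0 and
// ub < lb (or st < 0 and lb < ub) is treated as zero-trip above.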
738
739 pr->u.p.lb = lb;
740 pr->u.p.ub = ub;
741 pr->u.p.st = st;
742 pr->u.p.tc = tc;
743
744 #if KMP_OS_WINDOWS
745 pr->u.p.last_upper = ub + st;
746 #endif /* KMP_OS_WINDOWS */
747
748 /* NOTE: only the active parallel region(s) has active ordered sections */
749
750 if ( active ) {
751 if ( pr->ordered == 0 ) {
752 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
753 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
754 } else {
755 pr->ordered_bumped = 0;
756
757 pr->u.p.ordered_lower = 1;
758 pr->u.p.ordered_upper = 0;
759
760 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
761 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
762 }
763 }
764
765 if ( __kmp_env_consistency_check ) {
766 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
767 if ( push_ws ) {
768 __kmp_push_workshare( gtid, ws, loc );
769 pr->pushed_ws = ws;
770 } else {
771 __kmp_check_workshare( gtid, ws, loc );
772 pr->pushed_ws = ct_none;
773 }
774 }
775
776 switch ( schedule ) {
777 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
778 case kmp_sch_static_steal:
779 {
780 T nproc = team->t.t_nproc;
781 T ntc, init;
782
783 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
784
785 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
786 if ( nproc > 1 && ntc >= nproc ) {
787 T id = __kmp_tid_from_gtid(gtid);
788 T small_chunk, extras;
789
790 small_chunk = ntc / nproc;
791 extras = ntc % nproc;
792
793 init = id * small_chunk + ( id < extras ? id : extras );
794 pr->u.p.count = init;
795 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
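// Editor's illustration (assumed values): with ntc = 10 chunks and nproc = 4, small_chunk = 2
// and extras = 2, so threads 0..3 start with the chunk ranges [0,3), [3,6), [6,8) and [8,10);
// the first 'extras' threads own one extra chunk each, and leftovers may later be stolen.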
796
797 pr->u.p.parm2 = lb;
798 //pr->pfields.parm3 = 0; // it's not used in static_steal
799 pr->u.p.parm4 = id;
800 pr->u.p.st = st;
801 break;
802 } else {
803 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
804 gtid ) );
805 schedule = kmp_sch_static_balanced;
806 /* too few iterations: fall-through to kmp_sch_static_balanced */
807 } // if
808 /* FALL-THROUGH to static balanced */
809 } // case
810 #endif
811 case kmp_sch_static_balanced:
812 {
813 T nproc = team->t.t_nproc;
814 T init, limit;
815
816 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
817 gtid ) );
818
819 if ( nproc > 1 ) {
820 T id = __kmp_tid_from_gtid(gtid);
821
822 if ( tc < nproc ) {
823 if ( id < tc ) {
824 init = id;
825 limit = id;
826 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
827 } else {
828 pr->u.p.count = 1; /* means no more chunks to execute */
829 pr->u.p.parm1 = FALSE;
830 break;
831 }
832 } else {
833 T small_chunk = tc / nproc;
834 T extras = tc % nproc;
835 init = id * small_chunk + (id < extras ? id : extras);
836 limit = init + small_chunk - (id < extras ? 0 : 1);
837 pr->u.p.parm1 = (id == nproc - 1);
838 }
839 } else {
840 if ( tc > 0 ) {
841 init = 0;
842 limit = tc - 1;
843 pr->u.p.parm1 = TRUE;
844 } else {
845 // zero trip count
846 pr->u.p.count = 1; /* means no more chunks to execute */
847 pr->u.p.parm1 = FALSE;
848 break;
849 }
850 }
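// Editor's illustration (assumed values): for tc = 10 and nproc = 4 the else branch above
// gives small_chunk = 2 and extras = 2, so threads 0..3 receive the iteration ranges
// 0-2, 3-5, 6-7 and 8-9; parm1 (the *plastiter flag) is set only on the last thread.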
851 if ( st == 1 ) {
852 pr->u.p.lb = lb + init;
853 pr->u.p.ub = lb + limit;
854 } else {
855 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
856 pr->u.p.lb = lb + init * st;
857 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
858 if ( st > 0 ) {
859 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
860 } else {
861 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
862 }
863 }
864 if ( pr->ordered ) {
865 pr->u.p.ordered_lower = init;
866 pr->u.p.ordered_upper = limit;
867 }
868 break;
869 } // case
870 case kmp_sch_guided_iterative_chunked :
871 {
872 T nproc = team->t.t_nproc;
873 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
874
875 if ( nproc > 1 ) {
876 if ( (2L * chunk + 1 ) * nproc >= tc ) {
877 /* chunk size too large, switch to dynamic */
878 schedule = kmp_sch_dynamic_chunked;
879 } else {
880 // when remaining iters become less than parm2 - switch to dynamic
881 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
882 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
883 }
884 } else {
885 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
886 schedule = kmp_sch_static_greedy;
887 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
888 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
889 pr->u.p.parm1 = tc;
890 } // if
891 } // case
892 break;
893 case kmp_sch_guided_analytical_chunked:
894 {
895 T nproc = team->t.t_nproc;
896 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
897
898 if ( nproc > 1 ) {
899 if ( (2L * chunk + 1 ) * nproc >= tc ) {
900 /* chunk size too large, switch to dynamic */
901 schedule = kmp_sch_dynamic_chunked;
902 } else {
903 /* commonly used term: (2 nproc - 1)/(2 nproc) */
904 DBL x;
905
906 #if KMP_OS_WINDOWS && KMP_ARCH_X86
907 /* Linux* OS already has 64-bit computation by default for
908 long double, and on Windows* OS on Intel(R) 64,
909 /Qlong_double doesn't work. On Windows* OS
910 on IA-32 architecture, we need to set precision to
911 64-bit instead of the default 53-bit. Even though long
912 double doesn't work on Windows* OS on Intel(R) 64, the
913 resulting lack of precision is not expected to impact
914 the correctness of the algorithm, but this has not been
915 mathematically proven.
916 */
917 // save original FPCW and set precision to 64-bit, as
918 // Windows* OS on IA-32 architecture defaults to 53-bit
919 unsigned int oldFpcw = _control87(0,0);
920 _control87(_PC_64,_MCW_PC); // 0,0x30000
921 #endif
922 /* value used for comparison in solver for cross-over point */
923 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
924
925 /* crossover point--chunk indexes equal to or greater than
926 this point switch to dynamic-style scheduling */
927 UT cross;
928
929 /* commonly used term: (2 nproc - 1)/(2 nproc) */
930 x = (long double)1.0 - (long double)0.5 / nproc;
931
932 #ifdef KMP_DEBUG
933 { // test natural alignment
934 struct _test_a {
935 char a;
936 union {
937 char b;
938 DBL d;
939 };
940 } t;
941 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
942 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
943 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
944 }
945 #endif // KMP_DEBUG
946
947 /* save the term in thread private dispatch structure */
948 *(DBL*)&pr->u.p.parm3 = x;
949
950 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
951 {
952 UT left, right, mid;
953 long double p;
954
955 /* estimate initial upper and lower bound */
956
957 /* doesn't matter what value right is as long as it is positive, but
958 it affects performance of the solver
959 */
960 right = 229;
961 p = __kmp_pow< UT >(x,right);
962 if ( p > target ) {
963 do{
964 p *= p;
965 right <<= 1;
966 } while(p>target && right < (1<<27));
967 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
968 } else {
969 left = 0;
970 }
971
972 /* bisection root-finding method */
973 while ( left + 1 < right ) {
974 mid = (left + right) / 2;
975 if ( __kmp_pow< UT >(x,mid) > target ) {
976 left = mid;
977 } else {
978 right = mid;
979 }
980 } // while
981 cross = right;
982 }
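// Editor's illustration (assumed values): for nproc = 4, chunk = 10 and tc = 1000 we get
// x = 0.875 and target = 21 * 4 / 1000 = 0.084; since 0.875^18 ~= 0.090 > 0.084 and
// 0.875^19 ~= 0.079 <= 0.084, the bisection above settles on cross = 19.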
983 /* assert sanity of computed crossover point */
984 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
985
986 /* save the crossover point in thread private dispatch structure */
987 pr->u.p.parm2 = cross;
988
989 // C75803
990 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
991 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
992 #else
993 #define GUIDED_ANALYTICAL_WORKAROUND (x)
994 #endif
995 /* dynamic-style scheduling offset */
996 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
997 #if KMP_OS_WINDOWS && KMP_ARCH_X86
998 // restore FPCW
999 _control87(oldFpcw,_MCW_PC);
1000 #endif
1001 } // if
1002 } else {
1003 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1004 gtid ) );
1005 schedule = kmp_sch_static_greedy;
1006 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1007 pr->u.p.parm1 = tc;
1008 } // if
1009 } // case
1010 break;
1011 case kmp_sch_static_greedy:
1012 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1013 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1014 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1015 tc;
1016 break;
1017 case kmp_sch_static_chunked :
1018 case kmp_sch_dynamic_chunked :
1019 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1020 break;
1021 case kmp_sch_trapezoidal :
1022 {
1023 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1024
1025 T parm1, parm2, parm3, parm4;
1026 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1027
1028 parm1 = chunk;
1029
1030 /* F : size of the first cycle */
1031 parm2 = ( tc / (2 * team->t.t_nproc) );
1032
1033 if ( parm2 < 1 ) {
1034 parm2 = 1;
1035 }
1036
1037 /* L : size of the last cycle. Make sure the last cycle
1038 * is not larger than the first cycle.
1039 */
1040 if ( parm1 < 1 ) {
1041 parm1 = 1;
1042 } else if ( parm1 > parm2 ) {
1043 parm1 = parm2;
1044 }
1045
1046 /* N : number of cycles */
1047 parm3 = ( parm2 + parm1 );
1048 parm3 = ( 2 * tc + parm3 - 1) / parm3;
1049
1050 if ( parm3 < 2 ) {
1051 parm3 = 2;
1052 }
1053
1054 /* sigma : decreasing incr of the trapezoid */
1055 parm4 = ( parm3 - 1 );
1056 parm4 = ( parm2 - parm1 ) / parm4;
1057
1058 // pointless check, because parm4 >= 0 always
1059 //if ( parm4 < 0 ) {
1060 // parm4 = 0;
1061 //}
1062
1063 pr->u.p.parm1 = parm1;
1064 pr->u.p.parm2 = parm2;
1065 pr->u.p.parm3 = parm3;
1066 pr->u.p.parm4 = parm4;
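// Editor's illustration (assumed values): for tc = 1000, nproc = 4 and chunk = 10 this gives
// parm2 = 1000/8 = 125 (first chunk), parm1 = 10 (minimum/last chunk), parm3 = 15 cycles and
// parm4 = (125 - 10) / 14 = 8, so successive chunks shrink by 8: 125, 117, 109, ...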
1067 } // case
1068 break;
1069
1070 default:
1071 {
1072 __kmp_msg(
1073 kmp_ms_fatal, // Severity
1074 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1075 KMP_HNT( GetNewerLibrary ), // Hint
1076 __kmp_msg_null // Variadic argument list terminator
1077 );
1078 }
1079 break;
1080 } // switch
1081 pr->schedule = schedule;
1082 if ( active ) {
1083 /* The name of this buffer should be my_buffer_index when it's free to use it */
1084
1085 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1086 gtid, my_buffer_index, sh->buffer_index) );
1087 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1088 USE_ITT_BUILD_ARG( NULL )
1089 );
1090 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1091 // *always* 32-bit integers.
1092 KMP_MB(); /* is this necessary? */
1093 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1094 gtid, my_buffer_index, sh->buffer_index) );
1095
1096 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1097 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1098#if USE_ITT_BUILD
1099 if ( pr->ordered ) {
1100 __kmp_itt_ordered_init( gtid );
1101 }; // if
1102#endif /* USE_ITT_BUILD */
1103 }; // if
1104 #ifdef KMP_DEBUG
1105 {
1106 const char * buff;
1107 // create format specifiers before the debug output
1108 buff = __kmp_str_format(
1109 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1110 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1111 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1112 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1113 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1114 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1115 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1116 KD_TRACE(10, ( buff,
1117 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1118 pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1119 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1120 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1121 __kmp_str_free( &buff );
1122 }
1123 #endif
1124 #if ( KMP_STATIC_STEAL_ENABLED )
1125 if ( ___kmp_size_type < 8 ) {
1126 // It cannot be guaranteed that after execution of a loop with some other schedule kind
1127 // all the parm3 variables will contain the same value.
1128 // Even if all parm3 values were the same, there would still be a bad case, such as using 0 and 1
1129 // rather than a program-lifetime increment.
1130 // So a dedicated variable is required; the 'static_steal_counter' is used.
1131 if( schedule == kmp_sch_static_steal ) {
1132 // Other threads will inspect this variable when searching for a victim.
1133 // This is a flag showing that other threads may steal from this thread since then.
1134 volatile T * p = &pr->u.p.static_steal_counter;
1135 *p = *p + 1;
1136 }
1137 }
1138 #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING )
1139}
1140
1141/*
1142 * For ordered loops, either __kmp_dispatch_finish() should be called after
1143 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1144 * every chunk of iterations. If the ordered section(s) were not executed
1145 * for this iteration (or every iteration in this chunk), we need to set the
1146 * ordered iteration counters so that the next thread can proceed.
1147 */
1148template< typename UT >
1149static void
1150__kmp_dispatch_finish( int gtid, ident_t *loc )
1151{
1152 typedef typename traits_t< UT >::signed_t ST;
1153 kmp_info_t *th = __kmp_threads[ gtid ];
1154
1155 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1156 if ( ! th -> th.th_team -> t.t_serialized ) {
1157
1158 dispatch_private_info_template< UT > * pr =
1159 reinterpret_cast< dispatch_private_info_template< UT >* >
1160 ( th->th.th_dispatch->th_dispatch_pr_current );
1161 dispatch_shared_info_template< UT > volatile * sh =
1162 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1163 ( th->th.th_dispatch->th_dispatch_sh_current );
1164 KMP_DEBUG_ASSERT( pr );
1165 KMP_DEBUG_ASSERT( sh );
1166 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1167 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1168
1169 if ( pr->ordered_bumped ) {
1170 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1171 gtid ) );
1172 pr->ordered_bumped = 0;
1173 } else {
1174 UT lower = pr->u.p.ordered_lower;
1175
1176 #ifdef KMP_DEBUG
1177 {
1178 const char * buff;
1179 // create format specifiers before the debug output
1180 buff = __kmp_str_format(
1181 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1182 traits_t< UT >::spec, traits_t< UT >::spec );
1183 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1184 __kmp_str_free( &buff );
1185 }
1186 #endif
1187
1188 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1189 USE_ITT_BUILD_ARG(NULL)
1190 );
1191 KMP_MB(); /* is this necessary? */
1192 #ifdef KMP_DEBUG
1193 {
1194 const char * buff;
1195 // create format specifiers before the debug output
1196 buff = __kmp_str_format(
1197 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1198 traits_t< UT >::spec, traits_t< UT >::spec );
1199 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1200 __kmp_str_free( &buff );
1201 }
1202 #endif
1203
1204 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1205 } // if
1206 } // if
1207 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1208}
1209
1210#ifdef KMP_GOMP_COMPAT
1211
1212template< typename UT >
1213static void
1214__kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1215{
1216 typedef typename traits_t< UT >::signed_t ST;
1217 kmp_info_t *th = __kmp_threads[ gtid ];
1218
1219 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1220 if ( ! th -> th.th_team -> t.t_serialized ) {
1221// int cid;
1222 dispatch_private_info_template< UT > * pr =
1223 reinterpret_cast< dispatch_private_info_template< UT >* >
1224 ( th->th.th_dispatch->th_dispatch_pr_current );
1225 dispatch_shared_info_template< UT > volatile * sh =
1226 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1227 ( th->th.th_dispatch->th_dispatch_sh_current );
1228 KMP_DEBUG_ASSERT( pr );
1229 KMP_DEBUG_ASSERT( sh );
1230 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1231 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1232
1233// for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1234 UT lower = pr->u.p.ordered_lower;
1235 UT upper = pr->u.p.ordered_upper;
1236 UT inc = upper - lower + 1;
1237
1238 if ( pr->ordered_bumped == inc ) {
1239 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1240 gtid ) );
1241 pr->ordered_bumped = 0;
1242 } else {
1243 inc -= pr->ordered_bumped;
1244
1245 #ifdef KMP_DEBUG
1246 {
1247 const char * buff;
1248 // create format specifiers before the debug output
1249 buff = __kmp_str_format(
1250 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1251 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1252 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1253 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1254 __kmp_str_free( &buff );
1255 }
1256 #endif
1257
1258 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1259 USE_ITT_BUILD_ARG(NULL)
1260 );
1261
1262 KMP_MB(); /* is this necessary? */
1263 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1264 gtid ) );
1265 pr->ordered_bumped = 0;
1266//!!!!! TODO check if the inc should be unsigned, or signed???
1267 #ifdef KMP_DEBUG
1268 {
1269 const char * buff;
1270 // create format specifiers before the debug output
1271 buff = __kmp_str_format(
1272 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1273 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1274 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1275 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1276 __kmp_str_free( &buff );
1277 }
1278 #endif
1279
1280 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1281 }
1282// }
1283 }
1284 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1285}
1286
1287#endif /* KMP_GOMP_COMPAT */
1288
1289template< typename T >
1290static int
1291__kmp_dispatch_next(
1292 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1293) {
1294
1295 typedef typename traits_t< T >::unsigned_t UT;
1296 typedef typename traits_t< T >::signed_t ST;
1297 typedef typename traits_t< T >::floating_t DBL;
1298 static const int ___kmp_size_type = sizeof( UT );
1299
1300 int status;
1301 dispatch_private_info_template< T > * pr;
1302 kmp_info_t * th = __kmp_threads[ gtid ];
1303 kmp_team_t * team = th -> th.th_team;
1304
1305 #ifdef KMP_DEBUG
1306 {
1307 const char * buff;
1308 // create format specifiers before the debug output
1309 buff = __kmp_str_format(
1310 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1311 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1312 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1313 __kmp_str_free( &buff );
1314 }
1315 #endif
1316
1317 if ( team -> t.t_serialized ) {
1318 /* NOTE: serialize this dispatch because we are not at the active level */
1319 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1320 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1321 KMP_DEBUG_ASSERT( pr );
1322
1323 if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1324 *p_lb = 0;
1325 *p_ub = 0;
1326 if ( p_st != 0 ) {
1327 *p_st = 0;
1328 }
1329 if ( __kmp_env_consistency_check ) {
1330 if ( pr->pushed_ws != ct_none ) {
1331 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1332 }
1333 }
1334 } else if ( pr->nomerge ) {
1335 kmp_int32 last;
1336 T start;
1337 UT limit, trip, init;
1338 ST incr;
1339 T chunk = pr->u.p.parm1;
1340
1341 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1342
1343 init = chunk * pr->u.p.count++;
1344 trip = pr->u.p.tc - 1;
1345
1346 if ( (status = (init <= trip)) == 0 ) {
1347 *p_lb = 0;
1348 *p_ub = 0;
1349 if ( p_st != 0 ) *p_st = 0;
1350 if ( __kmp_env_consistency_check ) {
1351 if ( pr->pushed_ws != ct_none ) {
1352 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1353 }
1354 }
1355 } else {
1356 start = pr->u.p.lb;
1357 limit = chunk + init - 1;
1358 incr = pr->u.p.st;
1359
1360 if ( (last = (limit >= trip)) != 0 ) {
1361 limit = trip;
1362 #if KMP_OS_WINDOWS
1363 pr->u.p.last_upper = pr->u.p.ub;
1364 #endif /* KMP_OS_WINDOWS */
1365 }
1366 if ( p_last ) {
1367 *p_last = last;
1368 }
1369 if ( p_st != 0 ) {
1370 *p_st = incr;
1371 }
1372 if ( incr == 1 ) {
1373 *p_lb = start + init;
1374 *p_ub = start + limit;
1375 } else {
1376 *p_lb = start + init * incr;
1377 *p_ub = start + limit * incr;
1378 }
1379
1380 if ( pr->ordered ) {
1381 pr->u.p.ordered_lower = init;
1382 pr->u.p.ordered_upper = limit;
1383 #ifdef KMP_DEBUG
1384 {
1385 const char * buff;
1386 // create format specifiers before the debug output
1387 buff = __kmp_str_format(
1388 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1389 traits_t< UT >::spec, traits_t< UT >::spec );
1390 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1391 __kmp_str_free( &buff );
1392 }
1393 #endif
1394 } // if
1395 } // if
1396 } else {
1397 pr->u.p.tc = 0;
1398
1399 *p_lb = pr->u.p.lb;
1400 *p_ub = pr->u.p.ub;
1401 #if KMP_OS_WINDOWS
1402 pr->u.p.last_upper = *p_ub;
1403 #endif /* KMP_OS_WINDOWS */
1404
1405 if ( p_st != 0 ) {
1406 *p_st = pr->u.p.st;
1407 }
1408 if ( p_last ) {
1409 *p_last = TRUE;
1410 }
1411 } // if
1412 #ifdef KMP_DEBUG
1413 {
1414 const char * buff;
1415 // create format specifiers before the debug output
1416 buff = __kmp_str_format(
1417 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1418 "p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
1419 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1420 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, status) );
1421 __kmp_str_free( &buff );
1422 }
1423 #endif
1424 return status;
1425 } else {
1426 kmp_int32 last = 0;
1427 dispatch_shared_info_template< UT > *sh;
1428 T start;
1429 ST incr;
1430 UT limit, trip, init;
1431
1432 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1433 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1434
1435 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1436 ( th->th.th_dispatch->th_dispatch_pr_current );
1437 KMP_DEBUG_ASSERT( pr );
1438 sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1439 ( th->th.th_dispatch->th_dispatch_sh_current );
1440 KMP_DEBUG_ASSERT( sh );
1441
1442 if ( pr->u.p.tc == 0 ) {
1443 // zero trip count
1444 status = 0;
1445 } else {
1446 switch (pr->schedule) {
1447 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1448 case kmp_sch_static_steal:
1449 {
1450 T chunk = pr->u.p.parm1;
1451
1452 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1453
1454 trip = pr->u.p.tc - 1;
1455
1456 if ( ___kmp_size_type > 4 ) {
1457 // Other threads do not look into the data of this thread,
1458 // so a volatile cast is not necessary.
1459 init = ( pr->u.p.count )++;
1460 status = ( init < (UT)pr->u.p.ub );
1461 } else {
1462 typedef union {
1463 struct {
1464 UT count;
1465 T ub;
1466 } p;
1467 kmp_int64 b;
1468 } union_i4;
1469 // All operations on 'count' or 'ub' must be combined atomically together.
1470 // stealing implemented only for 4-byte indexes
1471 {
1472 union_i4 vold, vnew;
1473 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1474 vnew = vold;
1475 vnew.p.count++;
1476 while( ! KMP_COMPARE_AND_STORE_ACQ64(
1477 ( volatile kmp_int64* )&pr->u.p.count,
1478 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1479 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1480 KMP_CPU_PAUSE();
1481 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1482 vnew = vold;
1483 vnew.p.count++;
1484 }
1485 vnew = vold;
1486 init = vnew.p.count;
1487 status = ( init < (UT)vnew.p.ub ) ;
1488 }
1489
1490 if( !status ) {
1491 kmp_info_t **other_threads = team->t.t_threads;
1492 int while_limit = 10;
1493 int while_index = 0;
1494
1495 // TODO: algorithm of searching for a victim
1496 // should be cleaned up and measured
1497 while ( ( !status ) && ( while_limit != ++while_index ) ) {
1498 union_i4 vold, vnew;
1499 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1500 T victimIdx = pr->u.p.parm4;
1501 T oldVictimIdx = victimIdx;
1502 dispatch_private_info_template< T > * victim;
1503
1504 do {
1505 if( !victimIdx ) {
1506 victimIdx = team->t.t_nproc - 1;
1507 } else {
1508 --victimIdx;
1509 }
1510 victim = reinterpret_cast< dispatch_private_info_template< T >* >
1511 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1512 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1513 // TODO: think about a proper place of this test
1514 if ( ( !victim ) ||
1515 ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1516 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1517 // TODO: delay would be nice
1518 continue;
1519 // the victim is not ready yet to participate in stealing
1520 // because the victim is still in kmp_init_dispatch
1521 }
1522 if ( oldVictimIdx == victimIdx ) {
1523 break;
1524 }
1525 pr->u.p.parm4 = victimIdx;
1526
1527 while( 1 ) {
1528 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1529 vnew = vold;
1530
1531 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1532 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1533 break;
1534 }
1535 vnew.p.ub -= (remaining >> 2);
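// Editor's note (illustrative, assumed values): the adjustment above leaves the victim with
// roughly 3/4 of its remaining chunks; e.g. with count = 4 and ub = 12, remaining = 8 and ub
// drops to 10, so the thief takes the two chunks [10,12) if the CAS below succeeds.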
1536 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1537 #pragma warning( push )
1538 // disable warning on pointless comparison of unsigned with 0
1539 #pragma warning( disable: 186 )
1540 KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1541 #pragma warning( pop )
1542 // TODO: Should this be acquire or release?
1543 if ( KMP_COMPARE_AND_STORE_ACQ64(
1544 ( volatile kmp_int64 * )&victim->u.p.count,
1545 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1546 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1547 status = 1;
1548 while_index = 0;
1549 // now update own count and ub
1550 #if KMP_ARCH_X86
1551 // stealing executed on non-KMP_ARCH_X86 only
1552 // Atomic 64-bit write on ia32 is
1553 // unavailable, so we do this in steps.
1554 // This code is not tested.
1555 init = vold.p.count;
1556 pr->u.p.ub = 0;
1557 pr->u.p.count = init + 1;
1558 pr->u.p.ub = vnew.p.count;
1559 #else
1560 init = vnew.p.ub;
1561 vold.p.count = init + 1;
1562 // TODO: is it safe and enough?
1563 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1564 #endif // KMP_ARCH_X86
1565 break;
1566 } // if
1567 KMP_CPU_PAUSE();
1568 } // while (1)
1569 } // while
1570 } // if
1571 } // if
1572 if ( !status ) {
1573 *p_lb = 0;
1574 *p_ub = 0;
1575 if ( p_st != 0 ) *p_st = 0;
1576 } else {
1577 start = pr->u.p.parm2;
1578 init *= chunk;
1579 limit = chunk + init - 1;
1580 incr = pr->u.p.st;
1581
1582 KMP_DEBUG_ASSERT(init <= trip);
1583 if ( (last = (limit >= trip)) != 0 )
1584 limit = trip;
1585 if ( p_last ) {
1586 *p_last = last;
1587 }
1588 if ( p_st != 0 ) *p_st = incr;
1589
1590 if ( incr == 1 ) {
1591 *p_lb = start + init;
1592 *p_ub = start + limit;
1593 } else {
1594 *p_lb = start + init * incr;
1595 *p_ub = start + limit * incr;
1596 }
1597
1598 if ( pr->ordered ) {
1599 pr->u.p.ordered_lower = init;
1600 pr->u.p.ordered_upper = limit;
1601 #ifdef KMP_DEBUG
1602 {
1603 const char * buff;
1604 // create format specifiers before the debug output
1605 buff = __kmp_str_format(
1606 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1607 traits_t< UT >::spec, traits_t< UT >::spec );
1608 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1609 __kmp_str_free( &buff );
1610 }
1611 #endif
1612 } // if
1613 } // if
1614 break;
1615 } // case
1616 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1617 case kmp_sch_static_balanced:
1618 {
1619 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1620 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1621 pr->u.p.count = 1;
1622 *p_lb = pr->u.p.lb;
1623 *p_ub = pr->u.p.ub;
1624 last = pr->u.p.parm1;
1625 if ( p_last ) {
1626 *p_last = last;
1627 }
1628 if ( p_st )
1629 *p_st = pr->u.p.st;
1630 } else { /* no iterations to do */
1631 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1632 }
1633 if ( pr->ordered ) {
1634 #ifdef KMP_DEBUG
1635 {
1636 const char * buff;
1637 // create format specifiers before the debug output
1638 buff = __kmp_str_format(
1639 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1640 traits_t< UT >::spec, traits_t< UT >::spec );
1641 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1642 __kmp_str_free( &buff );
1643 }
1644 #endif
1645 } // if
1646 } // case
1647 break;
1648 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1649 case kmp_sch_static_chunked:
1650 {
1651 T parm1;
1652
1653 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1654 gtid ) );
1655 parm1 = pr->u.p.parm1;
1656
1657 trip = pr->u.p.tc - 1;
1658 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1659
1660 if ( (status = (init <= trip)) != 0 ) {
1661 start = pr->u.p.lb;
1662 incr = pr->u.p.st;
1663 limit = parm1 + init - 1;
1664
1665 if ( (last = (limit >= trip)) != 0 )
1666 limit = trip;
1667
1668 if ( p_last ) {
1669 *p_last = last;
1670 }
1671 if ( p_st != 0 ) *p_st = incr;
1672
1673 pr->u.p.count += team->t.t_nproc;
1674
1675 if ( incr == 1 ) {
1676 *p_lb = start + init;
1677 *p_ub = start + limit;
1678 }
1679 else {
1680 *p_lb = start + init * incr;
1681 *p_ub = start + limit * incr;
1682 }
1683
1684 if ( pr->ordered ) {
1685 pr->u.p.ordered_lower = init;
1686 pr->u.p.ordered_upper = limit;
1687 #ifdef KMP_DEBUG
1688 {
1689 const char * buff;
1690 // create format specifiers before the debug output
1691 buff = __kmp_str_format(
1692 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1693 traits_t< UT >::spec, traits_t< UT >::spec );
1694 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1695 __kmp_str_free( &buff );
1696 }
1697 #endif
1698 } // if
1699 } // if
1700 } // case
1701 break;
1702
1703 case kmp_sch_dynamic_chunked:
1704 {
1705 T chunk = pr->u.p.parm1;
1706
1707 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1708 gtid ) );
1709
1710 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1711 trip = pr->u.p.tc - 1;
1712
1713 if ( (status = (init <= trip)) == 0 ) {
1714 *p_lb = 0;
1715 *p_ub = 0;
1716 if ( p_st != 0 ) *p_st = 0;
1717 } else {
1718 start = pr->u.p.lb;
1719 limit = chunk + init - 1;
1720 incr = pr->u.p.st;
1721
1722 if ( (last = (limit >= trip)) != 0 )
1723 limit = trip;
1724 if ( p_last ) {
1725 *p_last = last;
1726 }
1727 if ( p_st != 0 ) *p_st = incr;
1728
1729 if ( incr == 1 ) {
1730 *p_lb = start + init;
1731 *p_ub = start + limit;
1732 } else {
1733 *p_lb = start + init * incr;
1734 *p_ub = start + limit * incr;
1735 }
1736
1737 if ( pr->ordered ) {
1738 pr->u.p.ordered_lower = init;
1739 pr->u.p.ordered_upper = limit;
1740 #ifdef KMP_DEBUG
1741 {
1742 const char * buff;
1743 // create format specifiers before the debug output
1744 buff = __kmp_str_format(
1745 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1746 traits_t< UT >::spec, traits_t< UT >::spec );
1747 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1748 __kmp_str_free( &buff );
1749 }
1750 #endif
1751 } // if
1752 } // if
1753 } // case
1754 break;
1755
1756 case kmp_sch_guided_iterative_chunked:
1757 {
1758 T chunkspec = pr->u.p.parm1;
1759 KD_TRACE(100,
1760 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1761 trip = pr->u.p.tc;
1762 // Start atomic part of calculations
1763 while(1) {
1764 ST remaining; // signed, because can be < 0
1765 init = sh->u.s.iteration; // shared value
1766 remaining = trip - init;
1767 if ( remaining <= 0 ) { // AC: need to compare with 0 first
1768 // nothing to do, don't try atomic op
1769 status = 0;
1770 break;
1771 }
1772 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1773                         // use dynamic-style schedule
1774                         // atomically increment iterations, get old value
1775 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1776 remaining = trip - init;
1777 if (remaining <= 0) {
1778 status = 0; // all iterations got by other threads
1779 } else {
1780 // got some iterations to work on
1781 status = 1;
1782 if ( (T)remaining > chunkspec ) {
1783 limit = init + chunkspec - 1;
1784 } else {
1785 last = 1; // the last chunk
1786 limit = init + remaining - 1;
1787 } // if
1788 } // if
1789 break;
1790 } // if
1791 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1792 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1793 // CAS was successful, chunk obtained
1794 status = 1;
1795 --limit;
1796 break;
1797 } // if
1798 } // while
1799 if ( status != 0 ) {
1800 start = pr->u.p.lb;
1801 incr = pr->u.p.st;
1802 if ( p_st != NULL )
1803 *p_st = incr;
1804 if ( p_last != NULL )
1805 *p_last = last;
1806 *p_lb = start + init * incr;
1807 *p_ub = start + limit * incr;
1808 if ( pr->ordered ) {
1809 pr->u.p.ordered_lower = init;
1810 pr->u.p.ordered_upper = limit;
1811 #ifdef KMP_DEBUG
1812 {
1813 const char * buff;
1814 // create format specifiers before the debug output
1815 buff = __kmp_str_format(
1816 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1817 traits_t< UT >::spec, traits_t< UT >::spec );
1818 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1819 __kmp_str_free( &buff );
1820 }
1821 #endif
1822 } // if
1823 } else {
1824 *p_lb = 0;
1825 *p_ub = 0;
1826 if ( p_st != NULL )
1827 *p_st = 0;
1828 } // if
1829 } // case
1830 break;
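        /*
         * Added commentary on the guided_iterative case above.  While many
         * iterations remain, a thread proposes taking a fraction of them,
         * limit = init + remaining * parm3, via compare_and_swap on the shared
         * iteration counter; parm3 is assumed (it is set up in __kmp_dispatch_init,
         * not shown here) to hold roughly 1/(K*nproc) with K = 2 by default, so a
         * successful grab takes about remaining/(2*nproc) iterations.  Once
         * remaining falls below parm2 (about K*nproc*(chunk+1)), the code falls
         * back to plain dynamic scheduling with chunks of chunkspec iterations.
         */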
1831
1832 case kmp_sch_guided_analytical_chunked:
1833 {
1834 T chunkspec = pr->u.p.parm1;
1835 UT chunkIdx;
1836 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1837 /* for storing original FPCW value for Windows* OS on
1838 IA-32 architecture 8-byte version */
1839 unsigned int oldFpcw;
1840             unsigned int fpcwSet = 0;
1841         #endif
1842 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1843 gtid ) );
1844
1845 trip = pr->u.p.tc;
1846
1847 KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1848 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1849
1850 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1851 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1852 if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1853 --trip;
1854 /* use dynamic-style scheduling */
1855 init = chunkIdx * chunkspec + pr->u.p.count;
1856 /* need to verify init > 0 in case of overflow in the above calculation */
1857 if ( (status = (init > 0 && init <= trip)) != 0 ) {
1858 limit = init + chunkspec -1;
1859
1860 if ( (last = (limit >= trip)) != 0 )
1861 limit = trip;
1862 }
1863 break;
1864 } else {
1865 /* use exponential-style scheduling */
1866                     /* The following check is to work around the lack of long double precision on Windows* OS.
1867 This check works around the possible effect that init != 0 for chunkIdx == 0.
1868 */
1869 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1870 /* If we haven't already done so, save original
1871 FPCW and set precision to 64-bit, as Windows* OS
1872 on IA-32 architecture defaults to 53-bit */
1873 if ( !fpcwSet ) {
1874                         oldFpcw = _control87(0,0);
1875 _control87(_PC_64,_MCW_PC);
1876                         fpcwSet = 0x30000;
1877 }
1878 #endif
1879 if ( chunkIdx ) {
1880 init = __kmp_dispatch_guided_remaining< T >(
1881 trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1882 KMP_DEBUG_ASSERT(init);
1883 init = trip - init;
1884 } else
1885 init = 0;
1886 limit = trip - __kmp_dispatch_guided_remaining< T >(
1887 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1888 KMP_ASSERT(init <= limit);
1889 if ( init < limit ) {
1890 KMP_DEBUG_ASSERT(limit <= trip);
1891 --limit;
1892 status = 1;
1893 break;
1894 } // if
1895 } // if
1896 } // while (1)
1897 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1898                 /* restore FPCW if necessary
1899 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1900 */
1901 if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1902 _control87(oldFpcw,_MCW_PC);
1903             #endif
1904 if ( status != 0 ) {
1905 start = pr->u.p.lb;
1906 incr = pr->u.p.st;
1907 if ( p_st != NULL )
1908 *p_st = incr;
1909 if ( p_last != NULL )
1910 *p_last = last;
1911 *p_lb = start + init * incr;
1912 *p_ub = start + limit * incr;
1913 if ( pr->ordered ) {
1914 pr->u.p.ordered_lower = init;
1915 pr->u.p.ordered_upper = limit;
1916 #ifdef KMP_DEBUG
1917 {
1918 const char * buff;
1919 // create format specifiers before the debug output
1920 buff = __kmp_str_format(
1921 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1922 traits_t< UT >::spec, traits_t< UT >::spec );
1923 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1924 __kmp_str_free( &buff );
1925 }
1926 #endif
1927 }
1928 } else {
1929 *p_lb = 0;
1930 *p_ub = 0;
1931 if ( p_st != NULL )
1932 *p_st = 0;
1933 }
1934 } // case
1935 break;
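        /*
         * Added commentary on the x87 control-word handling in the analytical case
         * above: 0x30000 is assumed to be the _MCW_PC precision-control mask, so
         * fpcwSet serves both as a "precision was changed" flag and as the mask for
         * deciding whether the saved control word needs restoring; oldFpcw & fpcwSet
         * is zero when the previous setting was already _PC_64.
         */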
1936
1937 case kmp_sch_trapezoidal:
1938 {
1939 UT index;
1940 T parm2 = pr->u.p.parm2;
1941 T parm3 = pr->u.p.parm3;
1942 T parm4 = pr->u.p.parm4;
1943 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
1944 gtid ) );
1945
1946 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
1947
1948 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
1949 trip = pr->u.p.tc - 1;
1950
1951 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
1952 *p_lb = 0;
1953 *p_ub = 0;
1954 if ( p_st != 0 ) *p_st = 0;
1955 } else {
1956 start = pr->u.p.lb;
1957 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
1958 incr = pr->u.p.st;
1959
1960 if ( (last = (limit >= trip)) != 0 )
1961 limit = trip;
1962
1963 if ( p_last != 0 ) {
1964 *p_last = last;
1965 }
1966 if ( p_st != 0 ) *p_st = incr;
1967
1968 if ( incr == 1 ) {
1969 *p_lb = start + init;
1970 *p_ub = start + limit;
1971 } else {
1972 *p_lb = start + init * incr;
1973 *p_ub = start + limit * incr;
1974 }
1975
1976 if ( pr->ordered ) {
1977 pr->u.p.ordered_lower = init;
1978 pr->u.p.ordered_upper = limit;
1979 #ifdef KMP_DEBUG
1980 {
1981 const char * buff;
1982 // create format specifiers before the debug output
1983 buff = __kmp_str_format(
1984 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1985 traits_t< UT >::spec, traits_t< UT >::spec );
1986 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1987 __kmp_str_free( &buff );
1988 }
1989 #endif
1990 } // if
1991 } // if
1992 } // case
1993 break;
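        /*
         * Added commentary on the trapezoidal arithmetic above; the parm meanings
         * are inferred from this use (they are set up in __kmp_dispatch_init, not
         * shown here): chunk i is assumed to have size parm2 - i*parm4, shrinking
         * linearly over parm3 chunks, so the first iteration of chunk "index" is
         * the arithmetic-series prefix sum
         *     init = sum_{i=0..index-1} (parm2 - i*parm4)
         *          = ( index * (2*parm2 - (index-1)*parm4) ) / 2
         * and limit is the same sum taken one chunk further, minus one.
         */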
1994 } // switch
1995     } // if (tc == 0)
1996
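    /*
     * Added commentary: a thread that finds no more chunks (status == 0)
     * atomically bumps the shared num_done counter.  The last of the
     * team->t.t_nproc threads to do so resets the shared buffer state and
     * advances buffer_index by KMP_MAX_DISP_BUF so this dispatch buffer can be
     * recycled by a later loop; every finishing thread then detaches its ordered
     * hooks and its current private/shared buffer pointers.
     */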
1997 if ( status == 0 ) {
1998 UT num_done;
1999
2000 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2001 #ifdef KMP_DEBUG
2002 {
2003 const char * buff;
2004 // create format specifiers before the debug output
2005 buff = __kmp_str_format(
2006 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2007 traits_t< UT >::spec );
2008 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2009 __kmp_str_free( &buff );
2010 }
2011 #endif
2012
2013 if ( num_done == team->t.t_nproc-1 ) {
2014 /* NOTE: release this buffer to be reused */
2015
2016 KMP_MB(); /* Flush all pending memory write invalidates. */
2017
2018 sh->u.s.num_done = 0;
2019 sh->u.s.iteration = 0;
2020
2021 /* TODO replace with general release procedure? */
2022 if ( pr->ordered ) {
2023 sh->u.s.ordered_iteration = 0;
2024 }
2025
2026 KMP_MB(); /* Flush all pending memory write invalidates. */
2027
2028 sh -> buffer_index += KMP_MAX_DISP_BUF;
2029 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2030 gtid, sh->buffer_index) );
2031
2032 KMP_MB(); /* Flush all pending memory write invalidates. */
2033
2034 } // if
2035 if ( __kmp_env_consistency_check ) {
2036 if ( pr->pushed_ws != ct_none ) {
2037 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2038 }
2039 }
2040
2041 th -> th.th_dispatch -> th_deo_fcn = NULL;
2042 th -> th.th_dispatch -> th_dxo_fcn = NULL;
2043 th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2044 th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2045 } // if (status == 0)
2046#if KMP_OS_WINDOWS
2047 else if ( last ) {
2048 pr->u.p.last_upper = pr->u.p.ub;
2049 }
2050#endif /* KMP_OS_WINDOWS */
2051 } // if
2052
2053 #ifdef KMP_DEBUG
2054 {
2055 const char * buff;
2056 // create format specifiers before the debug output
2057 buff = __kmp_str_format(
2058 "__kmp_dispatch_next: T#%%d normal case: " \
2059 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2060 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2061 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2062 __kmp_str_free( &buff );
2063 }
2064 #endif
2065 return status;
2066}
2067
2068//-----------------------------------------------------------------------------------------
2069// Dispatch routines
2070// Transfer call to template< type T >
2071// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2072// T lb, T ub, ST st, ST chunk )
2073extern "C" {
2074
2075/*!
2076@ingroup WORK_SHARING
2077@{
2078@param loc Source location
2079@param gtid Global thread id
2080@param schedule Schedule type
2081@param lb Lower bound
2082@param ub Upper bound
2083@param st Step (or increment if you prefer)
2084@param chunk The chunk size to block with
2085
2086This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2087These functions are all identical apart from the types of the arguments.
2088*/
2089
2090void
2091__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2092 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2093{
2094 KMP_DEBUG_ASSERT( __kmp_init_serial );
2095 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2096}
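/*
 * Usage sketch (illustration only, not part of the runtime): a compiler lowering
 * "#pragma omp for schedule(dynamic, 4)" over i = 0..n-1 with unit stride might
 * initialize the dispatcher as below; loc, gtid, n and the chunk of 4 are
 * assumptions of the example.
 *
 *     __kmpc_dispatch_init_4( &loc, gtid, kmp_sch_dynamic_chunked,
 *                             0, n - 1, 1, 4 );
 */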
2097/*!
2098See @ref __kmpc_dispatch_init_4
2099*/
2100void
2101__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2102 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2103{
2104 KMP_DEBUG_ASSERT( __kmp_init_serial );
2105 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2106}
2107
2108/*!
2109See @ref __kmpc_dispatch_init_4
2110*/
2111void
2112__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2113 kmp_int64 lb, kmp_int64 ub,
2114 kmp_int64 st, kmp_int64 chunk )
2115{
2116 KMP_DEBUG_ASSERT( __kmp_init_serial );
2117 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2118}
2119
2120/*!
2121See @ref __kmpc_dispatch_init_4
2122*/
2123void
2124__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2125 kmp_uint64 lb, kmp_uint64 ub,
2126 kmp_int64 st, kmp_int64 chunk )
2127{
2128 KMP_DEBUG_ASSERT( __kmp_init_serial );
2129 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2130}
2131
2132/*!
2133@param loc Source code location
2134@param gtid Global thread id
2135@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2136@param p_lb Pointer to the lower bound for the next chunk of work
2137@param p_ub Pointer to the upper bound for the next chunk of work
2138@param p_st Pointer to the stride for the next chunk of work
2139@return one if there is work to be done, zero otherwise
2140
2141Get the next dynamically allocated chunk of work for this thread.
2142If there is no more work, then lb, ub and stride need not be modified.
2143*/
2144int
2145__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2146 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2147{
2148 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2149}
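/*
 * Usage sketch (illustration only), continuing the __kmpc_dispatch_init_4 example
 * above: each thread keeps requesting chunks until the runtime reports that no
 * work is left; "body" is a placeholder for the user's loop body.
 *
 *     kmp_int32 lb, ub, st, last;
 *     while ( __kmpc_dispatch_next_4( &loc, gtid, &last, &lb, &ub, &st ) ) {
 *         for ( kmp_int32 i = lb; i <= ub; i += st )
 *             body( i );
 *     }
 */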
2150
2151/*!
2152See @ref __kmpc_dispatch_next_4
2153*/
2154int
2155__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2156 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2157{
2158 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2159}
2160
2161/*!
2162See @ref __kmpc_dispatch_next_4
2163*/
2164int
2165__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2166 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2167{
2168 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2169}
2170
2171/*!
2172See @ref __kmpc_dispatch_next_4
2173*/
2174int
2175__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2176 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2177{
2178 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2179}
2180
2181/*!
2182@param loc Source code location
2183@param gtid Global thread id
2184
2185Mark the end of a dynamic loop.
2186*/
2187void
2188__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2189{
2190 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2191}
2192
2193/*!
2194See @ref __kmpc_dispatch_fini_4
2195*/
2196void
2197__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2198{
2199 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2200}
2201
2202/*!
2203See @ref __kmpc_dispatch_fini_4
2204*/
2205void
2206__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2207{
2208 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2209}
2210
2211/*!
2212See @ref __kmpc_dispatch_fini_4
2213*/
2214void
2215__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2216{
2217 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2218}
2219/*! @} */
2220
2221//-----------------------------------------------------------------------------------------
2222// Non-template routines from kmp_dispatch.cpp used in other sources
2223
2224kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2225 return value == checker;
2226}
2227
2228kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2229 return value != checker;
2230}
2231
2232kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2233 return value < checker;
2234}
2235
2236kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2237 return value >= checker;
2238}
2239
2240kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2241 return value <= checker;
2242}
2243kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2244 return value == checker;
2245}
2246
2247kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2248 return value != checker;
2249}
2250
2251kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2252 return value < checker;
2253}
2254
2255kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2256 return value >= checker;
2257}
2258
2259kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2260 return value <= checker;
2261}
2262
2263kmp_uint32
2264__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2265 kmp_uint32 checker,
2266 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2267 , void * obj // Higher-level synchronization object, or NULL.
2268 )
2269{
2270 // note: we may not belong to a team at this point
2271 register volatile kmp_uint32 * spin = spinner;
2272 register kmp_uint32 check = checker;
2273 register kmp_uint32 spins;
2274 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2275 register kmp_uint32 r;
2276
2277 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2278 KMP_INIT_YIELD( spins );
2279 // main wait spin loop
2280 while(!f(r = TCR_4(*spin), check)) {
2281 KMP_FSYNC_SPIN_PREPARE( obj );
2282 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2283 It causes problems with infinite recursion because of exit lock */
2284 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2285 __kmp_abort_thread(); */
2286
2287 __kmp_static_delay(TRUE);
2288
2289 /* if we have waited a bit, or are oversubscribed, yield */
2290 /* pause is in the following code */
2291 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2292 KMP_YIELD_SPIN( spins );
2293 }
2294 KMP_FSYNC_SPIN_ACQUIRED( obj );
2295 return r;
2296}
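/*
 * Usage sketch (illustration only): spin, yielding as appropriate, until a shared
 * 32-bit location satisfies one of the predicates defined above; "flag" is a
 * hypothetical variable introduced for the example.
 *
 *     volatile kmp_uint32 flag = 0;   // set to 1 elsewhere by another thread
 *     __kmp_wait_yield_4( &flag, 1, __kmp_eq_4, NULL );  // returns once flag == 1
 */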
2297
2298kmp_uint64
2299__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2300 kmp_uint64 checker,
2301 kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2302 , void * obj // Higher-level synchronization object, or NULL.
2303 )
2304{
2305 // note: we may not belong to a team at this point
2306 register volatile kmp_uint64 * spin = spinner;
2307 register kmp_uint64 check = checker;
2308 register kmp_uint32 spins;
2309 register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2310 register kmp_uint64 r;
2311
2312 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2313 KMP_INIT_YIELD( spins );
2314 // main wait spin loop
2315 while(!f(r = *spin, check))
2316 {
2317 KMP_FSYNC_SPIN_PREPARE( obj );
2318 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2319 It causes problems with infinite recursion because of exit lock */
2320 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2321 __kmp_abort_thread(); */
2322
2323 __kmp_static_delay(TRUE);
2324
2325 // if we are oversubscribed,
2326                 // or have waited a bit (and KMP_LIBRARY=throughput), then yield
2327 // pause is in the following code
2328 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2329 KMP_YIELD_SPIN( spins );
2330 }
2331 KMP_FSYNC_SPIN_ACQUIRED( obj );
2332 return r;
2333}
2334
2335} // extern "C"
2336
2337#ifdef KMP_GOMP_COMPAT
2338
2339void
2340__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2341 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2342 kmp_int32 chunk, int push_ws )
2343{
2344 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2345 push_ws );
2346}
2347
2348void
2349__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2350 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2351 kmp_int32 chunk, int push_ws )
2352{
2353 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2354 push_ws );
2355}
2356
2357void
2358__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2359 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2360 kmp_int64 chunk, int push_ws )
2361{
2362 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2363 push_ws );
2364}
2365
2366void
2367__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2368 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2369 kmp_int64 chunk, int push_ws )
2370{
2371 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2372 push_ws );
2373}
2374
2375void
2376__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2377{
2378 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2379}
2380
2381void
2382__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2383{
2384 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2385}
2386
2387void
2388__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2389{
2390 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2391}
2392
2393void
2394__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2395{
2396 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2397}
2398
2399#endif /* KMP_GOMP_COMPAT */
2400
2401/* ------------------------------------------------------------------------ */
2402/* ------------------------------------------------------------------------ */
2403