1/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3 */
4
5
6//===----------------------------------------------------------------------===//
7//
8// The LLVM Compiler Infrastructure
9//
10// This file is dual licensed under the MIT and the University of Illinois Open
11// Source Licenses. See LICENSE.txt for details.
12//
13//===----------------------------------------------------------------------===//
14
15
16/*
17 * Dynamic scheduling initialization and dispatch.
18 *
19 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
20 * it may change values between parallel regions. __kmp_max_nth
21 * is the largest value __kmp_nth may take, 1 is the smallest.
22 *
23 */
24
25/* ------------------------------------------------------------------------ */
26/* ------------------------------------------------------------------------ */
27
28#include "kmp.h"
29#include "kmp_i18n.h"
30#include "kmp_itt.h"
31#include "kmp_str.h"
32#include "kmp_error.h"
33#include "kmp_stats.h"
34#if KMP_OS_WINDOWS && KMP_ARCH_X86
35 #include <float.h>
36#endif
37
38#if OMPT_SUPPORT
39#include "ompt-internal.h"
40#include "ompt-specific.h"
41#endif
42
43/* ------------------------------------------------------------------------ */
44/* ------------------------------------------------------------------------ */
45
46// template for type limits
47template< typename T >
48struct i_maxmin {
49 static const T mx;
50 static const T mn;
51};
52template<>
53struct i_maxmin< int > {
54 static const int mx = 0x7fffffff;
55 static const int mn = 0x80000000;
56};
57template<>
58struct i_maxmin< unsigned int > {
59 static const unsigned int mx = 0xffffffff;
60 static const unsigned int mn = 0x00000000;
61};
62template<>
63struct i_maxmin< long long > {
64 static const long long mx = 0x7fffffffffffffffLL;
65 static const long long mn = 0x8000000000000000LL;
66};
67template<>
68struct i_maxmin< unsigned long long > {
69 static const unsigned long long mx = 0xffffffffffffffffLL;
70 static const unsigned long long mn = 0x0000000000000000LL;
71};
72//-------------------------------------------------------------------------
73
74#ifdef KMP_STATIC_STEAL_ENABLED
75
76 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
77 template< typename T >
78 struct dispatch_private_infoXX_template {
79 typedef typename traits_t< T >::unsigned_t UT;
80 typedef typename traits_t< T >::signed_t ST;
81 UT count; // unsigned
82 T ub;
83 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
84 T lb;
85 ST st; // signed
86 UT tc; // unsigned
87 T static_steal_counter; // for static_steal only; maybe better to put after ub
88
89 /* parm[1-4] are used in different ways by different scheduling algorithms */
90
91 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
92 // a) parm3 is properly aligned and
93 // b) all parm1-4 are in the same cache line.
94 // Because parm1-4 are used together, performance seems to be better
95 // if they are in the same line (not measured though).
96
97 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
98 T parm1;
99 T parm2;
100 T parm3;
101 T parm4;
102 };
103
104 UT ordered_lower; // unsigned
105 UT ordered_upper; // unsigned
106 #if KMP_OS_WINDOWS
107 T last_upper;
108 #endif /* KMP_OS_WINDOWS */
109 };
110
111#else /* KMP_STATIC_STEAL_ENABLED */
112
113 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
114 template< typename T >
115 struct dispatch_private_infoXX_template {
116 typedef typename traits_t< T >::unsigned_t UT;
117 typedef typename traits_t< T >::signed_t ST;
118 T lb;
119 T ub;
120 ST st; // signed
121 UT tc; // unsigned
122
123 T parm1;
124 T parm2;
125 T parm3;
126 T parm4;
127
128 UT count; // unsigned
129
130 UT ordered_lower; // unsigned
131 UT ordered_upper; // unsigned
132 #if KMP_OS_WINDOWS
133 T last_upper;
134 #endif /* KMP_OS_WINDOWS */
135 };
136
137#endif /* KMP_STATIC_STEAL_ENABLED */
138
139// replaces dispatch_private_info structure and dispatch_private_info_t type
140template< typename T >
141struct KMP_ALIGN_CACHE dispatch_private_info_template {
142 // duplicate alignment here, otherwise size of structure is not correct in our compiler
143 union KMP_ALIGN_CACHE private_info_tmpl {
144 dispatch_private_infoXX_template< T > p;
145 dispatch_private_info64_t p64;
146 } u;
147 enum sched_type schedule; /* scheduling algorithm */
148 kmp_uint32 ordered; /* ordered clause specified */
149 kmp_uint32 ordered_bumped;
150 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
151 dispatch_private_info * next; /* stack of buffers for nest of serial regions */
152 kmp_uint32 nomerge; /* don't merge iters if serialized */
153 kmp_uint32 type_size;
154 enum cons_type pushed_ws;
155};
156
157
158// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
159template< typename UT >
160struct dispatch_shared_infoXX_template {
161 /* chunk index under dynamic, number of idle threads under static-steal;
162 iteration index otherwise */
163 volatile UT iteration;
164 volatile UT num_done;
165 volatile UT ordered_iteration;
166 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
167};
168
169// replaces dispatch_shared_info structure and dispatch_shared_info_t type
170template< typename UT >
171struct dispatch_shared_info_template {
172 // we need union here to keep the structure size
173 union shared_info_tmpl {
174 dispatch_shared_infoXX_template< UT > s;
175 dispatch_shared_info64_t s64;
176 } u;
177 volatile kmp_uint32 buffer_index;
178};
179
180/* ------------------------------------------------------------------------ */
181/* ------------------------------------------------------------------------ */
182
183#undef USE_TEST_LOCKS
184
185// test_then_add template (general template should NOT be used)
186template< typename T >
187static __forceinline T
188test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
189
190template<>
191__forceinline kmp_int32
192test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
193{
194 kmp_int32 r;
195 r = KMP_TEST_THEN_ADD32( p, d );
196 return r;
197}
198
199template<>
200__forceinline kmp_int64
201test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
202{
203 kmp_int64 r;
204 r = KMP_TEST_THEN_ADD64( p, d );
205 return r;
206}
207
208// test_then_inc_acq template (general template should NOT be used)
209template< typename T >
210static __forceinline T
211test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
212
213template<>
214__forceinline kmp_int32
215test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
216{
217 kmp_int32 r;
218 r = KMP_TEST_THEN_INC_ACQ32( p );
219 return r;
220}
221
222template<>
223__forceinline kmp_int64
224test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
225{
226 kmp_int64 r;
227 r = KMP_TEST_THEN_INC_ACQ64( p );
228 return r;
229}
230
231// test_then_inc template (general template should NOT be used)
232template< typename T >
233static __forceinline T
234test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
235
236template<>
237__forceinline kmp_int32
238test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
239{
240 kmp_int32 r;
241 r = KMP_TEST_THEN_INC32( p );
242 return r;
243}
244
245template<>
246__forceinline kmp_int64
247test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
248{
249 kmp_int64 r;
250 r = KMP_TEST_THEN_INC64( p );
251 return r;
252}
253
254// compare_and_swap template (general template should NOT be used)
255template< typename T >
256static __forceinline kmp_int32
257compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
258
259template<>
260__forceinline kmp_int32
261compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
262{
263 return KMP_COMPARE_AND_STORE_REL32( p, c, s );
264}
265
266template<>
267__forceinline kmp_int32
268compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
269{
270 return KMP_COMPARE_AND_STORE_REL64( p, c, s );
271}
272
273/*
274 Spin wait loop that first does pause, then yield.
275 Waits until function returns non-zero when called with *spinner and check.
276 Does NOT put threads to sleep.
277#if USE_ITT_BUILD
278 Arguments:
279 obj -- is the higher-level synchronization object to report to ittnotify. It is used to report
280 locks consistently. For example, if the lock is acquired immediately, its address is
281 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
282 immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
283 address, not the address of the low-level spinner.
284#endif // USE_ITT_BUILD
285*/
286template< typename UT >
287// ToDo: make inline function (move to header file for icl)
288static UT // unsigned 4- or 8-byte type
289__kmp_wait_yield( volatile UT * spinner,
290 UT checker,
291 kmp_uint32 (* pred)( UT, UT )
292 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
293 )
294{
295 // note: we may not belong to a team at this point
296 register volatile UT * spin = spinner;
297 register UT check = checker;
298 register kmp_uint32 spins;
299 register kmp_uint32 (*f) ( UT, UT ) = pred;
300 register UT r;
301
302 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
303 KMP_INIT_YIELD( spins );
304 // main wait spin loop
305 while(!f(r = *spin, check))
306 {
307 KMP_FSYNC_SPIN_PREPARE( obj );
308 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
309 It causes problems with infinite recursion because of exit lock */
310 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
311 __kmp_abort_thread(); */
312
313 // if we are oversubscribed,
314 // or have waited a bit (and KMP_LIBRARY=throughput), then yield
315 // pause is in the following code
316 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
317 KMP_YIELD_SPIN( spins );
318 }
319 KMP_FSYNC_SPIN_ACQUIRED( obj );
320 return r;
321}
322
323template< typename UT >
324static kmp_uint32 __kmp_eq( UT value, UT checker) {
325 return value == checker;
326}
327
328template< typename UT >
329static kmp_uint32 __kmp_neq( UT value, UT checker) {
330 return value != checker;
331}
332
333template< typename UT >
334static kmp_uint32 __kmp_lt( UT value, UT checker) {
335 return value < checker;
336}
337
338template< typename UT >
339static kmp_uint32 __kmp_ge( UT value, UT checker) {
340 return value >= checker;
341}
342
343template< typename UT >
344static kmp_uint32 __kmp_le( UT value, UT checker) {
345 return value <= checker;
346}
347
348
349/* ------------------------------------------------------------------------ */
350/* ------------------------------------------------------------------------ */
351
352static void
353__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
354{
355 kmp_info_t *th;
356
357 KMP_DEBUG_ASSERT( gtid_ref );
358
359 if ( __kmp_env_consistency_check ) {
360 th = __kmp_threads[*gtid_ref];
361 if ( th -> th.th_root -> r.r_active
362 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
363#if KMP_USE_DYNAMIC_LOCK
364 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
365#else
366 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
367#endif
368 }
369 }
370}
371
372template< typename UT >
373static void
374__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
375{
376 typedef typename traits_t< UT >::signed_t ST;
377 dispatch_private_info_template< UT > * pr;
378
379 int gtid = *gtid_ref;
380// int cid = *cid_ref;
381 kmp_info_t *th = __kmp_threads[ gtid ];
382 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
383
384 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
385 if ( __kmp_env_consistency_check ) {
386 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
387 ( th -> th.th_dispatch -> th_dispatch_pr_current );
388 if ( pr -> pushed_ws != ct_none ) {
389#if KMP_USE_DYNAMIC_LOCK
390 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
391#else
392 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
393#endif
394 }
395 }
396
397 if ( ! th -> th.th_team -> t.t_serialized ) {
398 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
399 ( th -> th.th_dispatch -> th_dispatch_sh_current );
400 UT lower;
401
402 if ( ! __kmp_env_consistency_check ) {
403 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
404 ( th -> th.th_dispatch -> th_dispatch_pr_current );
405 }
406 lower = pr->u.p.ordered_lower;
407
408 #if ! defined( KMP_GOMP_COMPAT )
409 if ( __kmp_env_consistency_check ) {
410 if ( pr->ordered_bumped ) {
411 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
412 __kmp_error_construct2(
413 kmp_i18n_msg_CnsMultipleNesting,
414 ct_ordered_in_pdo, loc_ref,
415 & p->stack_data[ p->w_top ]
416 );
417 }
418 }
419 #endif /* !defined(KMP_GOMP_COMPAT) */
420
421 KMP_MB();
422 #ifdef KMP_DEBUG
423 {
424 const char * buff;
425 // create format specifiers before the debug output
426 buff = __kmp_str_format(
427 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
428 traits_t< UT >::spec, traits_t< UT >::spec );
429 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
430 __kmp_str_free( &buff );
431 }
432 #endif
433
434 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
435 USE_ITT_BUILD_ARG( NULL )
436 );
437 KMP_MB(); /* is this necessary? */
438 #ifdef KMP_DEBUG
439 {
440 const char * buff;
441 // create format specifiers before the debug output
442 buff = __kmp_str_format(
443 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
444 traits_t< UT >::spec, traits_t< UT >::spec );
445 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
446 __kmp_str_free( &buff );
447 }
448 #endif
449 }
450 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
451}
452
453static void
454__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
455{
456 kmp_info_t *th;
457
458 if ( __kmp_env_consistency_check ) {
459 th = __kmp_threads[*gtid_ref];
460 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
461 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
462 }
463 }
464}
465
466template< typename UT >
467static void
468__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
469{
470 typedef typename traits_t< UT >::signed_t ST;
471 dispatch_private_info_template< UT > * pr;
472
473 int gtid = *gtid_ref;
474// int cid = *cid_ref;
475 kmp_info_t *th = __kmp_threads[ gtid ];
476 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
477
478 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
479 if ( __kmp_env_consistency_check ) {
480 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
481 ( th -> th.th_dispatch -> th_dispatch_pr_current );
482 if ( pr -> pushed_ws != ct_none ) {
483 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
484 }
485 }
486
487 if ( ! th -> th.th_team -> t.t_serialized ) {
488 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
489 ( th -> th.th_dispatch -> th_dispatch_sh_current );
490
491 if ( ! __kmp_env_consistency_check ) {
492 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
493 ( th -> th.th_dispatch -> th_dispatch_pr_current );
494 }
495
496 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
497 #if ! defined( KMP_GOMP_COMPAT )
498 if ( __kmp_env_consistency_check ) {
499 if ( pr->ordered_bumped != 0 ) {
500 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
501 /* How to test it? - OM */
502 __kmp_error_construct2(
503 kmp_i18n_msg_CnsMultipleNesting,
504 ct_ordered_in_pdo, loc_ref,
505 & p->stack_data[ p->w_top ]
506 );
507 }
508 }
509 #endif /* !defined(KMP_GOMP_COMPAT) */
510
511 KMP_MB(); /* Flush all pending memory write invalidates. */
512
513 pr->ordered_bumped += 1;
514
515 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
516 gtid, pr->ordered_bumped ) );
517
518 KMP_MB(); /* Flush all pending memory write invalidates. */
519
520 /* TODO use general release procedure? */
521 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
522
523 KMP_MB(); /* Flush all pending memory write invalidates. */
524 }
525 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
526}
527
528/* Computes and returns x to the power of y, where y must be a non-negative integer */
529template< typename UT >
530static __forceinline long double
531__kmp_pow(long double x, UT y) {
532 long double s=1.0L;
533
534 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
535 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
536 while(y) {
537 if ( y & 1 )
538 s *= x;
539 x *= x;
540 y >>= 1;
541 }
542 return s;
543}
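// Illustration (hypothetical values, not taken from the original source): __kmp_pow(0.75, 5)
// runs the square-and-multiply loop above as 0.75^5 = 0.75 * (0.75^2)^2 = 0.2373046875,
// using O(log y) multiplications instead of y - 1.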
544
545/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
546 (the total number of unassigned iterations in chunks with index greater than or equal to idx).
547 __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
548 (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
549*/
550template< typename T >
551static __inline typename traits_t< T >::unsigned_t
552__kmp_dispatch_guided_remaining(
553 T tc,
554 typename traits_t< T >::floating_t base,
555 typename traits_t< T >::unsigned_t idx
556) {
557 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
558 least for ICL 8.1, long double arithmetic may not really have
559 long double precision, even with /Qlong_double. Currently, we
560 workaround that in the caller code, by manipulating the FPCW for
561 Windows* OS on IA-32 architecture. The lack of precision is not
562 expected to be a correctness issue, though.
563 */
564 typedef typename traits_t< T >::unsigned_t UT;
565
566 long double x = tc * __kmp_pow< UT >(base, idx);
567 UT r = (UT) x;
568 if ( x == r )
569 return r;
570 return r + 1;
571}
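// Worked example (hypothetical values): with tc = 1000 and a base of 1 - 0.5/nproc = 0.875
// (nproc = 4), the estimate of iterations still unassigned after idx = 3 chunks is
// ceil(1000 * 0.875^3) = ceil(669.921875) = 670.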
572
573// Parameters of the guided-iterative algorithm:
574// p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
575// p3 = 1 / ( n * nproc ) // remaining iterations multiplier
576// by default n = 2. For example with n = 3 the chunks distribution will be more flat.
577// With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
578static int guided_int_param = 2;
579static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
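// Worked example (hypothetical values): with nproc = 4 and chunk = 7, the guided-iterative
// setup below stores parm2 = 2 * 4 * (7 + 1) = 64 (the switch-over point to dynamic) and
// parm3 = 0.5 / 4 = 0.125 (the remaining-iterations multiplier).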
580
581// UT - unsigned flavor of T, ST - signed flavor of T,
582// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
583template< typename T >
584static void
585__kmp_dispatch_init(
586 ident_t * loc,
587 int gtid,
588 enum sched_type schedule,
589 T lb,
590 T ub,
591 typename traits_t< T >::signed_t st,
592 typename traits_t< T >::signed_t chunk,
593 int push_ws
594) {
595 typedef typename traits_t< T >::unsigned_t UT;
596 typedef typename traits_t< T >::signed_t ST;
597 typedef typename traits_t< T >::floating_t DBL;
598 static const int ___kmp_size_type = sizeof( UT );
599
600 int active;
601 T tc;
602 kmp_info_t * th;
603 kmp_team_t * team;
604 kmp_uint32 my_buffer_index;
605 dispatch_private_info_template< T > * pr;
606 dispatch_shared_info_template< UT > volatile * sh;
607
608 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
609 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
610
611 if ( ! TCR_4( __kmp_init_parallel ) )
612 __kmp_parallel_initialize();
613
614#if INCLUDE_SSC_MARKS
615 SSC_MARK_DISPATCH_INIT();
616#endif
617 #ifdef KMP_DEBUG
618 {
619 const char * buff;
620 // create format specifiers before the debug output
621 buff = __kmp_str_format(
622 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
623 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
624 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
625 __kmp_str_free( &buff );
626 }
627 #endif
628 /* setup data */
629 th = __kmp_threads[ gtid ];
630 team = th -> th.th_team;
631 active = ! team -> t.t_serialized;
632 th->th.th_ident = loc;
633
634#if USE_ITT_BUILD
635 kmp_uint64 cur_chunk = chunk;
636#endif
637 if ( ! active ) {
638 pr = reinterpret_cast< dispatch_private_info_template< T >* >
639 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
640 } else {
641 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
642 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
643
644 my_buffer_index = th->th.th_dispatch->th_disp_index ++;
645
646 /* What happens when number of threads changes, need to resize buffer? */
647 pr = reinterpret_cast< dispatch_private_info_template< T > * >
648 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
649 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
650 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
651 }
652
653 /* Pick up the nomerge/ordered bits from the scheduling type */
654 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
655 pr->nomerge = TRUE;
656 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
657 } else {
658 pr->nomerge = FALSE;
659 }
660 pr->type_size = ___kmp_size_type; // remember the size of variables
661 if ( kmp_ord_lower & schedule ) {
662 pr->ordered = TRUE;
663 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
664 } else {
665 pr->ordered = FALSE;
666 }
667 if ( schedule == kmp_sch_static ) {
668 schedule = __kmp_static;
669 } else {
670 if ( schedule == kmp_sch_runtime ) {
671 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
672 schedule = team -> t.t_sched.r_sched_type;
673 // Detail the schedule if needed (global controls are differentiated appropriately)
674 if ( schedule == kmp_sch_guided_chunked ) {
675 schedule = __kmp_guided;
676 } else if ( schedule == kmp_sch_static ) {
677 schedule = __kmp_static;
678 }
679 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
680 chunk = team -> t.t_sched.chunk;
681
682 #ifdef KMP_DEBUG
683 {
684 const char * buff;
685 // create format specifiers before the debug output
686 buff = __kmp_str_format(
687 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
688 traits_t< ST >::spec );
689 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
690 __kmp_str_free( &buff );
691 }
692 #endif
693 } else {
694 if ( schedule == kmp_sch_guided_chunked ) {
695 schedule = __kmp_guided;
696 }
697 if ( chunk <= 0 ) {
698 chunk = KMP_DEFAULT_CHUNK;
699 }
700 }
701
702 if ( schedule == kmp_sch_auto ) {
703 // mapping and differentiation: in the __kmp_do_serial_initialize()
704 schedule = __kmp_auto;
705 #ifdef KMP_DEBUG
706 {
707 const char * buff;
708 // create format specifiers before the debug output
709 buff = __kmp_str_format(
710 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
711 traits_t< ST >::spec );
712 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
713 __kmp_str_free( &buff );
714 }
715 #endif
716 }
717
718 /* guided analytical not safe for too many threads */
719 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
720 schedule = kmp_sch_guided_iterative_chunked;
721 KMP_WARNING( DispatchManyThreads );
722 }
723 pr->u.p.parm1 = chunk;
724 }
725 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
726 "unknown scheduling type" );
727
728 pr->u.p.count = 0;
729
730 if ( __kmp_env_consistency_check ) {
731 if ( st == 0 ) {
732 __kmp_error_construct(
733 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
734 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
735 );
736 }
737 }
738
739 tc = ( ub - lb + st );
740 if ( st != 1 ) {
741 if ( st < 0 ) {
742 if ( lb < ub ) {
743 tc = 0; // zero-trip
744 } else { // lb >= ub
745 tc = (ST)tc / st; // convert to signed division
746 }
747 } else { // st > 0
748 if ( ub < lb ) {
749 tc = 0; // zero-trip
750 } else { // lb >= ub
751 tc /= st;
752 }
753 }
754 } else if ( ub < lb ) { // st == 1
755 tc = 0; // zero-trip
756 }
757
758 pr->u.p.lb = lb;
759 pr->u.p.ub = ub;
760 pr->u.p.st = st;
761 pr->u.p.tc = tc;
762
763 #if KMP_OS_WINDOWS
764 pr->u.p.last_upper = ub + st;
765 #endif /* KMP_OS_WINDOWS */
766
767 /* NOTE: only the active parallel region(s) have active ordered sections */
768
769 if ( active ) {
770 if ( pr->ordered == 0 ) {
771 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
772 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
773 } else {
774 pr->ordered_bumped = 0;
775
776 pr->u.p.ordered_lower = 1;
777 pr->u.p.ordered_upper = 0;
778
779 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
780 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
781 }
782 }
783
784 if ( __kmp_env_consistency_check ) {
785 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
786 if ( push_ws ) {
787 __kmp_push_workshare( gtid, ws, loc );
788 pr->pushed_ws = ws;
789 } else {
790 __kmp_check_workshare( gtid, ws, loc );
791 pr->pushed_ws = ct_none;
792 }
793 }
794
795 switch ( schedule ) {
796 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
797 case kmp_sch_static_steal:
798 {
799 T nproc = team->t.t_nproc;
800 T ntc, init;
801
802 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
803
804 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
805 if ( nproc > 1 && ntc >= nproc ) {
806 T id = __kmp_tid_from_gtid(gtid);
807 T small_chunk, extras;
808
809 small_chunk = ntc / nproc;
810 extras = ntc % nproc;
811
812 init = id * small_chunk + ( id < extras ? id : extras );
813 pr->u.p.count = init;
814 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
815
816 pr->u.p.parm2 = lb;
817 //pr->pfields.parm3 = 0; // it's not used in static_steal
818 pr->u.p.parm4 = id;
819 pr->u.p.st = st;
820 break;
821 } else {
822 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
823 gtid ) );
824 schedule = kmp_sch_static_balanced;
825 /* too few iterations: fall-through to kmp_sch_static_balanced */
826 } // if
827 /* FALL-THROUGH to static balanced */
828 } // case
829 #endif
830 case kmp_sch_static_balanced:
831 {
832 T nproc = team->t.t_nproc;
833 T init, limit;
834
835 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
836 gtid ) );
837
838 if ( nproc > 1 ) {
839 T id = __kmp_tid_from_gtid(gtid);
840
841 if ( tc < nproc ) {
842 if ( id < tc ) {
843 init = id;
844 limit = id;
845 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
846 } else {
847 pr->u.p.count = 1; /* means no more chunks to execute */
848 pr->u.p.parm1 = FALSE;
849 break;
850 }
851 } else {
852 T small_chunk = tc / nproc;
853 T extras = tc % nproc;
854 init = id * small_chunk + (id < extras ? id : extras);
855 limit = init + small_chunk - (id < extras ? 0 : 1);
856 pr->u.p.parm1 = (id == nproc - 1);
857 }
858 } else {
859 if ( tc > 0 ) {
860 init = 0;
861 limit = tc - 1;
862 pr->u.p.parm1 = TRUE;
863 } else {
864 // zero trip count
865 pr->u.p.count = 1; /* means no more chunks to execute */
866 pr->u.p.parm1 = FALSE;
867 break;
868 }
869 }
870#if USE_ITT_BUILD
871 // Calculate chunk for metadata report
872 if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) {
873 cur_chunk = limit - init + 1;
874 }
875#endif
876 if ( st == 1 ) {
877 pr->u.p.lb = lb + init;
878 pr->u.p.ub = lb + limit;
879 } else {
880 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
881 pr->u.p.lb = lb + init * st;
882 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
883 if ( st > 0 ) {
884 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
885 } else {
886 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
887 }
888 }
889 if ( pr->ordered ) {
890 pr->u.p.ordered_lower = init;
891 pr->u.p.ordered_upper = limit;
892 }
893 break;
894 } // case
895 case kmp_sch_guided_iterative_chunked :
896 {
897 T nproc = team->t.t_nproc;
898 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
899
900 if ( nproc > 1 ) {
901 if ( (2L * chunk + 1 ) * nproc >= tc ) {
902 /* chunk size too large, switch to dynamic */
903 schedule = kmp_sch_dynamic_chunked;
904 } else {
905 // when remaining iters become less than parm2 - switch to dynamic
906 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
907 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
908 }
909 } else {
910 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
911 schedule = kmp_sch_static_greedy;
912 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
913 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
914 pr->u.p.parm1 = tc;
915 } // if
916 } // case
917 break;
918 case kmp_sch_guided_analytical_chunked:
919 {
920 T nproc = team->t.t_nproc;
921 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
922
923 if ( nproc > 1 ) {
924 if ( (2L * chunk + 1 ) * nproc >= tc ) {
925 /* chunk size too large, switch to dynamic */
926 schedule = kmp_sch_dynamic_chunked;
927 } else {
928 /* commonly used term: (2 nproc - 1)/(2 nproc) */
929 DBL x;
930
931 #if KMP_OS_WINDOWS && KMP_ARCH_X86
932 /* Linux* OS already has 64-bit computation by default for
933 long double, and on Windows* OS on Intel(R) 64,
934 /Qlong_double doesn't work. On Windows* OS
935 on IA-32 architecture, we need to set precision to
936 64-bit instead of the default 53-bit. Even though long
937 double doesn't work on Windows* OS on Intel(R) 64, the
938 resulting lack of precision is not expected to impact
939 the correctness of the algorithm, but this has not been
940 mathematically proven.
941 */
942 // save original FPCW and set precision to 64-bit, as
943 // Windows* OS on IA-32 architecture defaults to 53-bit
944 unsigned int oldFpcw = _control87(0,0);
945 _control87(_PC_64,_MCW_PC); // 0,0x30000
946 #endif
947 /* value used for comparison in solver for cross-over point */
948 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
949
950 /* crossover point--chunk indexes equal to or greater than
951 this point switch to dynamic-style scheduling */
952 UT cross;
953
954 /* commonly used term: (2 nproc - 1)/(2 nproc) */
955 x = (long double)1.0 - (long double)0.5 / nproc;
956
957 #ifdef KMP_DEBUG
958 { // test natural alignment
959 struct _test_a {
960 char a;
961 union {
962 char b;
963 DBL d;
964 };
965 } t;
966 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
967 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
968 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
969 }
970 #endif // KMP_DEBUG
971
972 /* save the term in thread private dispatch structure */
973 *(DBL*)&pr->u.p.parm3 = x;
974
975 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
976 {
977 UT left, right, mid;
978 long double p;
979
980 /* estimate initial upper and lower bound */
981
982 /* doesn't matter what value right is as long as it is positive, but
983 it affects performance of the solver
984 */
985 right = 229;
986 p = __kmp_pow< UT >(x,right);
987 if ( p > target ) {
988 do{
989 p *= p;
990 right <<= 1;
991 } while(p>target && right < (1<<27));
992 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
993 } else {
994 left = 0;
995 }
996
997 /* bisection root-finding method */
998 while ( left + 1 < right ) {
999 mid = (left + right) / 2;
1000 if ( __kmp_pow< UT >(x,mid) > target ) {
1001 left = mid;
1002 } else {
1003 right = mid;
1004 }
1005 } // while
1006 cross = right;
1007 }
1008 /* assert sanity of computed crossover point */
1009 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
1010
1011 /* save the crossover point in thread private dispatch structure */
1012 pr->u.p.parm2 = cross;
1013
1014 // C75803
1015 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1016 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1017 #else
1018 #define GUIDED_ANALYTICAL_WORKAROUND (x)
1019 #endif
1020 /* dynamic-style scheduling offset */
1021 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1022 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1023 // restore FPCW
1024 _control87(oldFpcw,_MCW_PC);
1025 #endif
1026 } // if
1027 } else {
1028 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1029 gtid ) );
1030 schedule = kmp_sch_static_greedy;
1031 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1032 pr->u.p.parm1 = tc;
1033 } // if
1034 } // case
1035 break;
1036 case kmp_sch_static_greedy:
1037 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1038 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1039 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1040 tc;
1041 break;
1042 case kmp_sch_static_chunked :
1043 case kmp_sch_dynamic_chunked :
1044 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1045 break;
1046 case kmp_sch_trapezoidal :
1047 {
1048 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1049
1050 T parm1, parm2, parm3, parm4;
1051 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1052
1053 parm1 = chunk;
1054
1055 /* F : size of the first cycle */
1056 parm2 = ( tc / (2 * team->t.t_nproc) );
1057
1058 if ( parm2 < 1 ) {
1059 parm2 = 1;
1060 }
1061
1062 /* L : size of the last cycle. Make sure the last cycle
1063 * is not larger than the first cycle.
1064 */
1065 if ( parm1 < 1 ) {
1066 parm1 = 1;
1067 } else if ( parm1 > parm2 ) {
1068 parm1 = parm2;
1069 }
1070
1071 /* N : number of cycles */
1072 parm3 = ( parm2 + parm1 );
1073 parm3 = ( 2 * tc + parm3 - 1) / parm3;
1074
1075 if ( parm3 < 2 ) {
1076 parm3 = 2;
1077 }
1078
1079 /* sigma : decreasing incr of the trapezoid */
1080 parm4 = ( parm3 - 1 );
1081 parm4 = ( parm2 - parm1 ) / parm4;
1082
1083 // pointless check, because parm4 >= 0 always
1084 //if ( parm4 < 0 ) {
1085 // parm4 = 0;
1086 //}
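            // Worked example (hypothetical values): tc = 1000, nproc = 4, chunk = 10 give
            // parm1 = 10, parm2 = 1000 / 8 = 125, parm3 = (2000 + 134) / 135 = 15 cycles and
            // parm4 = (125 - 10) / 14 = 8, i.e. chunk sizes 125, 117, 109, ... decreasing by
            // 8 each cycle, down to 13 on the last cycle (never below the minimum of 10).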
1087
1088 pr->u.p.parm1 = parm1;
1089 pr->u.p.parm2 = parm2;
1090 pr->u.p.parm3 = parm3;
1091 pr->u.p.parm4 = parm4;
1092 } // case
1093 break;
1094
1095 default:
1096 {
1097 __kmp_msg(
1098 kmp_ms_fatal, // Severity
1099 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1100 KMP_HNT( GetNewerLibrary ), // Hint
1101 __kmp_msg_null // Variadic argument list terminator
1102 );
1103 }
1104 break;
1105 } // switch
1106 pr->schedule = schedule;
1107 if ( active ) {
1108 /* The name of this buffer should be my_buffer_index when it's free to use it */
1109
1110 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1111 gtid, my_buffer_index, sh->buffer_index) );
1112 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1113 USE_ITT_BUILD_ARG( NULL )
1114 );
1115 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1116 // *always* 32-bit integers.
1117 KMP_MB(); /* is this necessary? */
1118 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1119 gtid, my_buffer_index, sh->buffer_index) );
1120
1121 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1122 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1123#if USE_ITT_BUILD
1124 if ( pr->ordered ) {
1125 __kmp_itt_ordered_init( gtid );
1126 }; // if
1127#endif /* USE_ITT_BUILD */
1128 }; // if
1129
1130#if USE_ITT_BUILD
1131 // Report loop metadata
1132 if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) {
1133 kmp_uint32 tid = __kmp_tid_from_gtid( gtid );
1134 if (KMP_MASTER_TID(tid)) {
1135 kmp_uint64 schedtype = 0;
1136
1137 switch ( schedule ) {
1138 case kmp_sch_static_chunked:
1139 case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1140 break;
1141 case kmp_sch_static_greedy:
1142 cur_chunk = pr->u.p.parm1;
1143 break;
1144 case kmp_sch_dynamic_chunked:
1145 schedtype = 1;
1146 break;
1147 case kmp_sch_guided_iterative_chunked:
1148 case kmp_sch_guided_analytical_chunked:
1149 schedtype = 2;
1150 break;
1151 default:
1152// Should we put this case under "static"?
1153// case kmp_sch_static_steal:
1154 schedtype = 3;
1155 break;
1156 }
1157 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1158 }
1159 }
1160#endif /* USE_ITT_BUILD */
1161
1162 #ifdef KMP_DEBUG
1163 {
1164 const char * buff;
1165 // create format specifiers before the debug output
1166 buff = __kmp_str_format(
1167 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1168 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1169 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1170 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1171 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1172 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1173 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1174 KD_TRACE(10, ( buff,
1175 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1176 pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1177 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1178 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1179 __kmp_str_free( &buff );
1180 }
1181 #endif
1182 #if ( KMP_STATIC_STEAL_ENABLED )
1183 if ( ___kmp_size_type < 8 ) {
1184 // It cannot be guaranteed that after execution of a loop with some other schedule kind
1185 // all the parm3 variables will contain the same value.
1186 // Even if all parm3 values were the same, a bad case would still exist, such as using 0 and 1
1187 // rather than a program life-time increment.
1188 // So the dedicated variable is required. The 'static_steal_counter' is used.
1189 if( schedule == kmp_sch_static_steal ) {
1190 // Other threads will inspect this variable when searching for a victim.
1191 // This is a flag showing that other threads may steal from this thread since then.
1192 volatile T * p = &pr->u.p.static_steal_counter;
1193 *p = *p + 1;
1194 }
1195 }
1196 #endif // ( KMP_STATIC_STEAL_ENABLED )
1197
1198#if OMPT_SUPPORT && OMPT_TRACE
1199 if ((ompt_status == ompt_status_track_callback) &&
1200 ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
1201 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
1202 ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
1203 ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
1204 team_info->parallel_id, task_info->task_id, team_info->microtask);
1205 }
1206#endif
1207}
1208
1209/*
1210 * For ordered loops, either __kmp_dispatch_finish() should be called after
1211 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1212 * every chunk of iterations. If the ordered section(s) were not executed
1213 * for this iteration (or every iteration in this chunk), we need to set the
1214 * ordered iteration counters so that the next thread can proceed.
1215 */
1216template< typename UT >
1217static void
1218__kmp_dispatch_finish( int gtid, ident_t *loc )
1219{
1220 typedef typename traits_t< UT >::signed_t ST;
1221 kmp_info_t *th = __kmp_threads[ gtid ];
1222
1223 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1224 if ( ! th -> th.th_team -> t.t_serialized ) {
1225
1226 dispatch_private_info_template< UT > * pr =
1227 reinterpret_cast< dispatch_private_info_template< UT >* >
1228 ( th->th.th_dispatch->th_dispatch_pr_current );
1229 dispatch_shared_info_template< UT > volatile * sh =
1230 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1231 ( th->th.th_dispatch->th_dispatch_sh_current );
1232 KMP_DEBUG_ASSERT( pr );
1233 KMP_DEBUG_ASSERT( sh );
1234 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1235 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1236
1237 if ( pr->ordered_bumped ) {
1238 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1239 gtid ) );
1240 pr->ordered_bumped = 0;
1241 } else {
1242 UT lower = pr->u.p.ordered_lower;
1243
1244 #ifdef KMP_DEBUG
1245 {
1246 const char * buff;
1247 // create format specifiers before the debug output
1248 buff = __kmp_str_format(
1249 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1250 traits_t< UT >::spec, traits_t< UT >::spec );
1251 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1252 __kmp_str_free( &buff );
1253 }
1254 #endif
1255
1256 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1257 USE_ITT_BUILD_ARG(NULL)
1258 );
1259 KMP_MB(); /* is this necessary? */
1260 #ifdef KMP_DEBUG
1261 {
1262 const char * buff;
1263 // create format specifiers before the debug output
1264 buff = __kmp_str_format(
1265 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1266 traits_t< UT >::spec, traits_t< UT >::spec );
1267 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1268 __kmp_str_free( &buff );
1269 }
1270 #endif
1271
1272 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1273 } // if
1274 } // if
1275 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1276}
1277
1278#ifdef KMP_GOMP_COMPAT
1279
1280template< typename UT >
1281static void
1282__kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1283{
1284 typedef typename traits_t< UT >::signed_t ST;
1285 kmp_info_t *th = __kmp_threads[ gtid ];
1286
1287 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1288 if ( ! th -> th.th_team -> t.t_serialized ) {
1289// int cid;
1290 dispatch_private_info_template< UT > * pr =
1291 reinterpret_cast< dispatch_private_info_template< UT >* >
1292 ( th->th.th_dispatch->th_dispatch_pr_current );
1293 dispatch_shared_info_template< UT > volatile * sh =
1294 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1295 ( th->th.th_dispatch->th_dispatch_sh_current );
1296 KMP_DEBUG_ASSERT( pr );
1297 KMP_DEBUG_ASSERT( sh );
1298 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1299 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1300
1301// for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1302 UT lower = pr->u.p.ordered_lower;
1303 UT upper = pr->u.p.ordered_upper;
1304 UT inc = upper - lower + 1;
1305
1306 if ( pr->ordered_bumped == inc ) {
1307 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1308 gtid ) );
1309 pr->ordered_bumped = 0;
1310 } else {
1311 inc -= pr->ordered_bumped;
1312
1313 #ifdef KMP_DEBUG
1314 {
1315 const char * buff;
1316 // create format specifiers before the debug output
1317 buff = __kmp_str_format(
1318 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1319 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1320 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1321 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1322 __kmp_str_free( &buff );
1323 }
1324 #endif
1325
1326 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1327 USE_ITT_BUILD_ARG(NULL)
1328 );
1329
1330 KMP_MB(); /* is this necessary? */
1331 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1332 gtid ) );
1333 pr->ordered_bumped = 0;
1334//!!!!! TODO check if the inc should be unsigned, or signed???
1335 #ifdef KMP_DEBUG
1336 {
1337 const char * buff;
1338 // create format specifiers before the debug output
1339 buff = __kmp_str_format(
1340 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1341 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1342 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1343 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1344 __kmp_str_free( &buff );
1345 }
1346 #endif
1347
1348 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1349 }
1350// }
1351 }
1352 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1353}
1354
1355#endif /* KMP_GOMP_COMPAT */
1356
1357/* Define a macro for exiting __kmp_dispatch_next(). If status is 0
1358 * (no more work), then tell OMPT the loop is over. In some cases
1359 * kmp_dispatch_fini() is not called. */
1360#if OMPT_SUPPORT && OMPT_TRACE
1361#define OMPT_LOOP_END \
1362 if (status == 0) { \
1363 if ((ompt_status == ompt_status_track_callback) && \
1364 ompt_callbacks.ompt_callback(ompt_event_loop_end)) { \
1365 ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); \
1366 ompt_task_info_t *task_info = __ompt_get_taskinfo(0); \
1367 ompt_callbacks.ompt_callback(ompt_event_loop_end)( \
1368 team_info->parallel_id, task_info->task_id); \
1369 } \
1370 }
1371#else
1372#define OMPT_LOOP_END // no-op
1373#endif
1374
1375template< typename T >
1376static int
1377__kmp_dispatch_next(
1378 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1379) {
1380
1381 typedef typename traits_t< T >::unsigned_t UT;
1382 typedef typename traits_t< T >::signed_t ST;
1383 typedef typename traits_t< T >::floating_t DBL;
1384 static const int ___kmp_size_type = sizeof( UT );
1385
1386 int status;
1387 dispatch_private_info_template< T > * pr;
1388 kmp_info_t * th = __kmp_threads[ gtid ];
1389 kmp_team_t * team = th -> th.th_team;
1390
1391 KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st ); // AC: these cannot be NULL
1392 #ifdef KMP_DEBUG
1393 {
1394 const char * buff;
1395 // create format specifiers before the debug output
1396 buff = __kmp_str_format(
1397 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1398 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1399 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1400 __kmp_str_free( &buff );
1401 }
1402 #endif
1403
1404 if ( team -> t.t_serialized ) {
1405 /* NOTE: serialize this dispatch because we are not at the active level */
1406 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1407 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1408 KMP_DEBUG_ASSERT( pr );
1409
1410 if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1411 *p_lb = 0;
1412 *p_ub = 0;
1413// if ( p_last != NULL )
1414// *p_last = 0;
1415 if ( p_st != NULL )
1416 *p_st = 0;
1417 if ( __kmp_env_consistency_check ) {
1418 if ( pr->pushed_ws != ct_none ) {
1419 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1420 }
1421 }
1422 } else if ( pr->nomerge ) {
1423 kmp_int32 last;
1424 T start;
1425 UT limit, trip, init;
1426 ST incr;
1427 T chunk = pr->u.p.parm1;
1428
1429 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1430
1431 init = chunk * pr->u.p.count++;
1432 trip = pr->u.p.tc - 1;
1433
1434 if ( (status = (init <= trip)) == 0 ) {
1435 *p_lb = 0;
1436 *p_ub = 0;
1437// if ( p_last != NULL )
1438// *p_last = 0;
1439 if ( p_st != NULL )
1440 *p_st = 0;
1441 if ( __kmp_env_consistency_check ) {
1442 if ( pr->pushed_ws != ct_none ) {
1443 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1444 }
1445 }
1446 } else {
1447 start = pr->u.p.lb;
1448 limit = chunk + init - 1;
1449 incr = pr->u.p.st;
1450
1451 if ( (last = (limit >= trip)) != 0 ) {
1452 limit = trip;
1453 #if KMP_OS_WINDOWS
1454 pr->u.p.last_upper = pr->u.p.ub;
1455 #endif /* KMP_OS_WINDOWS */
1456 }
1457 if ( p_last != NULL )
1458 *p_last = last;
1459 if ( p_st != NULL )
1460 *p_st = incr;
1461 if ( incr == 1 ) {
1462 *p_lb = start + init;
1463 *p_ub = start + limit;
1464 } else {
1465 *p_lb = start + init * incr;
1466 *p_ub = start + limit * incr;
1467 }
1468
1469 if ( pr->ordered ) {
1470 pr->u.p.ordered_lower = init;
1471 pr->u.p.ordered_upper = limit;
1472 #ifdef KMP_DEBUG
1473 {
1474 const char * buff;
1475 // create format specifiers before the debug output
1476 buff = __kmp_str_format(
1477 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1478 traits_t< UT >::spec, traits_t< UT >::spec );
1479 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1480 __kmp_str_free( &buff );
1481 }
1482 #endif
1483 } // if
1484 } // if
1485 } else {
1486 pr->u.p.tc = 0;
1487 *p_lb = pr->u.p.lb;
1488 *p_ub = pr->u.p.ub;
1489 #if KMP_OS_WINDOWS
1490 pr->u.p.last_upper = *p_ub;
1491 #endif /* KMP_OS_WINDOWS */
1492 if ( p_last != NULL )
1493 *p_last = TRUE;
1494 if ( p_st != NULL )
1495 *p_st = pr->u.p.st;
1496 } // if
1497 #ifdef KMP_DEBUG
1498 {
1499 const char * buff;
1500 // create format specifiers before the debug output
1501 buff = __kmp_str_format(
1502 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1503 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1504 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1505 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1506 __kmp_str_free( &buff );
1507 }
1508 #endif
1509#if INCLUDE_SSC_MARKS
1510 SSC_MARK_DISPATCH_NEXT();
1511#endif
1512 OMPT_LOOP_END;
1513 return status;
1514 } else {
1515 kmp_int32 last = 0;
1516 dispatch_shared_info_template< UT > *sh;
1517 T start;
1518 ST incr;
1519 UT limit, trip, init;
1520
1521 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1522 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1523
1524 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1525 ( th->th.th_dispatch->th_dispatch_pr_current );
1526 KMP_DEBUG_ASSERT( pr );
1527 sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1528 ( th->th.th_dispatch->th_dispatch_sh_current );
1529 KMP_DEBUG_ASSERT( sh );
1530
1531 if ( pr->u.p.tc == 0 ) {
1532 // zero trip count
1533 status = 0;
1534 } else {
1535 switch (pr->schedule) {
1536 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1537 case kmp_sch_static_steal:
1538 {
1539 T chunk = pr->u.p.parm1;
1540
1541 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1542
1543 trip = pr->u.p.tc - 1;
1544
1545 if ( ___kmp_size_type > 4 ) {
1546 // Other threads do not look into the data of this thread,
1547 // so it's not necessary to make volatile casting.
1548 // so a volatile cast is not necessary.
1549 status = ( init < (UT)pr->u.p.ub );
1550 } else {
1551 typedef union {
1552 struct {
1553 UT count;
1554 T ub;
1555 } p;
1556 kmp_int64 b;
1557 } union_i4;
1558 // All operations on 'count' or 'ub' must be combined atomically together.
1559 // stealing implemented only for 4-byte indexes
1560 {
1561 union_i4 vold, vnew;
1562 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1563 vnew = vold;
1564 vnew.p.count++;
1565 while( ! KMP_COMPARE_AND_STORE_ACQ64(
1566 ( volatile kmp_int64* )&pr->u.p.count,
1567 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1568 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1569 KMP_CPU_PAUSE();
1570 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1571 vnew = vold;
1572 vnew.p.count++;
1573 }
1574 vnew = vold;
1575 init = vnew.p.count;
1576 status = ( init < (UT)vnew.p.ub ) ;
1577 }
1578
1579 if( !status ) {
1580 kmp_info_t **other_threads = team->t.t_threads;
1581 int while_limit = 10;
1582 int while_index = 0;
1583
1584 // TODO: algorithm of searching for a victim
1585 // should be cleaned up and measured
1586 while ( ( !status ) && ( while_limit != ++while_index ) ) {
1587 union_i4 vold, vnew;
1588 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1589 T victimIdx = pr->u.p.parm4;
1590 T oldVictimIdx = victimIdx;
1591 dispatch_private_info_template< T > * victim;
1592
1593 do {
1594 if( !victimIdx ) {
1595 victimIdx = team->t.t_nproc - 1;
1596 } else {
1597 --victimIdx;
1598 }
1599 victim = reinterpret_cast< dispatch_private_info_template< T >* >
1600 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1601 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1602 // TODO: think about a proper place of this test
1603 if ( ( !victim ) ||
1604 ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1605 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1606 // TODO: delay would be nice
1607 continue;
1608 // the victim is not ready yet to participate in stealing
1609 // because the victim is still in kmp_init_dispatch
1610 }
1611 if ( oldVictimIdx == victimIdx ) {
1612 break;
1613 }
1614 pr->u.p.parm4 = victimIdx;
1615
1616 while( 1 ) {
1617 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1618 vnew = vold;
1619
1620 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1621 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1622 break;
1623 }
1624 vnew.p.ub -= (remaining >> 2);
1625 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1626 #pragma warning( push )
1627 // disable warning on pointless comparison of unsigned with 0
1628 #pragma warning( disable: 186 )
1629 KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1630 #pragma warning( pop )
1631 // TODO: Should this be acquire or release?
1632 if ( KMP_COMPARE_AND_STORE_ACQ64(
1633 ( volatile kmp_int64 * )&victim->u.p.count,
1634 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1635 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1636 status = 1;
1637 while_index = 0;
1638 // now update own count and ub
1639 #if KMP_ARCH_X86
1640 // stealing executed on non-KMP_ARCH_X86 only
1641 // Atomic 64-bit write on ia32 is
1642 // unavailable, so we do this in steps.
1643 // This code is not tested.
1644 init = vold.p.count;
1645 pr->u.p.ub = 0;
1646 pr->u.p.count = init + 1;
1647 pr->u.p.ub = vnew.p.count;
1648 #else
1649 init = vnew.p.ub;
1650 vold.p.count = init + 1;
1651 // TODO: is it safe and enough?
1652 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1653 #endif // KMP_ARCH_X86
1654 break;
1655 } // if
1656 KMP_CPU_PAUSE();
1657 } // while (1)
1658 } // while
1659 } // if
1660 } // if
1661 if ( !status ) {
1662 *p_lb = 0;
1663 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001664 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001665 } else {
1666 start = pr->u.p.parm2;
1667 init *= chunk;
1668 limit = chunk + init - 1;
1669 incr = pr->u.p.st;
1670
1671 KMP_DEBUG_ASSERT(init <= trip);
1672 if ( (last = (limit >= trip)) != 0 )
1673 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001674 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001675
1676 if ( incr == 1 ) {
1677 *p_lb = start + init;
1678 *p_ub = start + limit;
1679 } else {
1680 *p_lb = start + init * incr;
1681 *p_ub = start + limit * incr;
1682 }
1683
1684 if ( pr->ordered ) {
1685 pr->u.p.ordered_lower = init;
1686 pr->u.p.ordered_upper = limit;
1687 #ifdef KMP_DEBUG
1688 {
1689 const char * buff;
1690 // create format specifiers before the debug output
1691 buff = __kmp_str_format(
1692 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1693 traits_t< UT >::spec, traits_t< UT >::spec );
1694 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1695 __kmp_str_free( &buff );
1696 }
1697 #endif
1698 } // if
1699 } // if
1700 break;
1701 } // case
1702 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
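        // A standalone sketch, in plain C++ types, of the 64-bit packing the
        // static_steal case relies on: 'count' and 'ub' share one word, so claiming
        // an iteration locally and stealing part of a victim's range are both single
        // compare-and-swap operations on that word.
        #if 0
        union packed_range {
            struct { unsigned count; unsigned ub; } p;  // same layout idea as union_i4
            long long b;                                // the whole 64-bit word for the CAS
        };
        // claim one iteration locally:  vnew.p.count = vold.p.count + 1;
        // steal from a victim:          vnew.p.ub   -= (vold.p.ub - vold.p.count) / 4;
        // Either update is published only if CAS( &range->b, vold.b, vnew.b ) succeeds,
        // so an owner's claim and a thief's steal can never both be applied to the
        // same snapshot of the range.
        #endif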
1703 case kmp_sch_static_balanced:
1704 {
1705 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
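            // The per-thread [lb, ub] range and the "this thread executes the last
            // iteration" flag (parm1) were precomputed at dispatch initialization;
            // u.p.count acts as a one-shot latch so the range is returned exactly once.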
1706 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1707 pr->u.p.count = 1;
1708 *p_lb = pr->u.p.lb;
1709 *p_ub = pr->u.p.ub;
1710 last = pr->u.p.parm1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001711 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001712 *p_st = pr->u.p.st;
1713 } else { /* no iterations to do */
1714 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1715 }
1716 if ( pr->ordered ) {
1717 #ifdef KMP_DEBUG
1718 {
1719 const char * buff;
1720 // create format specifiers before the debug output
1721 buff = __kmp_str_format(
1722 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1723 traits_t< UT >::spec, traits_t< UT >::spec );
1724 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1725 __kmp_str_free( &buff );
1726 }
1727 #endif
1728 } // if
1729 } // case
1730 break;
1731 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1732 case kmp_sch_static_chunked:
1733 {
1734 T parm1;
1735
1736 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1737 gtid ) );
1738 parm1 = pr->u.p.parm1;
1739
1740 trip = pr->u.p.tc - 1;
1741 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1742
1743 if ( (status = (init <= trip)) != 0 ) {
1744 start = pr->u.p.lb;
1745 incr = pr->u.p.st;
1746 limit = parm1 + init - 1;
1747
1748 if ( (last = (limit >= trip)) != 0 )
1749 limit = trip;
1750
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001751 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001752
1753 pr->u.p.count += team->t.t_nproc;
1754
1755 if ( incr == 1 ) {
1756 *p_lb = start + init;
1757 *p_ub = start + limit;
1758 }
1759 else {
1760 *p_lb = start + init * incr;
1761 *p_ub = start + limit * incr;
1762 }
1763
1764 if ( pr->ordered ) {
1765 pr->u.p.ordered_lower = init;
1766 pr->u.p.ordered_upper = limit;
1767 #ifdef KMP_DEBUG
1768 {
1769 const char * buff;
1770 // create format specifiers before the debug output
1771 buff = __kmp_str_format(
1772 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1773 traits_t< UT >::spec, traits_t< UT >::spec );
1774 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1775 __kmp_str_free( &buff );
1776 }
1777 #endif
1778 } // if
1779 } // if
1780 } // case
1781 break;
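        // A small worked example of the blocked-cyclic hand-out above, using
        // hypothetical numbers and plain unsigned arithmetic (parm1 is the chunk
        // size; u.p.count advances by nproc after every successful call):
        #if 0
        unsigned parm1 = 5, tid = 2, count = 0, nproc = 4, tc = 37;
        unsigned trip  = tc - 1;                 // 36, index of the last iteration
        unsigned init  = parm1 * (count + tid);  // 10, first index of this thread's chunk
        unsigned limit = parm1 + init - 1;       // 14, last index of this thread's chunk
        // next call: count == nproc, so init becomes 5 * (4 + 2) = 30, skipping the
        // chunks handed out to the other three threads.
        #endif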
1782
1783 case kmp_sch_dynamic_chunked:
1784 {
1785 T chunk = pr->u.p.parm1;
1786
1787 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1788 gtid ) );
1789
1790 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1791 trip = pr->u.p.tc - 1;
1792
1793 if ( (status = (init <= trip)) == 0 ) {
1794 *p_lb = 0;
1795 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001796 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001797 } else {
1798 start = pr->u.p.lb;
1799 limit = chunk + init - 1;
1800 incr = pr->u.p.st;
1801
1802 if ( (last = (limit >= trip)) != 0 )
1803 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001804
1805 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001806
1807 if ( incr == 1 ) {
1808 *p_lb = start + init;
1809 *p_ub = start + limit;
1810 } else {
1811 *p_lb = start + init * incr;
1812 *p_ub = start + limit * incr;
1813 }
1814
1815 if ( pr->ordered ) {
1816 pr->u.p.ordered_lower = init;
1817 pr->u.p.ordered_upper = limit;
1818 #ifdef KMP_DEBUG
1819 {
1820 const char * buff;
1821 // create format specifiers before the debug output
1822 buff = __kmp_str_format(
1823 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1824 traits_t< UT >::spec, traits_t< UT >::spec );
1825 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1826 __kmp_str_free( &buff );
1827 }
1828 #endif
1829 } // if
1830 } // if
1831 } // case
1832 break;
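        // The schedule above is essentially a shared ticket counter: each call
        // atomically bumps sh->u.s.iteration, and the old value times the chunk size
        // is the first iteration of the claimed chunk. Sketch with hypothetical numbers:
        #if 0
        unsigned chunk = 8, ticket = 3 /* old value from the atomic increment */, tc = 100;
        unsigned init  = chunk * ticket;      // 24, first iteration of this chunk
        unsigned limit = chunk + init - 1;    // 31, last iteration of this chunk
        int      last  = (limit >= tc - 1);   // 0, more chunks remain
        #endif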
1833
1834 case kmp_sch_guided_iterative_chunked:
1835 {
1836 T chunkspec = pr->u.p.parm1;
1837 KD_TRACE(100,
1838 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1839 trip = pr->u.p.tc;
1840 // Start atomic part of calculations
1841 while(1) {
1842                 ST remaining; // signed, because it can be < 0
1843 init = sh->u.s.iteration; // shared value
1844 remaining = trip - init;
1845 if ( remaining <= 0 ) { // AC: need to compare with 0 first
1846 // nothing to do, don't try atomic op
1847 status = 0;
1848 break;
1849 }
1850 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1851                     // use dynamic-style schedule
1852                     // atomically increment iterations, get old value
1853 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1854 remaining = trip - init;
1855 if (remaining <= 0) {
1856 status = 0; // all iterations got by other threads
1857 } else {
1858 // got some iterations to work on
1859 status = 1;
1860 if ( (T)remaining > chunkspec ) {
1861 limit = init + chunkspec - 1;
1862 } else {
1863 last = 1; // the last chunk
1864 limit = init + remaining - 1;
1865 } // if
1866 } // if
1867 break;
1868 } // if
1869 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1870 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1871 // CAS was successful, chunk obtained
1872 status = 1;
1873 --limit;
1874 break;
1875 } // if
1876 } // while
1877 if ( status != 0 ) {
1878 start = pr->u.p.lb;
1879 incr = pr->u.p.st;
1880 if ( p_st != NULL )
1881 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001882 *p_lb = start + init * incr;
1883 *p_ub = start + limit * incr;
1884 if ( pr->ordered ) {
1885 pr->u.p.ordered_lower = init;
1886 pr->u.p.ordered_upper = limit;
1887 #ifdef KMP_DEBUG
1888 {
1889 const char * buff;
1890 // create format specifiers before the debug output
1891 buff = __kmp_str_format(
1892 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1893 traits_t< UT >::spec, traits_t< UT >::spec );
1894 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1895 __kmp_str_free( &buff );
1896 }
1897 #endif
1898 } // if
1899 } else {
1900 *p_lb = 0;
1901 *p_ub = 0;
1902 if ( p_st != NULL )
1903 *p_st = 0;
1904 } // if
1905 } // case
1906 break;
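        // A sketch of the guided step above, with hypothetical numbers: parm3 holds a
        // precomputed reciprocal (roughly 1/(K*nproc), K = 2 by default per the comments
        // above), so each successful CAS claims about remaining/(K*nproc) iterations,
        // and the schedule falls back to plain dynamic chunks of size chunkspec once
        // few iterations remain.
        #if 0
        unsigned trip = 1000, init = 400;              // 600 iterations still unclaimed
        double   parm3 = 1.0 / (2 * 4);                // assuming K = 2 and nproc = 4
        unsigned limit = init + (unsigned)((trip - init) * parm3);  // init + 75 = 475
        // CAS( &sh->u.s.iteration, init, limit ); on success this thread owns
        // iterations [init, limit - 1], i.e. 400..474.
        #endif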
1907
1908 case kmp_sch_guided_analytical_chunked:
1909 {
1910 T chunkspec = pr->u.p.parm1;
1911 UT chunkIdx;
1912 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1913 /* for storing original FPCW value for Windows* OS on
1914 IA-32 architecture 8-byte version */
1915 unsigned int oldFpcw;
Jim Cownie181b4bb2013-12-23 17:28:57 +00001916 unsigned int fpcwSet = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001917 #endif
1918 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1919 gtid ) );
1920
1921 trip = pr->u.p.tc;
1922
1923 KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1924 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1925
1926 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1927 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1928 if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1929 --trip;
1930 /* use dynamic-style scheduling */
1931 init = chunkIdx * chunkspec + pr->u.p.count;
1932 /* need to verify init > 0 in case of overflow in the above calculation */
1933 if ( (status = (init > 0 && init <= trip)) != 0 ) {
1934 limit = init + chunkspec -1;
1935
1936 if ( (last = (limit >= trip)) != 0 )
1937 limit = trip;
1938 }
1939 break;
1940 } else {
1941 /* use exponential-style scheduling */
1942             /* The following check works around the lack of long double precision on Windows* OS,
1943                which may otherwise cause init != 0 for chunkIdx == 0.
1944             */
1945 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1946 /* If we haven't already done so, save original
1947 FPCW and set precision to 64-bit, as Windows* OS
1948 on IA-32 architecture defaults to 53-bit */
1949 if ( !fpcwSet ) {
Jim Cownie181b4bb2013-12-23 17:28:57 +00001950 oldFpcw = _control87(0,0);
1951 _control87(_PC_64,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001952 fpcwSet = 0x30000;
1953 }
1954 #endif
1955 if ( chunkIdx ) {
1956 init = __kmp_dispatch_guided_remaining< T >(
1957 trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1958 KMP_DEBUG_ASSERT(init);
1959 init = trip - init;
1960 } else
1961 init = 0;
1962 limit = trip - __kmp_dispatch_guided_remaining< T >(
1963 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1964 KMP_ASSERT(init <= limit);
1965 if ( init < limit ) {
1966 KMP_DEBUG_ASSERT(limit <= trip);
1967 --limit;
1968 status = 1;
1969 break;
1970 } // if
1971 } // if
1972 } // while (1)
1973 #if KMP_OS_WINDOWS && KMP_ARCH_X86
Jim Cownie181b4bb2013-12-23 17:28:57 +00001974 /* restore FPCW if necessary
1975 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1976 */
1977 if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1978 _control87(oldFpcw,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001979 #endif
1980 if ( status != 0 ) {
1981 start = pr->u.p.lb;
1982 incr = pr->u.p.st;
1983 if ( p_st != NULL )
1984 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001985 *p_lb = start + init * incr;
1986 *p_ub = start + limit * incr;
1987 if ( pr->ordered ) {
1988 pr->u.p.ordered_lower = init;
1989 pr->u.p.ordered_upper = limit;
1990 #ifdef KMP_DEBUG
1991 {
1992 const char * buff;
1993 // create format specifiers before the debug output
1994 buff = __kmp_str_format(
1995 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1996 traits_t< UT >::spec, traits_t< UT >::spec );
1997 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1998 __kmp_str_free( &buff );
1999 }
2000 #endif
2001 }
2002 } else {
2003 *p_lb = 0;
2004 *p_ub = 0;
2005 if ( p_st != NULL )
2006 *p_st = 0;
2007 }
2008 } // case
2009 break;
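        // The Windows/IA-32 idiom used above, as a standalone sketch: save the x87
        // control word, force 64-bit significand precision for the long double math,
        // then restore the caller's setting (_control87 is declared in <float.h>):
        #if 0
        unsigned int oldFpcw = _control87(0, 0);   // read the current control word
        _control87(_PC_64, _MCW_PC);               // 64-bit precision (default is 53-bit)
        /* ... long double computations ... */
        _control87(oldFpcw, _MCW_PC);              // restore the original precision bits
        #endif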
2010
2011 case kmp_sch_trapezoidal:
2012 {
2013 UT index;
2014 T parm2 = pr->u.p.parm2;
2015 T parm3 = pr->u.p.parm3;
2016 T parm4 = pr->u.p.parm4;
2017 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2018 gtid ) );
2019
2020 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
2021
2022 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
2023 trip = pr->u.p.tc - 1;
2024
2025 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
2026 *p_lb = 0;
2027 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002028 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002029 } else {
2030 start = pr->u.p.lb;
2031 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
2032 incr = pr->u.p.st;
2033
2034 if ( (last = (limit >= trip)) != 0 )
2035 limit = trip;
2036
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002037 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002038
2039 if ( incr == 1 ) {
2040 *p_lb = start + init;
2041 *p_ub = start + limit;
2042 } else {
2043 *p_lb = start + init * incr;
2044 *p_ub = start + limit * incr;
2045 }
2046
2047 if ( pr->ordered ) {
2048 pr->u.p.ordered_lower = init;
2049 pr->u.p.ordered_upper = limit;
2050 #ifdef KMP_DEBUG
2051 {
2052 const char * buff;
2053 // create format specifiers before the debug output
2054 buff = __kmp_str_format(
2055 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2056 traits_t< UT >::spec, traits_t< UT >::spec );
2057 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2058 __kmp_str_free( &buff );
2059 }
2060 #endif
2061 } // if
2062 } // if
2063 } // case
2064 break;
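        // With the formulas above, chunk number 'index' contains parm2 - index*parm4
        // iterations, i.e. chunk sizes shrink linearly from the initial size parm2.
        // A worked example with hypothetical values parm2 = 10 and parm4 = 2:
        #if 0
        unsigned parm2 = 10, parm4 = 2, index = 1;
        unsigned init  = (  index    * (2*parm2 - (index-1)*parm4) ) / 2;      // 10
        unsigned limit = ( (index+1) * (2*parm2 -  index   *parm4) ) / 2 - 1;  // 17
        // chunk 0 covered iterations 0..9 (10 of them); chunk 1 covers 10..17 (8 of them).
        #endif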
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002065 default:
2066 {
2067 status = 0; // to avoid complaints on uninitialized variable use
2068 __kmp_msg(
2069 kmp_ms_fatal, // Severity
2070 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2071 KMP_HNT( GetNewerLibrary ), // Hint
2072 __kmp_msg_null // Variadic argument list terminator
2073 );
2074 }
2075 break;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002076 } // switch
2077 } // if tc == 0;
2078
2079 if ( status == 0 ) {
2080 UT num_done;
2081
2082 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2083 #ifdef KMP_DEBUG
2084 {
2085 const char * buff;
2086 // create format specifiers before the debug output
2087 buff = __kmp_str_format(
2088 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2089 traits_t< UT >::spec );
2090 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2091 __kmp_str_free( &buff );
2092 }
2093 #endif
2094
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002095 if ( (ST)num_done == team->t.t_nproc-1 ) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00002096 /* NOTE: release this buffer to be reused */
2097
2098 KMP_MB(); /* Flush all pending memory write invalidates. */
2099
2100 sh->u.s.num_done = 0;
2101 sh->u.s.iteration = 0;
2102
2103 /* TODO replace with general release procedure? */
2104 if ( pr->ordered ) {
2105 sh->u.s.ordered_iteration = 0;
2106 }
2107
2108 KMP_MB(); /* Flush all pending memory write invalidates. */
2109
2110 sh -> buffer_index += KMP_MAX_DISP_BUF;
2111 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2112 gtid, sh->buffer_index) );
2113
2114 KMP_MB(); /* Flush all pending memory write invalidates. */
2115
2116 } // if
2117 if ( __kmp_env_consistency_check ) {
2118 if ( pr->pushed_ws != ct_none ) {
2119 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2120 }
2121 }
2122
2123 th -> th.th_dispatch -> th_deo_fcn = NULL;
2124 th -> th.th_dispatch -> th_dxo_fcn = NULL;
2125 th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2126 th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2127 } // if (status == 0)
2128#if KMP_OS_WINDOWS
2129 else if ( last ) {
2130 pr->u.p.last_upper = pr->u.p.ub;
2131 }
2132#endif /* KMP_OS_WINDOWS */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002133 if ( p_last != NULL && status != 0 )
2134 *p_last = last;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002135 } // if
2136
2137 #ifdef KMP_DEBUG
2138 {
2139 const char * buff;
2140 // create format specifiers before the debug output
2141 buff = __kmp_str_format(
2142 "__kmp_dispatch_next: T#%%d normal case: " \
2143 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2144 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2145 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2146 __kmp_str_free( &buff );
2147 }
2148 #endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002149#if INCLUDE_SSC_MARKS
2150 SSC_MARK_DISPATCH_NEXT();
2151#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00002152 OMPT_LOOP_END;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002153 return status;
2154}
2155
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002156template< typename T >
2157static void
2158__kmp_dist_get_bounds(
2159 ident_t *loc,
2160 kmp_int32 gtid,
2161 kmp_int32 *plastiter,
2162 T *plower,
2163 T *pupper,
2164 typename traits_t< T >::signed_t incr
2165) {
2166 KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
2167 typedef typename traits_t< T >::unsigned_t UT;
2168 typedef typename traits_t< T >::signed_t ST;
2169 register kmp_uint32 team_id;
2170 register kmp_uint32 nteams;
2171 register UT trip_count;
2172 register kmp_team_t *team;
2173 kmp_info_t * th;
2174
2175 KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2176 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2177 #ifdef KMP_DEBUG
2178 {
2179 const char * buff;
2180 // create format specifiers before the debug output
2181 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2182 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2183 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2184 traits_t< T >::spec );
2185 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2186 __kmp_str_free( &buff );
2187 }
2188 #endif
2189
2190 if( __kmp_env_consistency_check ) {
2191 if( incr == 0 ) {
2192 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2193 }
2194 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2195 // The loop is illegal.
2196            // Some zero-trip loops are maintained by the compiler, e.g.:
2197 // for(i=10;i<0;++i) // lower >= upper - run-time check
2198 // for(i=0;i>10;--i) // lower <= upper - run-time check
2199 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2200 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2201 // Compiler does not check the following illegal loops:
2202 // for(i=0;i<10;i+=incr) // where incr<0
2203 // for(i=10;i>0;i-=incr) // where incr<0
2204 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2205 }
2206 }
2207 th = __kmp_threads[gtid];
2208 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2209 team = th->th.th_team;
2210 #if OMP_40_ENABLED
2211 nteams = th->th.th_teams_size.nteams;
2212 #endif
2213 team_id = team->t.t_master_tid;
2214 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2215
2216 // compute global trip count
2217 if( incr == 1 ) {
2218 trip_count = *pupper - *plower + 1;
2219 } else if(incr == -1) {
2220 trip_count = *plower - *pupper + 1;
2221 } else {
2222 trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2223 }
2224 if( trip_count <= nteams ) {
2225 KMP_DEBUG_ASSERT(
2226 __kmp_static == kmp_sch_static_greedy || \
2227 __kmp_static == kmp_sch_static_balanced
2228 ); // Unknown static scheduling type.
2229        // only some teams get a single iteration; the others get nothing
2230 if( team_id < trip_count ) {
2231 *pupper = *plower = *plower + team_id * incr;
2232 } else {
2233 *plower = *pupper + incr; // zero-trip loop
2234 }
2235 if( plastiter != NULL )
2236 *plastiter = ( team_id == trip_count - 1 );
2237 } else {
2238 if( __kmp_static == kmp_sch_static_balanced ) {
2239 register UT chunk = trip_count / nteams;
2240 register UT extras = trip_count % nteams;
2241 *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2242 *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2243 if( plastiter != NULL )
2244 *plastiter = ( team_id == nteams - 1 );
2245 } else {
2246 register T chunk_inc_count =
2247 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2248 register T upper = *pupper;
2249 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2250 // Unknown static scheduling type.
2251 *plower += team_id * chunk_inc_count;
2252 *pupper = *plower + chunk_inc_count - incr;
2253 // Check/correct bounds if needed
2254 if( incr > 0 ) {
2255 if( *pupper < *plower )
2256 *pupper = i_maxmin< T >::mx;
2257 if( plastiter != NULL )
2258 *plastiter = *plower <= upper && *pupper > upper - incr;
2259 if( *pupper > upper )
2260 *pupper = upper; // tracker C73258
2261 } else {
2262 if( *pupper > *plower )
2263 *pupper = i_maxmin< T >::mn;
2264 if( plastiter != NULL )
2265 *plastiter = *plower >= upper && *pupper < upper - incr;
2266 if( *pupper < upper )
2267 *pupper = upper; // tracker C73258
2268 }
2269 }
2270 }
2271}
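// A worked example of the static_balanced split above, with hypothetical numbers:
// trip_count = 10 iterations over nteams = 4 teams gives chunk = 2 and extras = 2,
// so teams 0 and 1 receive 3 iterations each (indices 0-2 and 3-5) and teams 2 and 3
// receive 2 each (6-7 and 8-9); only team 3, which owns the last iteration, sets *plastiter.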
2272
Jim Cownie5e8470a2013-09-27 10:38:44 +00002273//-----------------------------------------------------------------------------------------
2274// Dispatch routines
2275// Transfer call to template< type T >
2276// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2277// T lb, T ub, ST st, ST chunk )
2278extern "C" {
2279
2280/*!
2281@ingroup WORK_SHARING
2282@{
2283@param loc Source location
2284@param gtid Global thread id
2285@param schedule Schedule type
2286@param lb Lower bound
2287@param ub Upper bound
2288@param st Step (or increment if you prefer)
2289@param chunk The chunk size to block with
2290
2291This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2292These functions are all identical apart from the types of the arguments.
2293*/
2294
2295void
2296__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2297 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2298{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002299 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002300 KMP_DEBUG_ASSERT( __kmp_init_serial );
2301 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2302}
2303/*!
2304See @ref __kmpc_dispatch_init_4
2305*/
2306void
2307__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2308 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2309{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002310 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002311 KMP_DEBUG_ASSERT( __kmp_init_serial );
2312 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2313}
2314
2315/*!
2316See @ref __kmpc_dispatch_init_4
2317*/
2318void
2319__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2320 kmp_int64 lb, kmp_int64 ub,
2321 kmp_int64 st, kmp_int64 chunk )
2322{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002323 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002324 KMP_DEBUG_ASSERT( __kmp_init_serial );
2325 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2326}
2327
2328/*!
2329See @ref __kmpc_dispatch_init_4
2330*/
2331void
2332__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2333 kmp_uint64 lb, kmp_uint64 ub,
2334 kmp_int64 st, kmp_int64 chunk )
2335{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002336 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002337 KMP_DEBUG_ASSERT( __kmp_init_serial );
2338 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2339}
2340
2341/*!
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002342See @ref __kmpc_dispatch_init_4
2343
2344These functions differ from the __kmpc_dispatch_init set in that they are
2345called for the composite 'distribute parallel for' construct. Therefore, before
2346dispatching the regular iterations, the per-team iteration space must be computed.
2347
2348These functions are all identical apart from the types of the arguments.
2349*/
2350void
2351__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2352 kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2353{
2354 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2355 KMP_DEBUG_ASSERT( __kmp_init_serial );
2356 __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2357 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2358}
2359
2360void
2361__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2362 kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2363{
2364 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2365 KMP_DEBUG_ASSERT( __kmp_init_serial );
2366 __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2367 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2368}
2369
2370void
2371__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2372 kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2373{
2374 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2375 KMP_DEBUG_ASSERT( __kmp_init_serial );
2376 __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2377 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2378}
2379
2380void
2381__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2382 kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2383{
2384 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2385 KMP_DEBUG_ASSERT( __kmp_init_serial );
2386 __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2387 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2388}
2389
2390/*!
Jim Cownie5e8470a2013-09-27 10:38:44 +00002391@param loc Source code location
2392@param gtid Global thread id
2393@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2394@param p_lb Pointer to the lower bound for the next chunk of work
2395@param p_ub Pointer to the upper bound for the next chunk of work
2396@param p_st Pointer to the stride for the next chunk of work
2397@return one if there is work to be done, zero otherwise
2398
2399Get the next dynamically allocated chunk of work for this thread.
2400If there is no more work, then lb, ub and stride need not be modified.
2401*/
2402int
2403__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2404 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2405{
2406 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2407}
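/*
 A sketch of how a compiler might drive these entry points for a loop such as
 "#pragma omp for schedule(dynamic, 4)" over i = 0 .. N-1 (illustrative only;
 'loc', 'gtid', 'N' and 'body' stand for whatever the generated code already has):

     kmp_int32 lb, ub, st, last;
     __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked, 0, N - 1, 1, 4 );
     while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
         for ( kmp_int32 i = lb; i <= ub; i += st )
             body( i );
     }
*/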
2408
2409/*!
2410See @ref __kmpc_dispatch_next_4
2411*/
2412int
2413__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2414 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2415{
2416 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2417}
2418
2419/*!
2420See @ref __kmpc_dispatch_next_4
2421*/
2422int
2423__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2424 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2425{
2426 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2427}
2428
2429/*!
2430See @ref __kmpc_dispatch_next_4
2431*/
2432int
2433__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2434 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2435{
2436 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2437}
2438
2439/*!
2440@param loc Source code location
2441@param gtid Global thread id
2442
2443Mark the end of a dynamic loop.
2444*/
2445void
2446__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2447{
2448 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2449}
2450
2451/*!
2452See @ref __kmpc_dispatch_fini_4
2453*/
2454void
2455__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2456{
2457 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2458}
2459
2460/*!
2461See @ref __kmpc_dispatch_fini_4
2462*/
2463void
2464__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2465{
2466 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2467}
2468
2469/*!
2470See @ref __kmpc_dispatch_fini_4
2471*/
2472void
2473__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2474{
2475 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2476}
2477/*! @} */
2478
2479//-----------------------------------------------------------------------------------------
2480// Non-template routines from kmp_dispatch.cpp used in other sources
2481
2482kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2483 return value == checker;
2484}
2485
2486kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2487 return value != checker;
2488}
2489
2490kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2491 return value < checker;
2492}
2493
2494kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2495 return value >= checker;
2496}
2497
2498kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2499 return value <= checker;
2500}
2501kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2502 return value == checker;
2503}
2504
2505kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2506 return value != checker;
2507}
2508
2509kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2510 return value < checker;
2511}
2512
2513kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2514 return value >= checker;
2515}
2516
2517kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2518 return value <= checker;
2519}
2520
2521kmp_uint32
2522__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2523 kmp_uint32 checker,
2524 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2525 , void * obj // Higher-level synchronization object, or NULL.
2526 )
2527{
2528 // note: we may not belong to a team at this point
2529 register volatile kmp_uint32 * spin = spinner;
2530 register kmp_uint32 check = checker;
2531 register kmp_uint32 spins;
2532 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2533 register kmp_uint32 r;
2534
2535 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2536 KMP_INIT_YIELD( spins );
2537 // main wait spin loop
2538 while(!f(r = TCR_4(*spin), check)) {
2539 KMP_FSYNC_SPIN_PREPARE( obj );
2540 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2541 It causes problems with infinite recursion because of exit lock */
2542 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2543 __kmp_abort_thread(); */
2544
Jim Cownie5e8470a2013-09-27 10:38:44 +00002545 /* if we have waited a bit, or are oversubscribed, yield */
2546 /* pause is in the following code */
2547 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2548 KMP_YIELD_SPIN( spins );
2549 }
2550 KMP_FSYNC_SPIN_ACQUIRED( obj );
2551 return r;
2552}
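/*
 Typical use (sketch): spin, with yields, until a shared 32-bit location reaches
 an expected value, for example waiting for a dispatch buffer index to come
 around before reusing the buffer:

     __kmp_wait_yield_4( &sh->buffer_index, my_buffer_index, __kmp_eq_4, NULL );

 ('sh' and 'my_buffer_index' are illustrative names only.)
*/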
2553
2554kmp_uint64
2555__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2556 kmp_uint64 checker,
2557 kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2558 , void * obj // Higher-level synchronization object, or NULL.
2559 )
2560{
2561 // note: we may not belong to a team at this point
2562 register volatile kmp_uint64 * spin = spinner;
2563 register kmp_uint64 check = checker;
2564 register kmp_uint32 spins;
2565 register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2566 register kmp_uint64 r;
2567
2568 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2569 KMP_INIT_YIELD( spins );
2570 // main wait spin loop
2571 while(!f(r = *spin, check))
2572 {
2573 KMP_FSYNC_SPIN_PREPARE( obj );
2574 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2575 It causes problems with infinite recursion because of exit lock */
2576 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2577 __kmp_abort_thread(); */
2578
Jim Cownie5e8470a2013-09-27 10:38:44 +00002579 // if we are oversubscribed,
2580        // or have waited a bit (and KMP_LIBRARY=throughput), then yield;
2581        // the pause is in the following code
2582 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2583 KMP_YIELD_SPIN( spins );
2584 }
2585 KMP_FSYNC_SPIN_ACQUIRED( obj );
2586 return r;
2587}
2588
2589} // extern "C"
2590
2591#ifdef KMP_GOMP_COMPAT
2592
2593void
2594__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2595 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2596 kmp_int32 chunk, int push_ws )
2597{
2598 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2599 push_ws );
2600}
2601
2602void
2603__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2604 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2605 kmp_int32 chunk, int push_ws )
2606{
2607 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2608 push_ws );
2609}
2610
2611void
2612__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2613 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2614 kmp_int64 chunk, int push_ws )
2615{
2616 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2617 push_ws );
2618}
2619
2620void
2621__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2622 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2623 kmp_int64 chunk, int push_ws )
2624{
2625 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2626 push_ws );
2627}
2628
2629void
2630__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2631{
2632 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2633}
2634
2635void
2636__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2637{
2638 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2639}
2640
2641void
2642__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2643{
2644 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2645}
2646
2647void
2648__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2649{
2650 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2651}
2652
2653#endif /* KMP_GOMP_COMPAT */
2654
2655/* ------------------------------------------------------------------------ */
2656/* ------------------------------------------------------------------------ */
2657