1/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3 */
4
5
6//===----------------------------------------------------------------------===//
7//
8// The LLVM Compiler Infrastructure
9//
10// This file is dual licensed under the MIT and the University of Illinois Open
11// Source Licenses. See LICENSE.txt for details.
12//
13//===----------------------------------------------------------------------===//
14
15
16/*
17 * Dynamic scheduling initialization and dispatch.
18 *
19 * NOTE: __kmp_nth is a constant inside any dispatch loop; however,
20 * it may change between parallel regions. __kmp_max_nth
21 * is the largest value __kmp_nth may take, 1 is the smallest.
22 *
23 */
24
25/* ------------------------------------------------------------------------ */
26/* ------------------------------------------------------------------------ */
27
28#include "kmp.h"
29#include "kmp_i18n.h"
30#include "kmp_itt.h"
31#include "kmp_str.h"
32#include "kmp_error.h"
33#include "kmp_stats.h"
34#if KMP_OS_WINDOWS && KMP_ARCH_X86
35 #include <float.h>
36#endif
37
38/* ------------------------------------------------------------------------ */
39/* ------------------------------------------------------------------------ */
40
41// template for type limits
42template< typename T >
43struct i_maxmin {
44 static const T mx;
45 static const T mn;
46};
47template<>
48struct i_maxmin< int > {
49 static const int mx = 0x7fffffff;
50 static const int mn = 0x80000000;
51};
52template<>
53struct i_maxmin< unsigned int > {
54 static const unsigned int mx = 0xffffffff;
55 static const unsigned int mn = 0x00000000;
56};
57template<>
58struct i_maxmin< long long > {
59 static const long long mx = 0x7fffffffffffffffLL;
60 static const long long mn = 0x8000000000000000LL;
61};
62template<>
63struct i_maxmin< unsigned long long > {
64 static const unsigned long long mx = 0xffffffffffffffffLL;
65 static const unsigned long long mn = 0x0000000000000000LL;
66};
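// Illustrative reading of the specializations above: i_maxmin< int >::mx is INT_MAX (0x7fffffff)
// and i_maxmin< int >::mn is INT_MIN (0x80000000); the 64-bit specializations follow the same pattern.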
67//-------------------------------------------------------------------------
68
69#ifdef KMP_STATIC_STEAL_ENABLED
70
71 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
72 template< typename T >
73 struct dispatch_private_infoXX_template {
74 typedef typename traits_t< T >::unsigned_t UT;
75 typedef typename traits_t< T >::signed_t ST;
76 UT count; // unsigned
77 T ub;
78 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
79 T lb;
80 ST st; // signed
81 UT tc; // unsigned
82 T static_steal_counter; // for static_steal only; maybe better to put after ub
83
84 /* parm[1-4] are used in different ways by different scheduling algorithms */
85
86 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
87 // a) parm3 is properly aligned and
88 // b) all parm1-4 are in the same cache line.
89 // Because parm1-4 are used together, performance seems to be better
90 // if they are in the same cache line (not measured, though).
91
92 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
93 T parm1;
94 T parm2;
95 T parm3;
96 T parm4;
97 };
98
99 UT ordered_lower; // unsigned
100 UT ordered_upper; // unsigned
101 #if KMP_OS_WINDOWS
102 T last_upper;
103 #endif /* KMP_OS_WINDOWS */
104 };
105
106#else /* KMP_STATIC_STEAL_ENABLED */
107
108 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
109 template< typename T >
110 struct dispatch_private_infoXX_template {
111 typedef typename traits_t< T >::unsigned_t UT;
112 typedef typename traits_t< T >::signed_t ST;
113 T lb;
114 T ub;
115 ST st; // signed
116 UT tc; // unsigned
117
118 T parm1;
119 T parm2;
120 T parm3;
121 T parm4;
122
123 UT count; // unsigned
124
125 UT ordered_lower; // unsigned
126 UT ordered_upper; // unsigned
127 #if KMP_OS_WINDOWS
128 T last_upper;
129 #endif /* KMP_OS_WINDOWS */
130 };
131
132#endif /* KMP_STATIC_STEAL_ENABLED */
133
134// replaces dispatch_private_info structure and dispatch_private_info_t type
135template< typename T >
136struct KMP_ALIGN_CACHE dispatch_private_info_template {
137 // duplicate alignment here, otherwise size of structure is not correct in our compiler
138 union KMP_ALIGN_CACHE private_info_tmpl {
139 dispatch_private_infoXX_template< T > p;
140 dispatch_private_info64_t p64;
141 } u;
142 enum sched_type schedule; /* scheduling algorithm */
143 kmp_uint32 ordered; /* ordered clause specified */
144 kmp_uint32 ordered_bumped;
145 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
146 dispatch_private_info * next; /* stack of buffers for nest of serial regions */
147 kmp_uint32 nomerge; /* don't merge iters if serialized */
148 kmp_uint32 type_size;
149 enum cons_type pushed_ws;
150};
151
152
153// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
154template< typename UT >
155struct dispatch_shared_infoXX_template {
156 /* chunk index under dynamic, number of idle threads under static-steal;
157 iteration index otherwise */
158 volatile UT iteration;
159 volatile UT num_done;
160 volatile UT ordered_iteration;
161 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
162};
163
164// replaces dispatch_shared_info structure and dispatch_shared_info_t type
165template< typename UT >
166struct dispatch_shared_info_template {
167 // we need union here to keep the structure size
168 union shared_info_tmpl {
169 dispatch_shared_infoXX_template< UT > s;
170 dispatch_shared_info64_t s64;
171 } u;
172 volatile kmp_uint32 buffer_index;
173};
174
175/* ------------------------------------------------------------------------ */
176/* ------------------------------------------------------------------------ */
177
178#undef USE_TEST_LOCKS
179
180// test_then_add template (general template should NOT be used)
181template< typename T >
182static __forceinline T
183test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
184
185template<>
186__forceinline kmp_int32
187test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
188{
189 kmp_int32 r;
190 r = KMP_TEST_THEN_ADD32( p, d );
191 return r;
192}
193
194template<>
195__forceinline kmp_int64
196test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
197{
198 kmp_int64 r;
199 r = KMP_TEST_THEN_ADD64( p, d );
200 return r;
201}
202
203// test_then_inc_acq template (general template should NOT be used)
204template< typename T >
205static __forceinline T
206test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
207
208template<>
209__forceinline kmp_int32
210test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
211{
212 kmp_int32 r;
213 r = KMP_TEST_THEN_INC_ACQ32( p );
214 return r;
215}
216
217template<>
218__forceinline kmp_int64
219test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
220{
221 kmp_int64 r;
222 r = KMP_TEST_THEN_INC_ACQ64( p );
223 return r;
224}
225
226// test_then_inc template (general template should NOT be used)
227template< typename T >
228static __forceinline T
229test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
230
231template<>
232__forceinline kmp_int32
233test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
234{
235 kmp_int32 r;
236 r = KMP_TEST_THEN_INC32( p );
237 return r;
238}
239
240template<>
241__forceinline kmp_int64
242test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
243{
244 kmp_int64 r;
245 r = KMP_TEST_THEN_INC64( p );
246 return r;
247}
248
249// compare_and_swap template (general template should NOT be used)
250template< typename T >
251static __forceinline kmp_int32
252compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
253
254template<>
255__forceinline kmp_int32
256compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
257{
258 return KMP_COMPARE_AND_STORE_REL32( p, c, s );
259}
260
261template<>
262__forceinline kmp_int32
263compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
264{
265 return KMP_COMPARE_AND_STORE_REL64( p, c, s );
266}
267
268/*
269 Spin wait loop that first does pause, then yield.
270 Waits until function returns non-zero when called with *spinner and check.
271 Does NOT put threads to sleep.
272#if USE_ITT_BUILD
273 Arguments:
274 obj -- is a higher-level synchronization object to report to ittnotify. It is used to report
275 locks consistently. For example, if a lock is acquired immediately, its address is
276 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
277 immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
278 address, not the address of the low-level spinner.
279#endif // USE_ITT_BUILD
280*/
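// Usage sketch (mirrors the call in __kmp_dispatch_deo below): a thread waiting for its turn in an
// ordered section calls
//     __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
//                             USE_ITT_BUILD_ARG( NULL ) );
// which spins (pause, then yield) until sh->u.s.ordered_iteration >= lower, then returns the last
// value it observed; the waiting thread is never put to sleep.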
281template< typename UT >
282// ToDo: make inline function (move to header file for icl)
283static UT // unsigned 4- or 8-byte type
284__kmp_wait_yield( volatile UT * spinner,
285 UT checker,
286 kmp_uint32 (* pred)( UT, UT )
287 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
288 )
289{
290 // note: we may not belong to a team at this point
291 register volatile UT * spin = spinner;
292 register UT check = checker;
293 register kmp_uint32 spins;
294 register kmp_uint32 (*f) ( UT, UT ) = pred;
295 register UT r;
296
297 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
298 KMP_INIT_YIELD( spins );
299 // main wait spin loop
300 while(!f(r = *spin, check))
301 {
302 KMP_FSYNC_SPIN_PREPARE( obj );
303 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
304 It causes problems with infinite recursion because of exit lock */
305 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
306 __kmp_abort_thread(); */
307
308 // if we are oversubscribed,
309 // or have waited a bit (and KMP_LIBRARY=throughput), then yield;
310 // pause is in the following code
311 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
312 KMP_YIELD_SPIN( spins );
313 }
314 KMP_FSYNC_SPIN_ACQUIRED( obj );
315 return r;
316}
317
318template< typename UT >
319static kmp_uint32 __kmp_eq( UT value, UT checker) {
320 return value == checker;
321}
322
323template< typename UT >
324static kmp_uint32 __kmp_neq( UT value, UT checker) {
325 return value != checker;
326}
327
328template< typename UT >
329static kmp_uint32 __kmp_lt( UT value, UT checker) {
330 return value < checker;
331}
332
333template< typename UT >
334static kmp_uint32 __kmp_ge( UT value, UT checker) {
335 return value >= checker;
336}
337
338template< typename UT >
339static kmp_uint32 __kmp_le( UT value, UT checker) {
340 return value <= checker;
341}
342
343
344/* ------------------------------------------------------------------------ */
345/* ------------------------------------------------------------------------ */
346
347static void
348__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
349{
350 kmp_info_t *th;
351
352 KMP_DEBUG_ASSERT( gtid_ref );
353
354 if ( __kmp_env_consistency_check ) {
355 th = __kmp_threads[*gtid_ref];
356 if ( th -> th.th_root -> r.r_active
357 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
358#if KMP_USE_DYNAMIC_LOCK
359 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0 );
360#else
361 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
362#endif
363 }
364 }
365}
366
367template< typename UT >
368static void
369__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
370{
371 typedef typename traits_t< UT >::signed_t ST;
372 dispatch_private_info_template< UT > * pr;
373
374 int gtid = *gtid_ref;
375// int cid = *cid_ref;
376 kmp_info_t *th = __kmp_threads[ gtid ];
377 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
378
379 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
380 if ( __kmp_env_consistency_check ) {
381 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
382 ( th -> th.th_dispatch -> th_dispatch_pr_current );
383 if ( pr -> pushed_ws != ct_none ) {
384#if KMP_USE_DYNAMIC_LOCK
385 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL, 0 );
386#else
387 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
388#endif
389 }
390 }
391
392 if ( ! th -> th.th_team -> t.t_serialized ) {
393 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
394 ( th -> th.th_dispatch -> th_dispatch_sh_current );
395 UT lower;
396
397 if ( ! __kmp_env_consistency_check ) {
398 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
399 ( th -> th.th_dispatch -> th_dispatch_pr_current );
400 }
401 lower = pr->u.p.ordered_lower;
402
403 #if ! defined( KMP_GOMP_COMPAT )
404 if ( __kmp_env_consistency_check ) {
405 if ( pr->ordered_bumped ) {
406 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
407 __kmp_error_construct2(
408 kmp_i18n_msg_CnsMultipleNesting,
409 ct_ordered_in_pdo, loc_ref,
410 & p->stack_data[ p->w_top ]
411 );
412 }
413 }
414 #endif /* !defined(KMP_GOMP_COMPAT) */
415
416 KMP_MB();
417 #ifdef KMP_DEBUG
418 {
419 const char * buff;
420 // create format specifiers before the debug output
421 buff = __kmp_str_format(
422 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
423 traits_t< UT >::spec, traits_t< UT >::spec );
424 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
425 __kmp_str_free( &buff );
426 }
427 #endif
428
429 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
430 USE_ITT_BUILD_ARG( NULL )
431 );
432 KMP_MB(); /* is this necessary? */
433 #ifdef KMP_DEBUG
434 {
435 const char * buff;
436 // create format specifiers before the debug output
437 buff = __kmp_str_format(
438 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
439 traits_t< UT >::spec, traits_t< UT >::spec );
440 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
441 __kmp_str_free( &buff );
442 }
443 #endif
444 }
445 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
446}
447
448static void
449__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
450{
451 kmp_info_t *th;
452
453 if ( __kmp_env_consistency_check ) {
454 th = __kmp_threads[*gtid_ref];
455 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
456 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
457 }
458 }
459}
460
461template< typename UT >
462static void
463__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
464{
465 typedef typename traits_t< UT >::signed_t ST;
466 dispatch_private_info_template< UT > * pr;
467
468 int gtid = *gtid_ref;
469// int cid = *cid_ref;
470 kmp_info_t *th = __kmp_threads[ gtid ];
471 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
472
473 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
474 if ( __kmp_env_consistency_check ) {
475 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
476 ( th -> th.th_dispatch -> th_dispatch_pr_current );
477 if ( pr -> pushed_ws != ct_none ) {
478 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
479 }
480 }
481
482 if ( ! th -> th.th_team -> t.t_serialized ) {
483 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
484 ( th -> th.th_dispatch -> th_dispatch_sh_current );
485
486 if ( ! __kmp_env_consistency_check ) {
487 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
488 ( th -> th.th_dispatch -> th_dispatch_pr_current );
489 }
490
491 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
492 #if ! defined( KMP_GOMP_COMPAT )
493 if ( __kmp_env_consistency_check ) {
494 if ( pr->ordered_bumped != 0 ) {
495 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
496 /* How to test it? - OM */
497 __kmp_error_construct2(
498 kmp_i18n_msg_CnsMultipleNesting,
499 ct_ordered_in_pdo, loc_ref,
500 & p->stack_data[ p->w_top ]
501 );
502 }
503 }
504 #endif /* !defined(KMP_GOMP_COMPAT) */
505
506 KMP_MB(); /* Flush all pending memory write invalidates. */
507
508 pr->ordered_bumped += 1;
509
510 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
511 gtid, pr->ordered_bumped ) );
512
513 KMP_MB(); /* Flush all pending memory write invalidates. */
514
515 /* TODO use general release procedure? */
516 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
517
518 KMP_MB(); /* Flush all pending memory write invalidates. */
519 }
520 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
521}
522
523/* Computes and returns x to the power of y, where y must be a non-negative integer */
524template< typename UT >
525static __forceinline long double
526__kmp_pow(long double x, UT y) {
527 long double s=1.0L;
528
529 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
530 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
531 while(y) {
532 if ( y & 1 )
533 s *= x;
534 x *= x;
535 y >>= 1;
536 }
537 return s;
538}
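// Worked example (illustrative values only): __kmp_pow< kmp_uint32 >(0.5, 5) runs the
// square-and-multiply loop as y=5: s=0.5, x=0.25; y=2: x=0.0625; y=1: s=0.5*0.0625=0.03125,
// i.e. it returns 0.03125 == 0.5^5.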
539
540/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
541 (the total number of unassigned iterations in chunks with index greater than or equal to idx).
542 __forceinline seems to be broken here: if we __forceinline this function, the behavior is wrong
543 (one of the unit tests, sch_guided_analytical_basic.cpp, fails).
544*/
545template< typename T >
546static __inline typename traits_t< T >::unsigned_t
547__kmp_dispatch_guided_remaining(
548 T tc,
549 typename traits_t< T >::floating_t base,
550 typename traits_t< T >::unsigned_t idx
551) {
552 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
553 least for ICL 8.1, long double arithmetic may not really have
554 long double precision, even with /Qlong_double. Currently, we
555 workaround that in the caller code, by manipulating the FPCW for
556 Windows* OS on IA-32 architecture. The lack of precision is not
557 expected to be a correctness issue, though.
558 */
559 typedef typename traits_t< T >::unsigned_t UT;
560
561 long double x = tc * __kmp_pow< UT >(base, idx);
562 UT r = (UT) x;
563 if ( x == r )
564 return r;
565 return r + 1;
566}
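// Worked example (hypothetical values): for a guided loop with tc = 1000 iterations, nproc = 4
// (so base = 1 - 0.5/nproc = 0.875) and idx = 8 chunks already handed out,
// __kmp_dispatch_guided_remaining returns ceil(1000 * 0.875^8) = ceil(343.6) = 344 iterations left.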
567
568// Parameters of the guided-iterative algorithm:
569// p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
570// p3 = 1 / ( n * nproc ) // remaining iterations multiplier
571// By default n = 2. For example, with n = 3 the chunk distribution will be flatter.
572// With n = 1 the first chunk is the same as for a static schedule, i.e. trip / nproc.
573static int guided_int_param = 2;
574static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
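// Worked example (hypothetical values): with the default n = 2, nproc = 4 and chunk = 7,
// p2 = 2 * 4 * (7 + 1) = 64 is the remaining-iteration count at which the schedule switches to
// dynamic, and p3 = 0.5 / 4 = 0.125 scales the remaining iterations into the next chunk size.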
575
576// UT - unsigned flavor of T, ST - signed flavor of T,
577// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
578template< typename T >
579static void
580__kmp_dispatch_init(
581 ident_t * loc,
582 int gtid,
583 enum sched_type schedule,
584 T lb,
585 T ub,
586 typename traits_t< T >::signed_t st,
587 typename traits_t< T >::signed_t chunk,
588 int push_ws
589) {
590 typedef typename traits_t< T >::unsigned_t UT;
591 typedef typename traits_t< T >::signed_t ST;
592 typedef typename traits_t< T >::floating_t DBL;
593 static const int ___kmp_size_type = sizeof( UT );
594
595 int active;
596 T tc;
597 kmp_info_t * th;
598 kmp_team_t * team;
599 kmp_uint32 my_buffer_index;
600 dispatch_private_info_template< T > * pr;
601 dispatch_shared_info_template< UT > volatile * sh;
602
603 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
604 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
605
606 if ( ! TCR_4( __kmp_init_parallel ) )
607 __kmp_parallel_initialize();
608
609#if INCLUDE_SSC_MARKS
610 SSC_MARK_DISPATCH_INIT();
611#endif
612 #ifdef KMP_DEBUG
613 {
614 const char * buff;
615 // create format specifiers before the debug output
616 buff = __kmp_str_format(
617 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
618 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
619 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
620 __kmp_str_free( &buff );
621 }
622 #endif
623 /* setup data */
624 th = __kmp_threads[ gtid ];
625 team = th -> th.th_team;
626 active = ! team -> t.t_serialized;
627 th->th.th_ident = loc;
628
629#if USE_ITT_BUILD
630 kmp_uint64 cur_chunk = chunk;
631#endif
632 if ( ! active ) {
633 pr = reinterpret_cast< dispatch_private_info_template< T >* >
634 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
635 } else {
636 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
637 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
638
639 my_buffer_index = th->th.th_dispatch->th_disp_index ++;
640
641 /* What happens when number of threads changes, need to resize buffer? */
642 pr = reinterpret_cast< dispatch_private_info_template< T > * >
643 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
644 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
645 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
646 }
647
648 /* Pick up the nomerge/ordered bits from the scheduling type */
649 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
650 pr->nomerge = TRUE;
651 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
652 } else {
653 pr->nomerge = FALSE;
654 }
655 pr->type_size = ___kmp_size_type; // remember the size of variables
656 if ( kmp_ord_lower & schedule ) {
657 pr->ordered = TRUE;
658 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
659 } else {
660 pr->ordered = FALSE;
661 }
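// Illustrative note on the decoding above: a nomerge schedule kind in [kmp_nm_lower, kmp_nm_upper)
// is shifted down by (kmp_nm_lower - kmp_sch_lower), and an ordered kind is shifted down by
// (kmp_ord_lower - kmp_sch_lower), so that 'schedule' ends up holding the plain kmp_sch_* value.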
662 if ( schedule == kmp_sch_static ) {
663 schedule = __kmp_static;
664 } else {
665 if ( schedule == kmp_sch_runtime ) {
666 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
667 schedule = team -> t.t_sched.r_sched_type;
668 // Detail the schedule if needed (global controls are differentiated appropriately)
669 if ( schedule == kmp_sch_guided_chunked ) {
670 schedule = __kmp_guided;
671 } else if ( schedule == kmp_sch_static ) {
672 schedule = __kmp_static;
673 }
674 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
675 chunk = team -> t.t_sched.chunk;
676
677 #ifdef KMP_DEBUG
678 {
679 const char * buff;
680 // create format specifiers before the debug output
681 buff = __kmp_str_format(
682 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
683 traits_t< ST >::spec );
684 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
685 __kmp_str_free( &buff );
686 }
687 #endif
688 } else {
689 if ( schedule == kmp_sch_guided_chunked ) {
690 schedule = __kmp_guided;
691 }
692 if ( chunk <= 0 ) {
693 chunk = KMP_DEFAULT_CHUNK;
694 }
695 }
696
697 if ( schedule == kmp_sch_auto ) {
698 // mapping and differentiation: in the __kmp_do_serial_initialize()
699 schedule = __kmp_auto;
700 #ifdef KMP_DEBUG
701 {
702 const char * buff;
703 // create format specifiers before the debug output
704 buff = __kmp_str_format(
705 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
706 traits_t< ST >::spec );
707 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
708 __kmp_str_free( &buff );
709 }
710 #endif
711 }
712
713 /* guided analytical not safe for too many threads */
714 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
715 schedule = kmp_sch_guided_iterative_chunked;
716 KMP_WARNING( DispatchManyThreads );
717 }
718 pr->u.p.parm1 = chunk;
719 }
720 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
721 "unknown scheduling type" );
722
723 pr->u.p.count = 0;
724
725 if ( __kmp_env_consistency_check ) {
726 if ( st == 0 ) {
727 __kmp_error_construct(
728 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
729 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
730 );
731 }
732 }
733
734 tc = ( ub - lb + st );
735 if ( st != 1 ) {
736 if ( st < 0 ) {
737 if ( lb < ub ) {
738 tc = 0; // zero-trip
739 } else { // lb >= ub
740 tc = (ST)tc / st; // convert to signed division
741 }
742 } else { // st > 0
743 if ( ub < lb ) {
744 tc = 0; // zero-trip
745 } else { // lb >= ub
746 tc /= st;
747 }
748 }
749 } else if ( ub < lb ) { // st == 1
750 tc = 0; // zero-trip
751 }
752
753 pr->u.p.lb = lb;
754 pr->u.p.ub = ub;
755 pr->u.p.st = st;
756 pr->u.p.tc = tc;
757
758 #if KMP_OS_WINDOWS
759 pr->u.p.last_upper = ub + st;
760 #endif /* KMP_OS_WINDOWS */
761
762 /* NOTE: only the active parallel region(s) have active ordered sections */
763
764 if ( active ) {
765 if ( pr->ordered == 0 ) {
766 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
767 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
768 } else {
769 pr->ordered_bumped = 0;
770
771 pr->u.p.ordered_lower = 1;
772 pr->u.p.ordered_upper = 0;
773
774 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
775 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
776 }
777 }
778
779 if ( __kmp_env_consistency_check ) {
780 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
781 if ( push_ws ) {
782 __kmp_push_workshare( gtid, ws, loc );
783 pr->pushed_ws = ws;
784 } else {
785 __kmp_check_workshare( gtid, ws, loc );
786 pr->pushed_ws = ct_none;
787 }
788 }
789
790 switch ( schedule ) {
791 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
792 case kmp_sch_static_steal:
793 {
794 T nproc = team->t.t_nproc;
795 T ntc, init;
796
797 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
798
799 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
800 if ( nproc > 1 && ntc >= nproc ) {
801 T id = __kmp_tid_from_gtid(gtid);
802 T small_chunk, extras;
803
804 small_chunk = ntc / nproc;
805 extras = ntc % nproc;
806
807 init = id * small_chunk + ( id < extras ? id : extras );
808 pr->u.p.count = init;
809 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
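// Worked example (hypothetical values): ntc = 10 chunks and nproc = 4 threads give
// small_chunk = 2 and extras = 2, so thread ids 0..3 start out owning the chunk
// ranges [0,3), [3,6), [6,8) and [8,10) via count = init and ub computed above.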
810
811 pr->u.p.parm2 = lb;
812 //pr->pfields.parm3 = 0; // it's not used in static_steal
813 pr->u.p.parm4 = id;
814 pr->u.p.st = st;
815 break;
816 } else {
817 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
818 gtid ) );
819 schedule = kmp_sch_static_balanced;
820 /* too few iterations: fall-through to kmp_sch_static_balanced */
821 } // if
822 /* FALL-THROUGH to static balanced */
823 } // case
824 #endif
825 case kmp_sch_static_balanced:
826 {
827 T nproc = team->t.t_nproc;
828 T init, limit;
829
830 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
831 gtid ) );
832
833 if ( nproc > 1 ) {
834 T id = __kmp_tid_from_gtid(gtid);
835
836 if ( tc < nproc ) {
837 if ( id < tc ) {
838 init = id;
839 limit = id;
840 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
841 } else {
842 pr->u.p.count = 1; /* means no more chunks to execute */
843 pr->u.p.parm1 = FALSE;
844 break;
845 }
846 } else {
847 T small_chunk = tc / nproc;
848 T extras = tc % nproc;
849 init = id * small_chunk + (id < extras ? id : extras);
850 limit = init + small_chunk - (id < extras ? 0 : 1);
851 pr->u.p.parm1 = (id == nproc - 1);
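// Worked example (hypothetical values): tc = 10 iterations and nproc = 4 threads give
// small_chunk = 2 and extras = 2, so threads 0..3 get [init,limit] = [0,2], [3,5], [6,7] and [8,9]
// (3, 3, 2 and 2 iterations), and only thread nproc-1 = 3 sets parm1 (the *plastiter flag).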
852 }
853 } else {
854 if ( tc > 0 ) {
855 init = 0;
856 limit = tc - 1;
857 pr->u.p.parm1 = TRUE;
858 } else {
859 // zero trip count
860 pr->u.p.count = 1; /* means no more chunks to execute */
861 pr->u.p.parm1 = FALSE;
862 break;
863 }
864 }
865#if USE_ITT_BUILD
866 // Calculate chunk for metadata report
867 if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) {
868 cur_chunk = limit - init + 1;
869 }
870#endif
871 if ( st == 1 ) {
872 pr->u.p.lb = lb + init;
873 pr->u.p.ub = lb + limit;
874 } else {
875 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
876 pr->u.p.lb = lb + init * st;
877 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
878 if ( st > 0 ) {
879 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
880 } else {
881 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
882 }
883 }
884 if ( pr->ordered ) {
885 pr->u.p.ordered_lower = init;
886 pr->u.p.ordered_upper = limit;
887 }
888 break;
889 } // case
890 case kmp_sch_guided_iterative_chunked :
891 {
892 T nproc = team->t.t_nproc;
893 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
894
895 if ( nproc > 1 ) {
896 if ( (2L * chunk + 1 ) * nproc >= tc ) {
897 /* chunk size too large, switch to dynamic */
898 schedule = kmp_sch_dynamic_chunked;
899 } else {
900 // when remaining iters become less than parm2 - switch to dynamic
901 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
902 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
903 }
904 } else {
905 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
906 schedule = kmp_sch_static_greedy;
907 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
908 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
909 pr->u.p.parm1 = tc;
910 } // if
911 } // case
912 break;
913 case kmp_sch_guided_analytical_chunked:
914 {
915 T nproc = team->t.t_nproc;
916 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
917
918 if ( nproc > 1 ) {
919 if ( (2L * chunk + 1 ) * nproc >= tc ) {
920 /* chunk size too large, switch to dynamic */
921 schedule = kmp_sch_dynamic_chunked;
922 } else {
923 /* commonly used term: (2 nproc - 1)/(2 nproc) */
924 DBL x;
925
926 #if KMP_OS_WINDOWS && KMP_ARCH_X86
927 /* Linux* OS already has 64-bit computation by default for
928 long double, and on Windows* OS on Intel(R) 64,
929 /Qlong_double doesn't work. On Windows* OS
930 on IA-32 architecture, we need to set precision to
931 64-bit instead of the default 53-bit. Even though long
932 double doesn't work on Windows* OS on Intel(R) 64, the
933 resulting lack of precision is not expected to impact
934 the correctness of the algorithm, but this has not been
935 mathematically proven.
936 */
937 // save original FPCW and set precision to 64-bit, as
938 // Windows* OS on IA-32 architecture defaults to 53-bit
939 unsigned int oldFpcw = _control87(0,0);
940 _control87(_PC_64,_MCW_PC); // 0,0x30000
941 #endif
942 /* value used for comparison in solver for cross-over point */
943 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
944
945 /* crossover point--chunk indexes equal to or greater than
946 this point switch to dynamic-style scheduling */
947 UT cross;
948
949 /* commonly used term: (2 nproc - 1)/(2 nproc) */
950 x = (long double)1.0 - (long double)0.5 / nproc;
951
952 #ifdef KMP_DEBUG
953 { // test natural alignment
954 struct _test_a {
955 char a;
956 union {
957 char b;
958 DBL d;
959 };
960 } t;
961 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
962 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
963 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
964 }
965 #endif // KMP_DEBUG
966
967 /* save the term in thread private dispatch structure */
968 *(DBL*)&pr->u.p.parm3 = x;
969
970 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
971 {
972 UT left, right, mid;
973 long double p;
974
975 /* estimate initial upper and lower bound */
976
977 /* doesn't matter what value right is as long as it is positive, but
978 it affects performance of the solver
979 */
980 right = 229;
981 p = __kmp_pow< UT >(x,right);
982 if ( p > target ) {
983 do{
984 p *= p;
985 right <<= 1;
986 } while(p>target && right < (1<<27));
987 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
988 } else {
989 left = 0;
990 }
991
992 /* bisection root-finding method */
993 while ( left + 1 < right ) {
994 mid = (left + right) / 2;
995 if ( __kmp_pow< UT >(x,mid) > target ) {
996 left = mid;
997 } else {
998 right = mid;
999 }
1000 } // while
1001 cross = right;
1002 }
1003 /* assert sanity of computed crossover point */
1004 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
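/* Worked example (hypothetical values): nproc = 4 and chunk = 10 on tc = 1000 iterations give
   x = 0.875 and target = (2*10 + 1) * 4 / 1000 = 0.084; the bisection above finds cross = 19,
   the smallest index with 0.875^cross <= target (0.875^18 ~ 0.090 > 0.084, 0.875^19 ~ 0.079). */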
1005
1006 /* save the crossover point in thread private dispatch structure */
1007 pr->u.p.parm2 = cross;
1008
1009 // C75803
1010 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1011 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1012 #else
1013 #define GUIDED_ANALYTICAL_WORKAROUND (x)
1014 #endif
1015 /* dynamic-style scheduling offset */
1016 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1017 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1018 // restore FPCW
1019 _control87(oldFpcw,_MCW_PC);
1020 #endif
1021 } // if
1022 } else {
1023 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1024 gtid ) );
1025 schedule = kmp_sch_static_greedy;
1026 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1027 pr->u.p.parm1 = tc;
1028 } // if
1029 } // case
1030 break;
1031 case kmp_sch_static_greedy:
1032 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1033 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1034 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1035 tc;
1036 break;
1037 case kmp_sch_static_chunked :
1038 case kmp_sch_dynamic_chunked :
1039 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1040 break;
1041 case kmp_sch_trapezoidal :
1042 {
1043 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
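/* Worked example of the computation below (hypothetical values): tc = 1000, nproc = 4 and
   chunk = 10 give parm2 (first cycle) = 1000/8 = 125, parm1 (last cycle) = 10,
   parm3 (number of cycles) = (2000 + 134)/135 = 15 and parm4 (decrement) = (125 - 10)/14 = 8,
   so successive chunk sizes are roughly 125, 117, 109, ... down to about 13. */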
1044
1045 T parm1, parm2, parm3, parm4;
1046 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1047
1048 parm1 = chunk;
1049
1050 /* F : size of the first cycle */
1051 parm2 = ( tc / (2 * team->t.t_nproc) );
1052
1053 if ( parm2 < 1 ) {
1054 parm2 = 1;
1055 }
1056
1057 /* L : size of the last cycle. Make sure the last cycle
1058 * is not larger than the first cycle.
1059 */
1060 if ( parm1 < 1 ) {
1061 parm1 = 1;
1062 } else if ( parm1 > parm2 ) {
1063 parm1 = parm2;
1064 }
1065
1066 /* N : number of cycles */
1067 parm3 = ( parm2 + parm1 );
1068 parm3 = ( 2 * tc + parm3 - 1) / parm3;
1069
1070 if ( parm3 < 2 ) {
1071 parm3 = 2;
1072 }
1073
1074 /* sigma : decreasing incr of the trapezoid */
1075 parm4 = ( parm3 - 1 );
1076 parm4 = ( parm2 - parm1 ) / parm4;
1077
1078 // pointless check, because parm4 >= 0 always
1079 //if ( parm4 < 0 ) {
1080 // parm4 = 0;
1081 //}
1082
1083 pr->u.p.parm1 = parm1;
1084 pr->u.p.parm2 = parm2;
1085 pr->u.p.parm3 = parm3;
1086 pr->u.p.parm4 = parm4;
1087 } // case
1088 break;
1089
1090 default:
1091 {
1092 __kmp_msg(
1093 kmp_ms_fatal, // Severity
1094 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1095 KMP_HNT( GetNewerLibrary ), // Hint
1096 __kmp_msg_null // Variadic argument list terminator
1097 );
1098 }
1099 break;
1100 } // switch
1101 pr->schedule = schedule;
1102 if ( active ) {
1103 /* This buffer is free to use once the shared buffer_index reaches my_buffer_index */
1104
1105 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1106 gtid, my_buffer_index, sh->buffer_index) );
1107 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1108 USE_ITT_BUILD_ARG( NULL )
1109 );
1110 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1111 // *always* 32-bit integers.
1112 KMP_MB(); /* is this necessary? */
1113 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1114 gtid, my_buffer_index, sh->buffer_index) );
1115
1116 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1117 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1118#if USE_ITT_BUILD
1119 if ( pr->ordered ) {
1120 __kmp_itt_ordered_init( gtid );
1121 }; // if
1122#endif /* USE_ITT_BUILD */
1123 }; // if
1124
1125#if USE_ITT_BUILD
1126 // Report loop metadata
1127 if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) {
1128 kmp_uint32 tid = __kmp_tid_from_gtid( gtid );
1129 if (KMP_MASTER_TID(tid)) {
1130 kmp_uint64 schedtype = 0;
1131
1132 switch ( schedule ) {
1133 case kmp_sch_static_chunked:
1134 case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1135 break;
1136 case kmp_sch_static_greedy:
1137 cur_chunk = pr->u.p.parm1;
1138 break;
1139 case kmp_sch_dynamic_chunked:
1140 schedtype = 1;
1141 break;
1142 case kmp_sch_guided_iterative_chunked:
1143 case kmp_sch_guided_analytical_chunked:
1144 schedtype = 2;
1145 break;
1146 default:
1147// Should we put this case under "static"?
1148// case kmp_sch_static_steal:
1149 schedtype = 3;
1150 break;
1151 }
1152 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1153 }
1154 }
1155#endif /* USE_ITT_BUILD */
1156
1157 #ifdef KMP_DEBUG
1158 {
1159 const char * buff;
1160 // create format specifiers before the debug output
1161 buff = __kmp_str_format(
1162 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1163 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1164 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1165 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1166 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1167 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1168 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1169 KD_TRACE(10, ( buff,
1170 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1171 pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1172 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1173 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1174 __kmp_str_free( &buff );
1175 }
1176 #endif
1177 #if ( KMP_STATIC_STEAL_ENABLED )
1178 if ( ___kmp_size_type < 8 ) {
1179 // It cannot be guaranteed that after execution of a loop with some other schedule kind
1180 // all the parm3 variables will contain the same value.
1181 // Even if all parm3 values were the same, there would still be a bad case, such as using 0 and 1
1182 // rather than a program-lifetime increment.
1183 // So a dedicated variable is required; the 'static_steal_counter' is used.
1184 if( schedule == kmp_sch_static_steal ) {
1185 // Other threads will inspect this variable when searching for a victim.
1186 // This is a flag showing that other threads may steal from this thread since then.
1187 volatile T * p = &pr->u.p.static_steal_counter;
1188 *p = *p + 1;
1189 }
1190 }
1191 #endif // ( KMP_STATIC_STEAL_ENABLED )
1192}
1193
1194/*
1195 * For ordered loops, either __kmp_dispatch_finish() should be called after
1196 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1197 * every chunk of iterations. If the ordered section(s) were not executed
1198 * for this iteration (or every iteration in this chunk), we need to set the
1199 * ordered iteration counters so that the next thread can proceed.
1200 */
1201template< typename UT >
1202static void
1203__kmp_dispatch_finish( int gtid, ident_t *loc )
1204{
1205 typedef typename traits_t< UT >::signed_t ST;
1206 kmp_info_t *th = __kmp_threads[ gtid ];
1207
1208 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1209 if ( ! th -> th.th_team -> t.t_serialized ) {
1210
1211 dispatch_private_info_template< UT > * pr =
1212 reinterpret_cast< dispatch_private_info_template< UT >* >
1213 ( th->th.th_dispatch->th_dispatch_pr_current );
1214 dispatch_shared_info_template< UT > volatile * sh =
1215 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1216 ( th->th.th_dispatch->th_dispatch_sh_current );
1217 KMP_DEBUG_ASSERT( pr );
1218 KMP_DEBUG_ASSERT( sh );
1219 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1220 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1221
1222 if ( pr->ordered_bumped ) {
1223 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1224 gtid ) );
1225 pr->ordered_bumped = 0;
1226 } else {
1227 UT lower = pr->u.p.ordered_lower;
1228
1229 #ifdef KMP_DEBUG
1230 {
1231 const char * buff;
1232 // create format specifiers before the debug output
1233 buff = __kmp_str_format(
1234 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1235 traits_t< UT >::spec, traits_t< UT >::spec );
1236 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1237 __kmp_str_free( &buff );
1238 }
1239 #endif
1240
1241 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1242 USE_ITT_BUILD_ARG(NULL)
1243 );
1244 KMP_MB(); /* is this necessary? */
1245 #ifdef KMP_DEBUG
1246 {
1247 const char * buff;
1248 // create format specifiers before the debug output
1249 buff = __kmp_str_format(
1250 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1251 traits_t< UT >::spec, traits_t< UT >::spec );
1252 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1253 __kmp_str_free( &buff );
1254 }
1255 #endif
1256
1257 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1258 } // if
1259 } // if
1260 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1261}
1262
1263#ifdef KMP_GOMP_COMPAT
1264
1265template< typename UT >
1266static void
1267__kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1268{
1269 typedef typename traits_t< UT >::signed_t ST;
1270 kmp_info_t *th = __kmp_threads[ gtid ];
1271
1272 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1273 if ( ! th -> th.th_team -> t.t_serialized ) {
1274// int cid;
1275 dispatch_private_info_template< UT > * pr =
1276 reinterpret_cast< dispatch_private_info_template< UT >* >
1277 ( th->th.th_dispatch->th_dispatch_pr_current );
1278 dispatch_shared_info_template< UT > volatile * sh =
1279 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1280 ( th->th.th_dispatch->th_dispatch_sh_current );
1281 KMP_DEBUG_ASSERT( pr );
1282 KMP_DEBUG_ASSERT( sh );
1283 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1284 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1285
1286// for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1287 UT lower = pr->u.p.ordered_lower;
1288 UT upper = pr->u.p.ordered_upper;
1289 UT inc = upper - lower + 1;
1290
1291 if ( pr->ordered_bumped == inc ) {
1292 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1293 gtid ) );
1294 pr->ordered_bumped = 0;
1295 } else {
1296 inc -= pr->ordered_bumped;
1297
1298 #ifdef KMP_DEBUG
1299 {
1300 const char * buff;
1301 // create format specifiers before the debug output
1302 buff = __kmp_str_format(
1303 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1304 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1305 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1306 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1307 __kmp_str_free( &buff );
1308 }
1309 #endif
1310
1311 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1312 USE_ITT_BUILD_ARG(NULL)
1313 );
1314
1315 KMP_MB(); /* is this necessary? */
1316 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1317 gtid ) );
1318 pr->ordered_bumped = 0;
1319//!!!!! TODO check if the inc should be unsigned, or signed???
1320 #ifdef KMP_DEBUG
1321 {
1322 const char * buff;
1323 // create format specifiers before the debug output
1324 buff = __kmp_str_format(
1325 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1326 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1327 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1328 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1329 __kmp_str_free( &buff );
1330 }
1331 #endif
1332
1333 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1334 }
1335// }
1336 }
1337 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1338}
1339
1340#endif /* KMP_GOMP_COMPAT */
1341
1342template< typename T >
1343static int
1344__kmp_dispatch_next(
1345 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1346) {
1347
1348 typedef typename traits_t< T >::unsigned_t UT;
1349 typedef typename traits_t< T >::signed_t ST;
1350 typedef typename traits_t< T >::floating_t DBL;
1351 static const int ___kmp_size_type = sizeof( UT );
1352
1353 int status;
1354 dispatch_private_info_template< T > * pr;
1355 kmp_info_t * th = __kmp_threads[ gtid ];
1356 kmp_team_t * team = th -> th.th_team;
1357
1358 KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st ); // AC: these cannot be NULL
1359 #ifdef KMP_DEBUG
1360 {
1361 const char * buff;
1362 // create format specifiers before the debug output
1363 buff = __kmp_str_format(
1364 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1365 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1366 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1367 __kmp_str_free( &buff );
1368 }
1369 #endif
1370
1371 if ( team -> t.t_serialized ) {
1372 /* NOTE: serialize this dispatch because we are not at the active level */
1373 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1374 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1375 KMP_DEBUG_ASSERT( pr );
1376
1377 if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1378 *p_lb = 0;
1379 *p_ub = 0;
1380// if ( p_last != NULL )
1381// *p_last = 0;
1382 if ( p_st != NULL )
1383 *p_st = 0;
1384 if ( __kmp_env_consistency_check ) {
1385 if ( pr->pushed_ws != ct_none ) {
1386 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1387 }
1388 }
1389 } else if ( pr->nomerge ) {
1390 kmp_int32 last;
1391 T start;
1392 UT limit, trip, init;
1393 ST incr;
1394 T chunk = pr->u.p.parm1;
1395
1396 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1397
1398 init = chunk * pr->u.p.count++;
1399 trip = pr->u.p.tc - 1;
1400
1401 if ( (status = (init <= trip)) == 0 ) {
1402 *p_lb = 0;
1403 *p_ub = 0;
1404// if ( p_last != NULL )
1405// *p_last = 0;
1406 if ( p_st != NULL )
1407 *p_st = 0;
1408 if ( __kmp_env_consistency_check ) {
1409 if ( pr->pushed_ws != ct_none ) {
1410 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1411 }
1412 }
1413 } else {
1414 start = pr->u.p.lb;
1415 limit = chunk + init - 1;
1416 incr = pr->u.p.st;
1417
1418 if ( (last = (limit >= trip)) != 0 ) {
1419 limit = trip;
1420 #if KMP_OS_WINDOWS
1421 pr->u.p.last_upper = pr->u.p.ub;
1422 #endif /* KMP_OS_WINDOWS */
1423 }
1424 if ( p_last != NULL )
1425 *p_last = last;
1426 if ( p_st != NULL )
1427 *p_st = incr;
1428 if ( incr == 1 ) {
1429 *p_lb = start + init;
1430 *p_ub = start + limit;
1431 } else {
1432 *p_lb = start + init * incr;
1433 *p_ub = start + limit * incr;
1434 }
1435
1436 if ( pr->ordered ) {
1437 pr->u.p.ordered_lower = init;
1438 pr->u.p.ordered_upper = limit;
1439 #ifdef KMP_DEBUG
1440 {
1441 const char * buff;
1442 // create format specifiers before the debug output
1443 buff = __kmp_str_format(
1444 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1445 traits_t< UT >::spec, traits_t< UT >::spec );
1446 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1447 __kmp_str_free( &buff );
1448 }
1449 #endif
1450 } // if
1451 } // if
1452 } else {
1453 pr->u.p.tc = 0;
1454 *p_lb = pr->u.p.lb;
1455 *p_ub = pr->u.p.ub;
1456 #if KMP_OS_WINDOWS
1457 pr->u.p.last_upper = *p_ub;
1458 #endif /* KMP_OS_WINDOWS */
1459 if ( p_last != NULL )
1460 *p_last = TRUE;
1461 if ( p_st != NULL )
1462 *p_st = pr->u.p.st;
1463 } // if
1464 #ifdef KMP_DEBUG
1465 {
1466 const char * buff;
1467 // create format specifiers before the debug output
1468 buff = __kmp_str_format(
1469 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1470 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1471 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1472 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1473 __kmp_str_free( &buff );
1474 }
1475 #endif
1476#if INCLUDE_SSC_MARKS
1477 SSC_MARK_DISPATCH_NEXT();
1478#endif
1479 return status;
1480 } else {
1481 kmp_int32 last = 0;
1482 dispatch_shared_info_template< UT > *sh;
1483 T start;
1484 ST incr;
1485 UT limit, trip, init;
1486
1487 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1488 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1489
1490 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1491 ( th->th.th_dispatch->th_dispatch_pr_current );
1492 KMP_DEBUG_ASSERT( pr );
1493 sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1494 ( th->th.th_dispatch->th_dispatch_sh_current );
1495 KMP_DEBUG_ASSERT( sh );
1496
1497 if ( pr->u.p.tc == 0 ) {
1498 // zero trip count
1499 status = 0;
1500 } else {
1501 switch (pr->schedule) {
1502 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1503 case kmp_sch_static_steal:
1504 {
1505 T chunk = pr->u.p.parm1;
1506
1507 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1508
1509 trip = pr->u.p.tc - 1;
1510
1511 if ( ___kmp_size_type > 4 ) {
1512 // Other threads do not look into the data of this thread,
1513 // so it's not necessary to make volatile casting.
1514 init = ( pr->u.p.count )++;
1515 status = ( init < (UT)pr->u.p.ub );
1516 } else {
1517 typedef union {
1518 struct {
1519 UT count;
1520 T ub;
1521 } p;
1522 kmp_int64 b;
1523 } union_i4;
1524 // All operations on 'count' or 'ub' must be combined atomically together.
1525 // stealing implemented only for 4-byte indexes
1526 {
1527 union_i4 vold, vnew;
1528 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1529 vnew = vold;
1530 vnew.p.count++;
1531 while( ! KMP_COMPARE_AND_STORE_ACQ64(
1532 ( volatile kmp_int64* )&pr->u.p.count,
1533 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1534 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1535 KMP_CPU_PAUSE();
1536 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1537 vnew = vold;
1538 vnew.p.count++;
1539 }
1540 vnew = vold;
1541 init = vnew.p.count;
1542 status = ( init < (UT)vnew.p.ub ) ;
1543 }
1544
1545 if( !status ) {
1546 kmp_info_t **other_threads = team->t.t_threads;
1547 int while_limit = 10;
1548 int while_index = 0;
1549
1550 // TODO: algorithm of searching for a victim
1551 // should be cleaned up and measured
1552 while ( ( !status ) && ( while_limit != ++while_index ) ) {
1553 union_i4 vold, vnew;
1554 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1555 T victimIdx = pr->u.p.parm4;
1556 T oldVictimIdx = victimIdx;
1557 dispatch_private_info_template< T > * victim;
1558
1559 do {
1560 if( !victimIdx ) {
1561 victimIdx = team->t.t_nproc - 1;
1562 } else {
1563 --victimIdx;
1564 }
1565 victim = reinterpret_cast< dispatch_private_info_template< T >* >
1566 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1567 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1568 // TODO: think about a proper place of this test
1569 if ( ( !victim ) ||
1570 ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1571 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1572 // TODO: delay would be nice
1573 continue;
1574 // the victim is not ready yet to participate in stealing
1575 // because the victim is still in kmp_init_dispatch
1576 }
1577 if ( oldVictimIdx == victimIdx ) {
1578 break;
1579 }
1580 pr->u.p.parm4 = victimIdx;
1581
1582 while( 1 ) {
1583 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1584 vnew = vold;
1585
1586 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1587 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1588 break;
1589 }
1590 vnew.p.ub -= (remaining >> 2);
1591 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1592 #pragma warning( push )
1593 // disable warning on pointless comparison of unsigned with 0
1594 #pragma warning( disable: 186 )
1595 KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1596 #pragma warning( pop )
1597 // TODO: Should this be acquire or release?
1598 if ( KMP_COMPARE_AND_STORE_ACQ64(
1599 ( volatile kmp_int64 * )&victim->u.p.count,
1600 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1601 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1602 status = 1;
1603 while_index = 0;
1604 // now update own count and ub
1605 #if KMP_ARCH_X86
1606 // stealing executed on non-KMP_ARCH_X86 only
1607 // Atomic 64-bit write on ia32 is
1608 // unavailable, so we do this in steps.
1609 // This code is not tested.
1610 init = vold.p.count;
1611 pr->u.p.ub = 0;
1612 pr->u.p.count = init + 1;
1613 pr->u.p.ub = vnew.p.count;
1614 #else
1615 init = vnew.p.ub;
1616 vold.p.count = init + 1;
1617 // TODO: is it safe and enough?
1618 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1619 #endif // KMP_ARCH_X86
1620 break;
1621 } // if
1622 KMP_CPU_PAUSE();
1623 } // while (1)
1624 } // while
1625 } // if
1626 } // if
1627 if ( !status ) {
1628 *p_lb = 0;
1629 *p_ub = 0;
1630 if ( p_st != NULL ) *p_st = 0;
1631 } else {
1632 start = pr->u.p.parm2;
1633 init *= chunk;
1634 limit = chunk + init - 1;
1635 incr = pr->u.p.st;
1636
1637 KMP_DEBUG_ASSERT(init <= trip);
1638 if ( (last = (limit >= trip)) != 0 )
1639 limit = trip;
1640 if ( p_st != NULL ) *p_st = incr;
1641
1642 if ( incr == 1 ) {
1643 *p_lb = start + init;
1644 *p_ub = start + limit;
1645 } else {
1646 *p_lb = start + init * incr;
1647 *p_ub = start + limit * incr;
1648 }
1649
1650 if ( pr->ordered ) {
1651 pr->u.p.ordered_lower = init;
1652 pr->u.p.ordered_upper = limit;
1653 #ifdef KMP_DEBUG
1654 {
1655 const char * buff;
1656 // create format specifiers before the debug output
1657 buff = __kmp_str_format(
1658 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1659 traits_t< UT >::spec, traits_t< UT >::spec );
1660 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1661 __kmp_str_free( &buff );
1662 }
1663 #endif
1664 } // if
1665 } // if
1666 break;
1667 } // case
1668 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1669 case kmp_sch_static_balanced:
1670 {
1671 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1672 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1673 pr->u.p.count = 1;
1674 *p_lb = pr->u.p.lb;
1675 *p_ub = pr->u.p.ub;
1676 last = pr->u.p.parm1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001677 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001678 *p_st = pr->u.p.st;
1679 } else { /* no iterations to do */
1680 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1681 }
1682 if ( pr->ordered ) {
1683 #ifdef KMP_DEBUG
1684 {
1685 const char * buff;
1686 // create format specifiers before the debug output
1687 buff = __kmp_str_format(
1688 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1689 traits_t< UT >::spec, traits_t< UT >::spec );
1690 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1691 __kmp_str_free( &buff );
1692 }
1693 #endif
1694 } // if
1695 } // case
1696 break;
1697 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1698 case kmp_sch_static_chunked:
1699 {
1700 T parm1;
1701
1702 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1703 gtid ) );
1704 parm1 = pr->u.p.parm1;
1705
1706 trip = pr->u.p.tc - 1;
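                // Chunks are assigned round-robin: chunk index = count + tid, where the
                // per-thread count advances by nproc after each chunk (see below), so
                // init is the first iteration of this thread's next chunk.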
1707 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1708
1709 if ( (status = (init <= trip)) != 0 ) {
1710 start = pr->u.p.lb;
1711 incr = pr->u.p.st;
1712 limit = parm1 + init - 1;
1713
1714 if ( (last = (limit >= trip)) != 0 )
1715 limit = trip;
1716
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001717 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001718
1719 pr->u.p.count += team->t.t_nproc;
1720
1721 if ( incr == 1 ) {
1722 *p_lb = start + init;
1723 *p_ub = start + limit;
1724 }
1725 else {
1726 *p_lb = start + init * incr;
1727 *p_ub = start + limit * incr;
1728 }
1729
1730 if ( pr->ordered ) {
1731 pr->u.p.ordered_lower = init;
1732 pr->u.p.ordered_upper = limit;
1733 #ifdef KMP_DEBUG
1734 {
1735 const char * buff;
1736 // create format specifiers before the debug output
1737 buff = __kmp_str_format(
1738 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1739 traits_t< UT >::spec, traits_t< UT >::spec );
1740 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1741 __kmp_str_free( &buff );
1742 }
1743 #endif
1744 } // if
1745 } // if
1746 } // case
1747 break;
1748
1749 case kmp_sch_dynamic_chunked:
1750 {
1751 T chunk = pr->u.p.parm1;
1752
1753 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1754 gtid ) );
1755
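                // Atomically claim the next chunk index from the shared iteration
                // counter; init is that chunk's first iteration in the normalized
                // (zero-based, unit-stride) iteration space.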
1756 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1757 trip = pr->u.p.tc - 1;
1758
1759 if ( (status = (init <= trip)) == 0 ) {
1760 *p_lb = 0;
1761 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001762 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001763 } else {
1764 start = pr->u.p.lb;
1765 limit = chunk + init - 1;
1766 incr = pr->u.p.st;
1767
1768 if ( (last = (limit >= trip)) != 0 )
1769 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001770
1771 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001772
1773 if ( incr == 1 ) {
1774 *p_lb = start + init;
1775 *p_ub = start + limit;
1776 } else {
1777 *p_lb = start + init * incr;
1778 *p_ub = start + limit * incr;
1779 }
1780
1781 if ( pr->ordered ) {
1782 pr->u.p.ordered_lower = init;
1783 pr->u.p.ordered_upper = limit;
1784 #ifdef KMP_DEBUG
1785 {
1786 const char * buff;
1787 // create format specifiers before the debug output
1788 buff = __kmp_str_format(
1789 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1790 traits_t< UT >::spec, traits_t< UT >::spec );
1791 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1792 __kmp_str_free( &buff );
1793 }
1794 #endif
1795 } // if
1796 } // if
1797 } // case
1798 break;
1799
1800 case kmp_sch_guided_iterative_chunked:
1801 {
1802 T chunkspec = pr->u.p.parm1;
1803 KD_TRACE(100,
1804 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1805 trip = pr->u.p.tc;
1806 // Start atomic part of calculations
1807 while(1) {
1808 ST remaining; // signed, because can be < 0
1809 init = sh->u.s.iteration; // shared value
1810 remaining = trip - init;
1811 if ( remaining <= 0 ) { // AC: need to compare with 0 first
1812 // nothing to do, don't try atomic op
1813 status = 0;
1814 break;
1815 }
1816 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1817                         // use dynamic-style schedule
1818                         // atomically increment the iteration count, get the old value
1819 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1820 remaining = trip - init;
1821 if (remaining <= 0) {
1822 status = 0; // all iterations got by other threads
1823 } else {
1824 // got some iterations to work on
1825 status = 1;
1826 if ( (T)remaining > chunkspec ) {
1827 limit = init + chunkspec - 1;
1828 } else {
1829 last = 1; // the last chunk
1830 limit = init + remaining - 1;
1831 } // if
1832 } // if
1833 break;
1834 } // if
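                    // Guided step: grab a fixed fraction of the remaining iterations.
                    // parm3 holds a precomputed double factor (roughly 1/(K*nproc))
                    // stored bitwise in the integer field, hence the cast below.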
1835 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1836 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1837 // CAS was successful, chunk obtained
1838 status = 1;
1839 --limit;
1840 break;
1841 } // if
1842 } // while
1843 if ( status != 0 ) {
1844 start = pr->u.p.lb;
1845 incr = pr->u.p.st;
1846 if ( p_st != NULL )
1847 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001848 *p_lb = start + init * incr;
1849 *p_ub = start + limit * incr;
1850 if ( pr->ordered ) {
1851 pr->u.p.ordered_lower = init;
1852 pr->u.p.ordered_upper = limit;
1853 #ifdef KMP_DEBUG
1854 {
1855 const char * buff;
1856 // create format specifiers before the debug output
1857 buff = __kmp_str_format(
1858 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1859 traits_t< UT >::spec, traits_t< UT >::spec );
1860 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1861 __kmp_str_free( &buff );
1862 }
1863 #endif
1864 } // if
1865 } else {
1866 *p_lb = 0;
1867 *p_ub = 0;
1868 if ( p_st != NULL )
1869 *p_st = 0;
1870 } // if
1871 } // case
1872 break;
1873
1874 case kmp_sch_guided_analytical_chunked:
1875 {
1876 T chunkspec = pr->u.p.parm1;
1877 UT chunkIdx;
1878 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1879 /* for storing original FPCW value for Windows* OS on
1880 IA-32 architecture 8-byte version */
1881 unsigned int oldFpcw;
Jim Cownie181b4bb2013-12-23 17:28:57 +00001882 unsigned int fpcwSet = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001883 #endif
1884 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1885 gtid ) );
1886
1887 trip = pr->u.p.tc;
1888
1889 KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1890 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1891
1892 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1893 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1894 if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1895 --trip;
1896 /* use dynamic-style scheduling */
1897 init = chunkIdx * chunkspec + pr->u.p.count;
1898 /* need to verify init > 0 in case of overflow in the above calculation */
1899 if ( (status = (init > 0 && init <= trip)) != 0 ) {
1900 limit = init + chunkspec -1;
1901
1902 if ( (last = (limit >= trip)) != 0 )
1903 limit = trip;
1904 }
1905 break;
1906 } else {
1907 /* use exponential-style scheduling */
1908                         /* The following check is to work around the lack of long double precision on Windows* OS.
1909 This check works around the possible effect that init != 0 for chunkIdx == 0.
1910 */
1911 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1912 /* If we haven't already done so, save original
1913 FPCW and set precision to 64-bit, as Windows* OS
1914 on IA-32 architecture defaults to 53-bit */
1915 if ( !fpcwSet ) {
Jim Cownie181b4bb2013-12-23 17:28:57 +00001916 oldFpcw = _control87(0,0);
1917 _control87(_PC_64,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001918 fpcwSet = 0x30000;
1919 }
1920 #endif
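                        // Chunk number chunkIdx covers the normalized iterations
                        // [trip - remaining(chunkIdx), trip - remaining(chunkIdx+1)),
                        // where remaining(k) is the analytically computed number of
                        // iterations left after k chunks (__kmp_dispatch_guided_remaining).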
1921 if ( chunkIdx ) {
1922 init = __kmp_dispatch_guided_remaining< T >(
1923 trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1924 KMP_DEBUG_ASSERT(init);
1925 init = trip - init;
1926 } else
1927 init = 0;
1928 limit = trip - __kmp_dispatch_guided_remaining< T >(
1929 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1930 KMP_ASSERT(init <= limit);
1931 if ( init < limit ) {
1932 KMP_DEBUG_ASSERT(limit <= trip);
1933 --limit;
1934 status = 1;
1935 break;
1936 } // if
1937 } // if
1938 } // while (1)
1939 #if KMP_OS_WINDOWS && KMP_ARCH_X86
Jim Cownie181b4bb2013-12-23 17:28:57 +00001940 /* restore FPCW if necessary
1941 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1942 */
1943 if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1944 _control87(oldFpcw,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001945 #endif
1946 if ( status != 0 ) {
1947 start = pr->u.p.lb;
1948 incr = pr->u.p.st;
1949 if ( p_st != NULL )
1950 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001951 *p_lb = start + init * incr;
1952 *p_ub = start + limit * incr;
1953 if ( pr->ordered ) {
1954 pr->u.p.ordered_lower = init;
1955 pr->u.p.ordered_upper = limit;
1956 #ifdef KMP_DEBUG
1957 {
1958 const char * buff;
1959 // create format specifiers before the debug output
1960 buff = __kmp_str_format(
1961 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1962 traits_t< UT >::spec, traits_t< UT >::spec );
1963 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1964 __kmp_str_free( &buff );
1965 }
1966 #endif
1967 }
1968 } else {
1969 *p_lb = 0;
1970 *p_ub = 0;
1971 if ( p_st != NULL )
1972 *p_st = 0;
1973 }
1974 } // case
1975 break;
1976
1977 case kmp_sch_trapezoidal:
1978 {
1979 UT index;
1980 T parm2 = pr->u.p.parm2;
1981 T parm3 = pr->u.p.parm3;
1982 T parm4 = pr->u.p.parm4;
1983 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
1984 gtid ) );
1985
1986 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
1987
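                // Trapezoidal chunks shrink linearly: chunk k has parm2 - k*parm4
                // iterations, so the sum of the first `index` chunk sizes (an
                // arithmetic series) gives the first iteration of chunk `index`.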
1988 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
1989 trip = pr->u.p.tc - 1;
1990
1991 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
1992 *p_lb = 0;
1993 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001994 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001995 } else {
1996 start = pr->u.p.lb;
1997 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
1998 incr = pr->u.p.st;
1999
2000 if ( (last = (limit >= trip)) != 0 )
2001 limit = trip;
2002
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002003 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002004
2005 if ( incr == 1 ) {
2006 *p_lb = start + init;
2007 *p_ub = start + limit;
2008 } else {
2009 *p_lb = start + init * incr;
2010 *p_ub = start + limit * incr;
2011 }
2012
2013 if ( pr->ordered ) {
2014 pr->u.p.ordered_lower = init;
2015 pr->u.p.ordered_upper = limit;
2016 #ifdef KMP_DEBUG
2017 {
2018 const char * buff;
2019 // create format specifiers before the debug output
2020 buff = __kmp_str_format(
2021 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2022 traits_t< UT >::spec, traits_t< UT >::spec );
2023 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2024 __kmp_str_free( &buff );
2025 }
2026 #endif
2027 } // if
2028 } // if
2029 } // case
2030 break;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002031 default:
2032 {
2033 status = 0; // to avoid complaints on uninitialized variable use
2034 __kmp_msg(
2035 kmp_ms_fatal, // Severity
2036 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2037 KMP_HNT( GetNewerLibrary ), // Hint
2038 __kmp_msg_null // Variadic argument list terminator
2039 );
2040 }
2041 break;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002042 } // switch
2043 } // if tc == 0;
2044
2045 if ( status == 0 ) {
2046 UT num_done;
2047
2048 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2049 #ifdef KMP_DEBUG
2050 {
2051 const char * buff;
2052 // create format specifiers before the debug output
2053 buff = __kmp_str_format(
2054 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2055 traits_t< UT >::spec );
2056 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2057 __kmp_str_free( &buff );
2058 }
2059 #endif
2060
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002061 if ( (ST)num_done == team->t.t_nproc-1 ) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00002062 /* NOTE: release this buffer to be reused */
2063
2064 KMP_MB(); /* Flush all pending memory write invalidates. */
2065
2066 sh->u.s.num_done = 0;
2067 sh->u.s.iteration = 0;
2068
2069 /* TODO replace with general release procedure? */
2070 if ( pr->ordered ) {
2071 sh->u.s.ordered_iteration = 0;
2072 }
2073
2074 KMP_MB(); /* Flush all pending memory write invalidates. */
2075
2076 sh -> buffer_index += KMP_MAX_DISP_BUF;
2077 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2078 gtid, sh->buffer_index) );
2079
2080 KMP_MB(); /* Flush all pending memory write invalidates. */
2081
2082 } // if
2083 if ( __kmp_env_consistency_check ) {
2084 if ( pr->pushed_ws != ct_none ) {
2085 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2086 }
2087 }
2088
2089 th -> th.th_dispatch -> th_deo_fcn = NULL;
2090 th -> th.th_dispatch -> th_dxo_fcn = NULL;
2091 th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2092 th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2093 } // if (status == 0)
2094#if KMP_OS_WINDOWS
2095 else if ( last ) {
2096 pr->u.p.last_upper = pr->u.p.ub;
2097 }
2098#endif /* KMP_OS_WINDOWS */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002099 if ( p_last != NULL && status != 0 )
2100 *p_last = last;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002101 } // if
2102
2103 #ifdef KMP_DEBUG
2104 {
2105 const char * buff;
2106 // create format specifiers before the debug output
2107 buff = __kmp_str_format(
2108 "__kmp_dispatch_next: T#%%d normal case: " \
2109 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2110 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2111 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2112 __kmp_str_free( &buff );
2113 }
2114 #endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002115#if INCLUDE_SSC_MARKS
2116 SSC_MARK_DISPATCH_NEXT();
2117#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +00002118 return status;
2119}
2120
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002121template< typename T >
2122static void
2123__kmp_dist_get_bounds(
2124 ident_t *loc,
2125 kmp_int32 gtid,
2126 kmp_int32 *plastiter,
2127 T *plower,
2128 T *pupper,
2129 typename traits_t< T >::signed_t incr
2130) {
2131 KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
2132 typedef typename traits_t< T >::unsigned_t UT;
2133 typedef typename traits_t< T >::signed_t ST;
2134 register kmp_uint32 team_id;
2135 register kmp_uint32 nteams;
2136 register UT trip_count;
2137 register kmp_team_t *team;
2138 kmp_info_t * th;
2139
2140 KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2141 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2142 #ifdef KMP_DEBUG
2143 {
2144 const char * buff;
2145 // create format specifiers before the debug output
2146 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2147 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2148 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2149 traits_t< T >::spec );
2150 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2151 __kmp_str_free( &buff );
2152 }
2153 #endif
2154
2155 if( __kmp_env_consistency_check ) {
2156 if( incr == 0 ) {
2157 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2158 }
2159 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2160 // The loop is illegal.
2161 // Some zero-trip loops maintained by compiler, e.g.:
2162 // for(i=10;i<0;++i) // lower >= upper - run-time check
2163 // for(i=0;i>10;--i) // lower <= upper - run-time check
2164 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2165 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2166 // Compiler does not check the following illegal loops:
2167 // for(i=0;i<10;i+=incr) // where incr<0
2168 // for(i=10;i>0;i-=incr) // where incr<0
2169 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2170 }
2171 }
2172 th = __kmp_threads[gtid];
2173 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2174 team = th->th.th_team;
2175 #if OMP_40_ENABLED
2176 nteams = th->th.th_teams_size.nteams;
2177 #endif
2178 team_id = team->t.t_master_tid;
2179 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2180
2181 // compute global trip count
2182 if( incr == 1 ) {
2183 trip_count = *pupper - *plower + 1;
2184 } else if(incr == -1) {
2185 trip_count = *plower - *pupper + 1;
2186 } else {
2187 trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2188 }
2189 if( trip_count <= nteams ) {
2190 KMP_DEBUG_ASSERT(
2191 __kmp_static == kmp_sch_static_greedy || \
2192 __kmp_static == kmp_sch_static_balanced
2193 ); // Unknown static scheduling type.
2194 // only some teams get single iteration, others get nothing
2195        // only some teams get a single iteration, the others get nothing
2196 *pupper = *plower = *plower + team_id * incr;
2197 } else {
2198 *plower = *pupper + incr; // zero-trip loop
2199 }
2200 if( plastiter != NULL )
2201 *plastiter = ( team_id == trip_count - 1 );
2202 } else {
2203 if( __kmp_static == kmp_sch_static_balanced ) {
2204 register UT chunk = trip_count / nteams;
2205 register UT extras = trip_count % nteams;
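            // Balanced split: the first `extras` teams get chunk+1 iterations each;
            // the lower bound skips over everything owned by lower-numbered teams.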
2206 *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2207 *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2208 if( plastiter != NULL )
2209 *plastiter = ( team_id == nteams - 1 );
2210 } else {
2211 register T chunk_inc_count =
2212 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2213 register T upper = *pupper;
2214 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2215 // Unknown static scheduling type.
2216 *plower += team_id * chunk_inc_count;
2217 *pupper = *plower + chunk_inc_count - incr;
2218 // Check/correct bounds if needed
2219 if( incr > 0 ) {
2220 if( *pupper < *plower )
2221 *pupper = i_maxmin< T >::mx;
2222 if( plastiter != NULL )
2223 *plastiter = *plower <= upper && *pupper > upper - incr;
2224 if( *pupper > upper )
2225 *pupper = upper; // tracker C73258
2226 } else {
2227 if( *pupper > *plower )
2228 *pupper = i_maxmin< T >::mn;
2229 if( plastiter != NULL )
2230 *plastiter = *plower >= upper && *pupper < upper - incr;
2231 if( *pupper < upper )
2232 *pupper = upper; // tracker C73258
2233 }
2234 }
2235 }
2236}
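
// Worked example (illustrative only): lower=0, upper=9, incr=1, nteams=4 with the
// balanced split above yields trip_count=10, chunk=2, extras=2, so the teams get
// [0..2], [3..5], [6..7], [8..9] -- the two extra iterations go to teams 0 and 1.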
2237
Jim Cownie5e8470a2013-09-27 10:38:44 +00002238//-----------------------------------------------------------------------------------------
2239// Dispatch routines
2240// Transfer call to template< type T >
2241// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2242// T lb, T ub, ST st, ST chunk )
2243extern "C" {
2244
2245/*!
2246@ingroup WORK_SHARING
2247@{
2248@param loc Source location
2249@param gtid Global thread id
2250@param schedule Schedule type
2251@param lb Lower bound
2252@param ub Upper bound
2253@param st Step (or increment if you prefer)
2254@param chunk The chunk size to block with
2255
2256This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2257These functions are all identical apart from the types of the arguments.
2258*/
2259
2260void
2261__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2262 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2263{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002264 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002265 KMP_DEBUG_ASSERT( __kmp_init_serial );
2266 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2267}
2268/*!
2269See @ref __kmpc_dispatch_init_4
2270*/
2271void
2272__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2273 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2274{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002275 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002276 KMP_DEBUG_ASSERT( __kmp_init_serial );
2277 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2278}
2279
2280/*!
2281See @ref __kmpc_dispatch_init_4
2282*/
2283void
2284__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2285 kmp_int64 lb, kmp_int64 ub,
2286 kmp_int64 st, kmp_int64 chunk )
2287{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002288 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002289 KMP_DEBUG_ASSERT( __kmp_init_serial );
2290 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2291}
2292
2293/*!
2294See @ref __kmpc_dispatch_init_4
2295*/
2296void
2297__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2298 kmp_uint64 lb, kmp_uint64 ub,
2299 kmp_int64 st, kmp_int64 chunk )
2300{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002301 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002302 KMP_DEBUG_ASSERT( __kmp_init_serial );
2303 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2304}
2305
2306/*!
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002307See @ref __kmpc_dispatch_init_4
2308
2309These functions differ from the __kmpc_dispatch_init set of functions in that they
2310are called for the composite distribute parallel for construct. Thus, before
2311dispatching the regular iterations, we first compute the per-team iteration space.
2312
2313These functions are all identical apart from the types of the arguments.
2314*/
2315void
2316__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2317 kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2318{
2319 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2320 KMP_DEBUG_ASSERT( __kmp_init_serial );
2321 __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2322 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2323}
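// Note: the __kmpc_dist_dispatch_init_* entry points first narrow [lb, ub] to this
// team's share via __kmp_dist_get_bounds (which also reports the last-iteration flag
// through p_last), then run the ordinary per-team __kmp_dispatch_init on that range.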
2324
2325void
2326__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2327 kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2328{
2329 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2330 KMP_DEBUG_ASSERT( __kmp_init_serial );
2331 __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2332 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2333}
2334
2335void
2336__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2337 kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2338{
2339 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2340 KMP_DEBUG_ASSERT( __kmp_init_serial );
2341 __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2342 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2343}
2344
2345void
2346__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2347 kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2348{
2349 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2350 KMP_DEBUG_ASSERT( __kmp_init_serial );
2351 __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2352 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2353}
2354
2355/*!
Jim Cownie5e8470a2013-09-27 10:38:44 +00002356@param loc Source code location
2357@param gtid Global thread id
2358@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2359@param p_lb Pointer to the lower bound for the next chunk of work
2360@param p_ub Pointer to the upper bound for the next chunk of work
2361@param p_st Pointer to the stride for the next chunk of work
2362@return one if there is work to be done, zero otherwise
2363
2364Get the next dynamically allocated chunk of work for this thread.
2365If there is no more work, then the lb,ub and stride need not be modified.
2366*/
2367int
2368__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2369 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2370{
2371 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2372}
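
/*
  Illustrative use of the dispatch API (a sketch, not the exact code any particular
  compiler emits). For a loop such as

      #pragma omp for schedule(dynamic, 4)
      for (kmp_int32 i = lo; i <= hi; ++i) body(i);

  the generated code follows roughly this pattern:

      kmp_int32 lb, ub, st, last;
      __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, lo, hi, 1, 4);
      while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
          for (kmp_int32 i = lb; i <= ub; i += st)
              body(i);              // lb and ub are inclusive bounds for this chunk
      }
      // a zero return means there is no more work for this thread

  Here loc, gtid, lo, hi and body() stand in for values the compiler supplies.
*/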
2373
2374/*!
2375See @ref __kmpc_dispatch_next_4
2376*/
2377int
2378__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2379 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2380{
2381 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2382}
2383
2384/*!
2385See @ref __kmpc_dispatch_next_4
2386*/
2387int
2388__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2389 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2390{
2391 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2392}
2393
2394/*!
2395See @ref __kmpc_dispatch_next_4
2396*/
2397int
2398__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2399 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2400{
2401 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2402}
2403
2404/*!
2405@param loc Source code location
2406@param gtid Global thread id
2407
2408Mark the end of a dynamic loop.
2409*/
2410void
2411__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2412{
2413 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2414}
2415
2416/*!
2417See @ref __kmpc_dispatch_fini_4
2418*/
2419void
2420__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2421{
2422 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2423}
2424
2425/*!
2426See @ref __kmpc_dispatch_fini_4
2427*/
2428void
2429__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2430{
2431 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2432}
2433
2434/*!
2435See @ref __kmpc_dispatch_fini_4
2436*/
2437void
2438__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2439{
2440 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2441}
2442/*! @} */
2443
2444//-----------------------------------------------------------------------------------------
2445// Non-template routines from kmp_dispatch.cpp used in other sources
2446
2447kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2448 return value == checker;
2449}
2450
2451kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2452 return value != checker;
2453}
2454
2455kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2456 return value < checker;
2457}
2458
2459kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2460 return value >= checker;
2461}
2462
2463kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2464 return value <= checker;
2465}
2466kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2467 return value == checker;
2468}
2469
2470kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2471 return value != checker;
2472}
2473
2474kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2475 return value < checker;
2476}
2477
2478kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2479 return value >= checker;
2480}
2481
2482kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2483 return value <= checker;
2484}
2485
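// Spin until pred(*spinner, checker) becomes true, yielding to the OS when the
// machine is oversubscribed or after spinning for a while; the value of *spinner
// that satisfied the predicate is returned to the caller.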
2486kmp_uint32
2487__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2488 kmp_uint32 checker,
2489 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2490 , void * obj // Higher-level synchronization object, or NULL.
2491 )
2492{
2493 // note: we may not belong to a team at this point
2494 register volatile kmp_uint32 * spin = spinner;
2495 register kmp_uint32 check = checker;
2496 register kmp_uint32 spins;
2497 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2498 register kmp_uint32 r;
2499
2500 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2501 KMP_INIT_YIELD( spins );
2502 // main wait spin loop
2503 while(!f(r = TCR_4(*spin), check)) {
2504 KMP_FSYNC_SPIN_PREPARE( obj );
2505 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2506 It causes problems with infinite recursion because of exit lock */
2507 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2508 __kmp_abort_thread(); */
2509
Jim Cownie5e8470a2013-09-27 10:38:44 +00002510 /* if we have waited a bit, or are oversubscribed, yield */
2511 /* pause is in the following code */
2512 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2513 KMP_YIELD_SPIN( spins );
2514 }
2515 KMP_FSYNC_SPIN_ACQUIRED( obj );
2516 return r;
2517}
2518
2519kmp_uint64
2520__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2521 kmp_uint64 checker,
2522 kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2523 , void * obj // Higher-level synchronization object, or NULL.
2524 )
2525{
2526 // note: we may not belong to a team at this point
2527 register volatile kmp_uint64 * spin = spinner;
2528 register kmp_uint64 check = checker;
2529 register kmp_uint32 spins;
2530 register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2531 register kmp_uint64 r;
2532
2533 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2534 KMP_INIT_YIELD( spins );
2535 // main wait spin loop
2536 while(!f(r = *spin, check))
2537 {
2538 KMP_FSYNC_SPIN_PREPARE( obj );
2539 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2540 It causes problems with infinite recursion because of exit lock */
2541 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2542 __kmp_abort_thread(); */
2543
Jim Cownie5e8470a2013-09-27 10:38:44 +00002544 // if we are oversubscribed,
2545     // or have waited a bit (and KMP_LIBRARY=throughput), then yield
2546 // pause is in the following code
2547 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2548 KMP_YIELD_SPIN( spins );
2549 }
2550 KMP_FSYNC_SPIN_ACQUIRED( obj );
2551 return r;
2552}
2553
2554} // extern "C"
2555
2556#ifdef KMP_GOMP_COMPAT
2557
2558void
2559__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2560 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2561 kmp_int32 chunk, int push_ws )
2562{
2563 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2564 push_ws );
2565}
2566
2567void
2568__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2569 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2570 kmp_int32 chunk, int push_ws )
2571{
2572 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2573 push_ws );
2574}
2575
2576void
2577__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2578 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2579 kmp_int64 chunk, int push_ws )
2580{
2581 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2582 push_ws );
2583}
2584
2585void
2586__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2587 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2588 kmp_int64 chunk, int push_ws )
2589{
2590 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2591 push_ws );
2592}
2593
2594void
2595__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2596{
2597 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2598}
2599
2600void
2601__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2602{
2603 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2604}
2605
2606void
2607__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2608{
2609 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2610}
2611
2612void
2613__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2614{
2615 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2616}
2617
2618#endif /* KMP_GOMP_COMPAT */
2619
2620/* ------------------------------------------------------------------------ */
2621/* ------------------------------------------------------------------------ */
2622