blob: cc58f493a69a3f31bd752d195ed9f8ef9781a12c [file] [log] [blame]
Jim Cownie5e8470a2013-09-27 10:38:44 +00001/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
Jim Cownie4cc4bb42014-10-07 16:25:50 +00003 * $Revision: 43457 $
4 * $Date: 2014-09-17 03:57:22 -0500 (Wed, 17 Sep 2014) $
Jim Cownie5e8470a2013-09-27 10:38:44 +00005 */
6
7
8//===----------------------------------------------------------------------===//
9//
10// The LLVM Compiler Infrastructure
11//
12// This file is dual licensed under the MIT and the University of Illinois Open
13// Source Licenses. See LICENSE.txt for details.
14//
15//===----------------------------------------------------------------------===//
16
17
18/*
19 * Dynamic scheduling initialization and dispatch.
20 *
21 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
22 * it may change values between parallel regions. __kmp_max_nth
23 * is the largest value __kmp_nth may take, 1 is the smallest.
24 *
25 */
26
27/* ------------------------------------------------------------------------ */
28/* ------------------------------------------------------------------------ */
29
30#include "kmp.h"
31#include "kmp_i18n.h"
32#include "kmp_itt.h"
33#include "kmp_str.h"
34#include "kmp_error.h"
Jim Cownie4cc4bb42014-10-07 16:25:50 +000035#include "kmp_stats.h"
Jim Cownie5e8470a2013-09-27 10:38:44 +000036#if KMP_OS_WINDOWS && KMP_ARCH_X86
37 #include <float.h>
38#endif
39
40/* ------------------------------------------------------------------------ */
41/* ------------------------------------------------------------------------ */
42
Jim Cownie4cc4bb42014-10-07 16:25:50 +000043// template for type limits
44template< typename T >
45struct i_maxmin {
46 static const T mx;
47 static const T mn;
48};
49template<>
50struct i_maxmin< int > {
51 static const int mx = 0x7fffffff;
52 static const int mn = 0x80000000;
53};
54template<>
55struct i_maxmin< unsigned int > {
56 static const unsigned int mx = 0xffffffff;
57 static const unsigned int mn = 0x00000000;
58};
59template<>
60struct i_maxmin< long long > {
61 static const long long mx = 0x7fffffffffffffffLL;
62 static const long long mn = 0x8000000000000000LL;
63};
64template<>
65struct i_maxmin< unsigned long long > {
66 static const unsigned long long mx = 0xffffffffffffffffLL;
67 static const unsigned long long mn = 0x0000000000000000LL;
68};
69//-------------------------------------------------------------------------
70
Jim Cownie5e8470a2013-09-27 10:38:44 +000071#ifdef KMP_STATIC_STEAL_ENABLED
72
73 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
74 template< typename T >
75 struct dispatch_private_infoXX_template {
76 typedef typename traits_t< T >::unsigned_t UT;
77 typedef typename traits_t< T >::signed_t ST;
78 UT count; // unsigned
79 T ub;
80 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
81 T lb;
82 ST st; // signed
83 UT tc; // unsigned
84 T static_steal_counter; // for static_steal only; maybe better to put after ub
85
86 /* parm[1-4] are used in different ways by different scheduling algorithms */
87
88 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
89 // a) parm3 is properly aligned and
90 // b) all parm1-4 are in the same cache line.
91 // Because of parm1-4 are used together, performance seems to be better
92 // if they are in the same line (not measured though).
93
94 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
95 T parm1;
96 T parm2;
97 T parm3;
98 T parm4;
99 };
100
101 UT ordered_lower; // unsigned
102 UT ordered_upper; // unsigned
103 #if KMP_OS_WINDOWS
104 T last_upper;
105 #endif /* KMP_OS_WINDOWS */
106 };
107
108#else /* KMP_STATIC_STEAL_ENABLED */
109
110 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
111 template< typename T >
112 struct dispatch_private_infoXX_template {
113 typedef typename traits_t< T >::unsigned_t UT;
114 typedef typename traits_t< T >::signed_t ST;
115 T lb;
116 T ub;
117 ST st; // signed
118 UT tc; // unsigned
119
120 T parm1;
121 T parm2;
122 T parm3;
123 T parm4;
124
125 UT count; // unsigned
126
127 UT ordered_lower; // unsigned
128 UT ordered_upper; // unsigned
129 #if KMP_OS_WINDOWS
130 T last_upper;
131 #endif /* KMP_OS_WINDOWS */
132 };
133
134#endif /* KMP_STATIC_STEAL_ENABLED */
135
136// replaces dispatch_private_info structure and dispatch_private_info_t type
137template< typename T >
138struct KMP_ALIGN_CACHE dispatch_private_info_template {
139 // duplicate alignment here, otherwise size of structure is not correct in our compiler
140 union KMP_ALIGN_CACHE private_info_tmpl {
141 dispatch_private_infoXX_template< T > p;
142 dispatch_private_info64_t p64;
143 } u;
144 enum sched_type schedule; /* scheduling algorithm */
145 kmp_uint32 ordered; /* ordered clause specified */
146 kmp_uint32 ordered_bumped;
147 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making order
148 dispatch_private_info * next; /* stack of buffers for nest of serial regions */
149 kmp_uint32 nomerge; /* don't merge iters if serialized */
150 kmp_uint32 type_size;
151 enum cons_type pushed_ws;
152};
153
154
155// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
156template< typename UT >
157struct dispatch_shared_infoXX_template {
158 /* chunk index under dynamic, number of idle threads under static-steal;
159 iteration index otherwise */
160 volatile UT iteration;
161 volatile UT num_done;
162 volatile UT ordered_iteration;
163 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
164};
165
166// replaces dispatch_shared_info structure and dispatch_shared_info_t type
167template< typename UT >
168struct dispatch_shared_info_template {
169 // we need union here to keep the structure size
170 union shared_info_tmpl {
171 dispatch_shared_infoXX_template< UT > s;
172 dispatch_shared_info64_t s64;
173 } u;
174 volatile kmp_uint32 buffer_index;
175};
176
177/* ------------------------------------------------------------------------ */
178/* ------------------------------------------------------------------------ */
179
Jim Cownie5e8470a2013-09-27 10:38:44 +0000180#undef USE_TEST_LOCKS
181
182// test_then_add template (general template should NOT be used)
183template< typename T >
184static __forceinline T
185test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
186
187template<>
188__forceinline kmp_int32
189test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
190{
191 kmp_int32 r;
192 r = KMP_TEST_THEN_ADD32( p, d );
193 return r;
194}
195
196template<>
197__forceinline kmp_int64
198test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
199{
200 kmp_int64 r;
201 r = KMP_TEST_THEN_ADD64( p, d );
202 return r;
203}
204
205// test_then_inc_acq template (general template should NOT be used)
206template< typename T >
207static __forceinline T
208test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
209
210template<>
211__forceinline kmp_int32
212test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
213{
214 kmp_int32 r;
215 r = KMP_TEST_THEN_INC_ACQ32( p );
216 return r;
217}
218
219template<>
220__forceinline kmp_int64
221test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
222{
223 kmp_int64 r;
224 r = KMP_TEST_THEN_INC_ACQ64( p );
225 return r;
226}
227
228// test_then_inc template (general template should NOT be used)
229template< typename T >
230static __forceinline T
231test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
232
233template<>
234__forceinline kmp_int32
235test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
236{
237 kmp_int32 r;
238 r = KMP_TEST_THEN_INC32( p );
239 return r;
240}
241
242template<>
243__forceinline kmp_int64
244test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
245{
246 kmp_int64 r;
247 r = KMP_TEST_THEN_INC64( p );
248 return r;
249}
250
251// compare_and_swap template (general template should NOT be used)
252template< typename T >
253static __forceinline kmp_int32
254compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
255
256template<>
257__forceinline kmp_int32
258compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
259{
260 return KMP_COMPARE_AND_STORE_REL32( p, c, s );
261}
262
263template<>
264__forceinline kmp_int32
265compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
266{
267 return KMP_COMPARE_AND_STORE_REL64( p, c, s );
268}
269
270/*
271 Spin wait loop that first does pause, then yield.
272 Waits until function returns non-zero when called with *spinner and check.
273 Does NOT put threads to sleep.
274#if USE_ITT_BUILD
275 Arguments:
Alp Toker8f2d3f02014-02-24 10:40:15 +0000276 obj -- is higher-level synchronization object to report to ittnotify. It is used to report
Jim Cownie5e8470a2013-09-27 10:38:44 +0000277 locks consistently. For example, if lock is acquired immediately, its address is
278 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, it lock cannot be acquired
279 immediately and lock routine calls to KMP_WAIT_YIELD(), the later should report the same
280 address, not an address of low-level spinner.
281#endif // USE_ITT_BUILD
282*/
283template< typename UT >
284// ToDo: make inline function (move to header file for icl)
285static UT // unsigned 4- or 8-byte type
286__kmp_wait_yield( volatile UT * spinner,
287 UT checker,
288 kmp_uint32 (* pred)( UT, UT )
289 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
290 )
291{
292 // note: we may not belong to a team at this point
293 register volatile UT * spin = spinner;
294 register UT check = checker;
295 register kmp_uint32 spins;
296 register kmp_uint32 (*f) ( UT, UT ) = pred;
297 register UT r;
298
299 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
300 KMP_INIT_YIELD( spins );
301 // main wait spin loop
302 while(!f(r = *spin, check))
303 {
304 KMP_FSYNC_SPIN_PREPARE( obj );
305 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
306 It causes problems with infinite recursion because of exit lock */
307 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
308 __kmp_abort_thread(); */
309
Jim Cownie5e8470a2013-09-27 10:38:44 +0000310 // if we are oversubscribed,
311 // or have waited a bit (and KMP_LIBRARY=throughput, then yield
312 // pause is in the following code
313 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
314 KMP_YIELD_SPIN( spins );
315 }
316 KMP_FSYNC_SPIN_ACQUIRED( obj );
317 return r;
318}
319
320template< typename UT >
321static kmp_uint32 __kmp_eq( UT value, UT checker) {
322 return value == checker;
323}
324
325template< typename UT >
326static kmp_uint32 __kmp_neq( UT value, UT checker) {
327 return value != checker;
328}
329
330template< typename UT >
331static kmp_uint32 __kmp_lt( UT value, UT checker) {
332 return value < checker;
333}
334
335template< typename UT >
336static kmp_uint32 __kmp_ge( UT value, UT checker) {
337 return value >= checker;
338}
339
340template< typename UT >
341static kmp_uint32 __kmp_le( UT value, UT checker) {
342 return value <= checker;
343}
344
345
346/* ------------------------------------------------------------------------ */
347/* ------------------------------------------------------------------------ */
348
349static void
350__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
351{
352 kmp_info_t *th;
353
354 KMP_DEBUG_ASSERT( gtid_ref );
355
356 if ( __kmp_env_consistency_check ) {
357 th = __kmp_threads[*gtid_ref];
358 if ( th -> th.th_root -> r.r_active
359 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
360 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
361 }
362 }
363}
364
365template< typename UT >
366static void
367__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
368{
369 typedef typename traits_t< UT >::signed_t ST;
370 dispatch_private_info_template< UT > * pr;
371
372 int gtid = *gtid_ref;
373// int cid = *cid_ref;
374 kmp_info_t *th = __kmp_threads[ gtid ];
375 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
376
377 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
378 if ( __kmp_env_consistency_check ) {
379 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
380 ( th -> th.th_dispatch -> th_dispatch_pr_current );
381 if ( pr -> pushed_ws != ct_none ) {
382 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
383 }
384 }
385
386 if ( ! th -> th.th_team -> t.t_serialized ) {
387 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
388 ( th -> th.th_dispatch -> th_dispatch_sh_current );
389 UT lower;
390
391 if ( ! __kmp_env_consistency_check ) {
392 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
393 ( th -> th.th_dispatch -> th_dispatch_pr_current );
394 }
395 lower = pr->u.p.ordered_lower;
396
397 #if ! defined( KMP_GOMP_COMPAT )
398 if ( __kmp_env_consistency_check ) {
399 if ( pr->ordered_bumped ) {
400 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
401 __kmp_error_construct2(
402 kmp_i18n_msg_CnsMultipleNesting,
403 ct_ordered_in_pdo, loc_ref,
404 & p->stack_data[ p->w_top ]
405 );
406 }
407 }
408 #endif /* !defined(KMP_GOMP_COMPAT) */
409
410 KMP_MB();
411 #ifdef KMP_DEBUG
412 {
413 const char * buff;
414 // create format specifiers before the debug output
415 buff = __kmp_str_format(
416 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
417 traits_t< UT >::spec, traits_t< UT >::spec );
418 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
419 __kmp_str_free( &buff );
420 }
421 #endif
422
423 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
424 USE_ITT_BUILD_ARG( NULL )
425 );
426 KMP_MB(); /* is this necessary? */
427 #ifdef KMP_DEBUG
428 {
429 const char * buff;
430 // create format specifiers before the debug output
431 buff = __kmp_str_format(
432 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
433 traits_t< UT >::spec, traits_t< UT >::spec );
434 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
435 __kmp_str_free( &buff );
436 }
437 #endif
438 }
439 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
440}
441
442static void
443__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
444{
445 kmp_info_t *th;
446
447 if ( __kmp_env_consistency_check ) {
448 th = __kmp_threads[*gtid_ref];
449 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
450 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
451 }
452 }
453}
454
455template< typename UT >
456static void
457__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
458{
459 typedef typename traits_t< UT >::signed_t ST;
460 dispatch_private_info_template< UT > * pr;
461
462 int gtid = *gtid_ref;
463// int cid = *cid_ref;
464 kmp_info_t *th = __kmp_threads[ gtid ];
465 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
466
467 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
468 if ( __kmp_env_consistency_check ) {
469 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
470 ( th -> th.th_dispatch -> th_dispatch_pr_current );
471 if ( pr -> pushed_ws != ct_none ) {
472 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
473 }
474 }
475
476 if ( ! th -> th.th_team -> t.t_serialized ) {
477 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
478 ( th -> th.th_dispatch -> th_dispatch_sh_current );
479
480 if ( ! __kmp_env_consistency_check ) {
481 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
482 ( th -> th.th_dispatch -> th_dispatch_pr_current );
483 }
484
485 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
486 #if ! defined( KMP_GOMP_COMPAT )
487 if ( __kmp_env_consistency_check ) {
488 if ( pr->ordered_bumped != 0 ) {
489 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
490 /* How to test it? - OM */
491 __kmp_error_construct2(
492 kmp_i18n_msg_CnsMultipleNesting,
493 ct_ordered_in_pdo, loc_ref,
494 & p->stack_data[ p->w_top ]
495 );
496 }
497 }
498 #endif /* !defined(KMP_GOMP_COMPAT) */
499
500 KMP_MB(); /* Flush all pending memory write invalidates. */
501
502 pr->ordered_bumped += 1;
503
504 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
505 gtid, pr->ordered_bumped ) );
506
507 KMP_MB(); /* Flush all pending memory write invalidates. */
508
509 /* TODO use general release procedure? */
510 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
511
512 KMP_MB(); /* Flush all pending memory write invalidates. */
513 }
514 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
515}
516
517/* Computes and returns x to the power of y, where y must a non-negative integer */
518template< typename UT >
519static __forceinline long double
520__kmp_pow(long double x, UT y) {
521 long double s=1.0L;
522
523 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
524 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
525 while(y) {
526 if ( y & 1 )
527 s *= x;
528 x *= x;
529 y >>= 1;
530 }
531 return s;
532}
533
534/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
535 (the total number of unassigned iterations in chunks with index greater than or equal to idx).
536 __forceinline seems to be broken so that if we __forceinline this function, the behavior is wrong
537 (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
538*/
539template< typename T >
540static __inline typename traits_t< T >::unsigned_t
541__kmp_dispatch_guided_remaining(
542 T tc,
543 typename traits_t< T >::floating_t base,
544 typename traits_t< T >::unsigned_t idx
545) {
546 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
547 least for ICL 8.1, long double arithmetic may not really have
548 long double precision, even with /Qlong_double. Currently, we
549 workaround that in the caller code, by manipulating the FPCW for
550 Windows* OS on IA-32 architecture. The lack of precision is not
551 expected to be a correctness issue, though.
552 */
553 typedef typename traits_t< T >::unsigned_t UT;
554
555 long double x = tc * __kmp_pow< UT >(base, idx);
556 UT r = (UT) x;
557 if ( x == r )
558 return r;
559 return r + 1;
560}
561
562// Parameters of the guided-iterative algorithm:
563// p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
564// p3 = 1 / ( n * nproc ) // remaining iterations multiplier
565// by default n = 2. For example with n = 3 the chunks distribution will be more flat.
566// With n = 1 first chunk is the same as for static schedule, e.g. trip / nproc.
567static int guided_int_param = 2;
568static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
569
570// UT - unsigned flavor of T, ST - signed flavor of T,
571// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
572template< typename T >
573static void
574__kmp_dispatch_init(
575 ident_t * loc,
576 int gtid,
577 enum sched_type schedule,
578 T lb,
579 T ub,
580 typename traits_t< T >::signed_t st,
581 typename traits_t< T >::signed_t chunk,
582 int push_ws
583) {
584 typedef typename traits_t< T >::unsigned_t UT;
585 typedef typename traits_t< T >::signed_t ST;
586 typedef typename traits_t< T >::floating_t DBL;
587 static const int ___kmp_size_type = sizeof( UT );
588
589 int active;
590 T tc;
591 kmp_info_t * th;
592 kmp_team_t * team;
593 kmp_uint32 my_buffer_index;
594 dispatch_private_info_template< T > * pr;
595 dispatch_shared_info_template< UT > volatile * sh;
596
597 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
598 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
599
600 if ( ! TCR_4( __kmp_init_parallel ) )
601 __kmp_parallel_initialize();
602
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000603#if INCLUDE_SSC_MARKS
604 SSC_MARK_DISPATCH_INIT();
605#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000606 #ifdef KMP_DEBUG
607 {
608 const char * buff;
609 // create format specifiers before the debug output
610 buff = __kmp_str_format(
611 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
612 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
613 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
614 __kmp_str_free( &buff );
615 }
616 #endif
617 /* setup data */
618 th = __kmp_threads[ gtid ];
619 team = th -> th.th_team;
620 active = ! team -> t.t_serialized;
621 th->th.th_ident = loc;
622
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000623#if USE_ITT_BUILD
624 kmp_uint64 cur_chunk = chunk;
625#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000626 if ( ! active ) {
627 pr = reinterpret_cast< dispatch_private_info_template< T >* >
628 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
629 } else {
630 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
631 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
632
633 my_buffer_index = th->th.th_dispatch->th_disp_index ++;
634
635 /* What happens when number of threads changes, need to resize buffer? */
636 pr = reinterpret_cast< dispatch_private_info_template< T > * >
637 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
638 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
639 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
640 }
641
642 /* Pick up the nomerge/ordered bits from the scheduling type */
643 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
644 pr->nomerge = TRUE;
645 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
646 } else {
647 pr->nomerge = FALSE;
648 }
649 pr->type_size = ___kmp_size_type; // remember the size of variables
650 if ( kmp_ord_lower & schedule ) {
651 pr->ordered = TRUE;
652 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
653 } else {
654 pr->ordered = FALSE;
655 }
656 if ( schedule == kmp_sch_static ) {
657 schedule = __kmp_static;
658 } else {
659 if ( schedule == kmp_sch_runtime ) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000660 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
661 schedule = team -> t.t_sched.r_sched_type;
662 // Detail the schedule if needed (global controls are differentiated appropriately)
663 if ( schedule == kmp_sch_guided_chunked ) {
664 schedule = __kmp_guided;
665 } else if ( schedule == kmp_sch_static ) {
666 schedule = __kmp_static;
667 }
668 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
669 chunk = team -> t.t_sched.chunk;
Jim Cownie5e8470a2013-09-27 10:38:44 +0000670
671 #ifdef KMP_DEBUG
672 {
673 const char * buff;
674 // create format specifiers before the debug output
675 buff = __kmp_str_format(
676 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
677 traits_t< ST >::spec );
678 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
679 __kmp_str_free( &buff );
680 }
681 #endif
682 } else {
683 if ( schedule == kmp_sch_guided_chunked ) {
684 schedule = __kmp_guided;
685 }
686 if ( chunk <= 0 ) {
687 chunk = KMP_DEFAULT_CHUNK;
688 }
689 }
690
Jim Cownie5e8470a2013-09-27 10:38:44 +0000691 if ( schedule == kmp_sch_auto ) {
692 // mapping and differentiation: in the __kmp_do_serial_initialize()
693 schedule = __kmp_auto;
694 #ifdef KMP_DEBUG
695 {
696 const char * buff;
697 // create format specifiers before the debug output
698 buff = __kmp_str_format(
699 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
700 traits_t< ST >::spec );
701 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
702 __kmp_str_free( &buff );
703 }
704 #endif
705 }
Jim Cownie5e8470a2013-09-27 10:38:44 +0000706
707 /* guided analytical not safe for too many threads */
708 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
709 schedule = kmp_sch_guided_iterative_chunked;
710 KMP_WARNING( DispatchManyThreads );
711 }
712 pr->u.p.parm1 = chunk;
713 }
714 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
715 "unknown scheduling type" );
716
717 pr->u.p.count = 0;
718
719 if ( __kmp_env_consistency_check ) {
720 if ( st == 0 ) {
721 __kmp_error_construct(
722 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
723 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
724 );
725 }
726 }
727
728 tc = ( ub - lb + st );
729 if ( st != 1 ) {
730 if ( st < 0 ) {
731 if ( lb < ub ) {
732 tc = 0; // zero-trip
733 } else { // lb >= ub
734 tc = (ST)tc / st; // convert to signed division
735 }
736 } else { // st > 0
737 if ( ub < lb ) {
738 tc = 0; // zero-trip
739 } else { // lb >= ub
740 tc /= st;
741 }
742 }
743 } else if ( ub < lb ) { // st == 1
744 tc = 0; // zero-trip
745 }
746
747 pr->u.p.lb = lb;
748 pr->u.p.ub = ub;
749 pr->u.p.st = st;
750 pr->u.p.tc = tc;
751
752 #if KMP_OS_WINDOWS
753 pr->u.p.last_upper = ub + st;
754 #endif /* KMP_OS_WINDOWS */
755
756 /* NOTE: only the active parallel region(s) has active ordered sections */
757
758 if ( active ) {
759 if ( pr->ordered == 0 ) {
760 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
761 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
762 } else {
763 pr->ordered_bumped = 0;
764
765 pr->u.p.ordered_lower = 1;
766 pr->u.p.ordered_upper = 0;
767
768 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
769 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
770 }
771 }
772
773 if ( __kmp_env_consistency_check ) {
774 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
775 if ( push_ws ) {
776 __kmp_push_workshare( gtid, ws, loc );
777 pr->pushed_ws = ws;
778 } else {
779 __kmp_check_workshare( gtid, ws, loc );
780 pr->pushed_ws = ct_none;
781 }
782 }
783
784 switch ( schedule ) {
785 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
786 case kmp_sch_static_steal:
787 {
788 T nproc = team->t.t_nproc;
789 T ntc, init;
790
791 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
792
793 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
794 if ( nproc > 1 && ntc >= nproc ) {
795 T id = __kmp_tid_from_gtid(gtid);
796 T small_chunk, extras;
797
798 small_chunk = ntc / nproc;
799 extras = ntc % nproc;
800
801 init = id * small_chunk + ( id < extras ? id : extras );
802 pr->u.p.count = init;
803 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
804
805 pr->u.p.parm2 = lb;
806 //pr->pfields.parm3 = 0; // it's not used in static_steal
807 pr->u.p.parm4 = id;
808 pr->u.p.st = st;
809 break;
810 } else {
811 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
812 gtid ) );
813 schedule = kmp_sch_static_balanced;
814 /* too few iterations: fall-through to kmp_sch_static_balanced */
815 } // if
816 /* FALL-THROUGH to static balanced */
817 } // case
818 #endif
819 case kmp_sch_static_balanced:
820 {
821 T nproc = team->t.t_nproc;
822 T init, limit;
823
824 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
825 gtid ) );
826
827 if ( nproc > 1 ) {
828 T id = __kmp_tid_from_gtid(gtid);
829
830 if ( tc < nproc ) {
831 if ( id < tc ) {
832 init = id;
833 limit = id;
834 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
835 } else {
836 pr->u.p.count = 1; /* means no more chunks to execute */
837 pr->u.p.parm1 = FALSE;
838 break;
839 }
840 } else {
841 T small_chunk = tc / nproc;
842 T extras = tc % nproc;
843 init = id * small_chunk + (id < extras ? id : extras);
844 limit = init + small_chunk - (id < extras ? 0 : 1);
845 pr->u.p.parm1 = (id == nproc - 1);
846 }
847 } else {
848 if ( tc > 0 ) {
849 init = 0;
850 limit = tc - 1;
851 pr->u.p.parm1 = TRUE;
852 } else {
853 // zero trip count
854 pr->u.p.count = 1; /* means no more chunks to execute */
855 pr->u.p.parm1 = FALSE;
856 break;
857 }
858 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +0000859#if USE_ITT_BUILD
860 // Calculate chunk for metadata report
861 if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) {
862 cur_chunk = limit - init + 1;
863 }
864#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +0000865 if ( st == 1 ) {
866 pr->u.p.lb = lb + init;
867 pr->u.p.ub = lb + limit;
868 } else {
869 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
870 pr->u.p.lb = lb + init * st;
871 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
872 if ( st > 0 ) {
873 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
874 } else {
875 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
876 }
877 }
878 if ( pr->ordered ) {
879 pr->u.p.ordered_lower = init;
880 pr->u.p.ordered_upper = limit;
881 }
882 break;
883 } // case
884 case kmp_sch_guided_iterative_chunked :
885 {
886 T nproc = team->t.t_nproc;
887 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
888
889 if ( nproc > 1 ) {
890 if ( (2L * chunk + 1 ) * nproc >= tc ) {
891 /* chunk size too large, switch to dynamic */
892 schedule = kmp_sch_dynamic_chunked;
893 } else {
894 // when remaining iters become less than parm2 - switch to dynamic
895 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
896 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
897 }
898 } else {
899 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
900 schedule = kmp_sch_static_greedy;
901 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
902 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
903 pr->u.p.parm1 = tc;
904 } // if
905 } // case
906 break;
907 case kmp_sch_guided_analytical_chunked:
908 {
909 T nproc = team->t.t_nproc;
910 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
911
912 if ( nproc > 1 ) {
913 if ( (2L * chunk + 1 ) * nproc >= tc ) {
914 /* chunk size too large, switch to dynamic */
915 schedule = kmp_sch_dynamic_chunked;
916 } else {
917 /* commonly used term: (2 nproc - 1)/(2 nproc) */
918 DBL x;
919
920 #if KMP_OS_WINDOWS && KMP_ARCH_X86
921 /* Linux* OS already has 64-bit computation by default for
922 long double, and on Windows* OS on Intel(R) 64,
923 /Qlong_double doesn't work. On Windows* OS
924 on IA-32 architecture, we need to set precision to
925 64-bit instead of the default 53-bit. Even though long
926 double doesn't work on Windows* OS on Intel(R) 64, the
927 resulting lack of precision is not expected to impact
928 the correctness of the algorithm, but this has not been
929 mathematically proven.
930 */
931 // save original FPCW and set precision to 64-bit, as
932 // Windows* OS on IA-32 architecture defaults to 53-bit
Jim Cownie181b4bb2013-12-23 17:28:57 +0000933 unsigned int oldFpcw = _control87(0,0);
934 _control87(_PC_64,_MCW_PC); // 0,0x30000
Jim Cownie5e8470a2013-09-27 10:38:44 +0000935 #endif
936 /* value used for comparison in solver for cross-over point */
937 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
938
939 /* crossover point--chunk indexes equal to or greater than
940 this point switch to dynamic-style scheduling */
941 UT cross;
942
943 /* commonly used term: (2 nproc - 1)/(2 nproc) */
944 x = (long double)1.0 - (long double)0.5 / nproc;
945
946 #ifdef KMP_DEBUG
947 { // test natural alignment
948 struct _test_a {
949 char a;
950 union {
951 char b;
952 DBL d;
953 };
954 } t;
955 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
956 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
957 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
958 }
959 #endif // KMP_DEBUG
960
961 /* save the term in thread private dispatch structure */
962 *(DBL*)&pr->u.p.parm3 = x;
963
964 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
965 {
966 UT left, right, mid;
967 long double p;
968
969 /* estimate initial upper and lower bound */
970
971 /* doesn't matter what value right is as long as it is positive, but
972 it affects performance of the solver
973 */
974 right = 229;
975 p = __kmp_pow< UT >(x,right);
976 if ( p > target ) {
977 do{
978 p *= p;
979 right <<= 1;
980 } while(p>target && right < (1<<27));
981 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
982 } else {
983 left = 0;
984 }
985
986 /* bisection root-finding method */
987 while ( left + 1 < right ) {
988 mid = (left + right) / 2;
989 if ( __kmp_pow< UT >(x,mid) > target ) {
990 left = mid;
991 } else {
992 right = mid;
993 }
994 } // while
995 cross = right;
996 }
997 /* assert sanity of computed crossover point */
998 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
999
1000 /* save the crossover point in thread private dispatch structure */
1001 pr->u.p.parm2 = cross;
1002
1003 // C75803
1004 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1005 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1006 #else
1007 #define GUIDED_ANALYTICAL_WORKAROUND (x)
1008 #endif
1009 /* dynamic-style scheduling offset */
1010 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1011 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1012 // restore FPCW
Jim Cownie181b4bb2013-12-23 17:28:57 +00001013 _control87(oldFpcw,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001014 #endif
1015 } // if
1016 } else {
1017 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1018 gtid ) );
1019 schedule = kmp_sch_static_greedy;
1020 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1021 pr->u.p.parm1 = tc;
1022 } // if
1023 } // case
1024 break;
1025 case kmp_sch_static_greedy:
1026 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1027 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1028 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1029 tc;
1030 break;
1031 case kmp_sch_static_chunked :
1032 case kmp_sch_dynamic_chunked :
1033 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1034 break;
1035 case kmp_sch_trapezoidal :
1036 {
1037 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1038
1039 T parm1, parm2, parm3, parm4;
1040 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1041
1042 parm1 = chunk;
1043
1044 /* F : size of the first cycle */
1045 parm2 = ( tc / (2 * team->t.t_nproc) );
1046
1047 if ( parm2 < 1 ) {
1048 parm2 = 1;
1049 }
1050
1051 /* L : size of the last cycle. Make sure the last cycle
1052 * is not larger than the first cycle.
1053 */
1054 if ( parm1 < 1 ) {
1055 parm1 = 1;
1056 } else if ( parm1 > parm2 ) {
1057 parm1 = parm2;
1058 }
1059
1060 /* N : number of cycles */
1061 parm3 = ( parm2 + parm1 );
1062 parm3 = ( 2 * tc + parm3 - 1) / parm3;
1063
1064 if ( parm3 < 2 ) {
1065 parm3 = 2;
1066 }
1067
1068 /* sigma : decreasing incr of the trapezoid */
1069 parm4 = ( parm3 - 1 );
1070 parm4 = ( parm2 - parm1 ) / parm4;
1071
1072 // pointless check, because parm4 >= 0 always
1073 //if ( parm4 < 0 ) {
1074 // parm4 = 0;
1075 //}
1076
1077 pr->u.p.parm1 = parm1;
1078 pr->u.p.parm2 = parm2;
1079 pr->u.p.parm3 = parm3;
1080 pr->u.p.parm4 = parm4;
1081 } // case
1082 break;
1083
1084 default:
1085 {
1086 __kmp_msg(
1087 kmp_ms_fatal, // Severity
1088 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1089 KMP_HNT( GetNewerLibrary ), // Hint
1090 __kmp_msg_null // Variadic argument list terminator
1091 );
1092 }
1093 break;
1094 } // switch
1095 pr->schedule = schedule;
1096 if ( active ) {
1097 /* The name of this buffer should be my_buffer_index when it's free to use it */
1098
1099 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1100 gtid, my_buffer_index, sh->buffer_index) );
1101 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1102 USE_ITT_BUILD_ARG( NULL )
1103 );
1104 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1105 // *always* 32-bit integers.
1106 KMP_MB(); /* is this necessary? */
1107 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1108 gtid, my_buffer_index, sh->buffer_index) );
1109
1110 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1111 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1112#if USE_ITT_BUILD
1113 if ( pr->ordered ) {
1114 __kmp_itt_ordered_init( gtid );
1115 }; // if
1116#endif /* USE_ITT_BUILD */
1117 }; // if
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001118
1119#if USE_ITT_BUILD
1120 // Report loop metadata
1121 if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) {
1122 kmp_uint32 tid = __kmp_tid_from_gtid( gtid );
1123 if (KMP_MASTER_TID(tid)) {
1124 kmp_uint64 schedtype = 0;
1125
1126 switch ( schedule ) {
1127 case kmp_sch_static_chunked:
1128 case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1129 break;
1130 case kmp_sch_static_greedy:
1131 cur_chunk = pr->u.p.parm1;
1132 break;
1133 case kmp_sch_dynamic_chunked:
1134 schedtype = 1;
1135 break;
1136 case kmp_sch_guided_iterative_chunked:
1137 case kmp_sch_guided_analytical_chunked:
1138 schedtype = 2;
1139 break;
1140 default:
1141// Should we put this case under "static"?
1142// case kmp_sch_static_steal:
1143 schedtype = 3;
1144 break;
1145 }
1146 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1147 }
1148 }
1149#endif /* USE_ITT_BUILD */
1150
Jim Cownie5e8470a2013-09-27 10:38:44 +00001151 #ifdef KMP_DEBUG
1152 {
1153 const char * buff;
1154 // create format specifiers before the debug output
1155 buff = __kmp_str_format(
1156 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1157 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1158 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1159 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1160 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1161 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1162 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1163 KD_TRACE(10, ( buff,
1164 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1165 pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1166 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1167 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1168 __kmp_str_free( &buff );
1169 }
1170 #endif
1171 #if ( KMP_STATIC_STEAL_ENABLED )
1172 if ( ___kmp_size_type < 8 ) {
1173 // It cannot be guaranteed that after execution of a loop with some other schedule kind
1174 // all the parm3 variables will contain the same value.
1175 // Even if all parm3 will be the same, it still exists a bad case like using 0 and 1
1176 // rather than program life-time increment.
1177 // So the dedicated variable is required. The 'static_steal_counter' is used.
1178 if( schedule == kmp_sch_static_steal ) {
1179 // Other threads will inspect this variable when searching for a victim.
1180 // This is a flag showing that other threads may steal from this thread since then.
1181 volatile T * p = &pr->u.p.static_steal_counter;
1182 *p = *p + 1;
1183 }
1184 }
1185 #endif // ( KMP_STATIC_STEAL_ENABLED && USE_STEALING )
1186}
1187
1188/*
1189 * For ordered loops, either __kmp_dispatch_finish() should be called after
1190 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1191 * every chunk of iterations. If the ordered section(s) were not executed
1192 * for this iteration (or every iteration in this chunk), we need to set the
1193 * ordered iteration counters so that the next thread can proceed.
1194 */
1195template< typename UT >
1196static void
1197__kmp_dispatch_finish( int gtid, ident_t *loc )
1198{
1199 typedef typename traits_t< UT >::signed_t ST;
1200 kmp_info_t *th = __kmp_threads[ gtid ];
1201
1202 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1203 if ( ! th -> th.th_team -> t.t_serialized ) {
1204
1205 dispatch_private_info_template< UT > * pr =
1206 reinterpret_cast< dispatch_private_info_template< UT >* >
1207 ( th->th.th_dispatch->th_dispatch_pr_current );
1208 dispatch_shared_info_template< UT > volatile * sh =
1209 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1210 ( th->th.th_dispatch->th_dispatch_sh_current );
1211 KMP_DEBUG_ASSERT( pr );
1212 KMP_DEBUG_ASSERT( sh );
1213 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1214 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1215
1216 if ( pr->ordered_bumped ) {
1217 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1218 gtid ) );
1219 pr->ordered_bumped = 0;
1220 } else {
1221 UT lower = pr->u.p.ordered_lower;
1222
1223 #ifdef KMP_DEBUG
1224 {
1225 const char * buff;
1226 // create format specifiers before the debug output
1227 buff = __kmp_str_format(
1228 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1229 traits_t< UT >::spec, traits_t< UT >::spec );
1230 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1231 __kmp_str_free( &buff );
1232 }
1233 #endif
1234
1235 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1236 USE_ITT_BUILD_ARG(NULL)
1237 );
1238 KMP_MB(); /* is this necessary? */
1239 #ifdef KMP_DEBUG
1240 {
1241 const char * buff;
1242 // create format specifiers before the debug output
1243 buff = __kmp_str_format(
1244 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1245 traits_t< UT >::spec, traits_t< UT >::spec );
1246 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1247 __kmp_str_free( &buff );
1248 }
1249 #endif
1250
1251 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1252 } // if
1253 } // if
1254 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1255}
1256
1257#ifdef KMP_GOMP_COMPAT
1258
1259template< typename UT >
1260static void
1261__kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1262{
1263 typedef typename traits_t< UT >::signed_t ST;
1264 kmp_info_t *th = __kmp_threads[ gtid ];
1265
1266 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1267 if ( ! th -> th.th_team -> t.t_serialized ) {
1268// int cid;
1269 dispatch_private_info_template< UT > * pr =
1270 reinterpret_cast< dispatch_private_info_template< UT >* >
1271 ( th->th.th_dispatch->th_dispatch_pr_current );
1272 dispatch_shared_info_template< UT > volatile * sh =
1273 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1274 ( th->th.th_dispatch->th_dispatch_sh_current );
1275 KMP_DEBUG_ASSERT( pr );
1276 KMP_DEBUG_ASSERT( sh );
1277 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1278 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1279
1280// for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1281 UT lower = pr->u.p.ordered_lower;
1282 UT upper = pr->u.p.ordered_upper;
1283 UT inc = upper - lower + 1;
1284
1285 if ( pr->ordered_bumped == inc ) {
1286 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1287 gtid ) );
1288 pr->ordered_bumped = 0;
1289 } else {
1290 inc -= pr->ordered_bumped;
1291
1292 #ifdef KMP_DEBUG
1293 {
1294 const char * buff;
1295 // create format specifiers before the debug output
1296 buff = __kmp_str_format(
1297 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1298 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1299 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1300 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1301 __kmp_str_free( &buff );
1302 }
1303 #endif
1304
1305 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1306 USE_ITT_BUILD_ARG(NULL)
1307 );
1308
1309 KMP_MB(); /* is this necessary? */
1310 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1311 gtid ) );
1312 pr->ordered_bumped = 0;
1313//!!!!! TODO check if the inc should be unsigned, or signed???
1314 #ifdef KMP_DEBUG
1315 {
1316 const char * buff;
1317 // create format specifiers before the debug output
1318 buff = __kmp_str_format(
1319 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1320 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1321 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1322 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1323 __kmp_str_free( &buff );
1324 }
1325 #endif
1326
1327 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1328 }
1329// }
1330 }
1331 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1332}
1333
1334#endif /* KMP_GOMP_COMPAT */
1335
1336template< typename T >
1337static int
1338__kmp_dispatch_next(
1339 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1340) {
1341
1342 typedef typename traits_t< T >::unsigned_t UT;
1343 typedef typename traits_t< T >::signed_t ST;
1344 typedef typename traits_t< T >::floating_t DBL;
1345 static const int ___kmp_size_type = sizeof( UT );
1346
1347 int status;
1348 dispatch_private_info_template< T > * pr;
1349 kmp_info_t * th = __kmp_threads[ gtid ];
1350 kmp_team_t * team = th -> th.th_team;
1351
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001352 KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st ); // AC: these cannot be NULL
Jim Cownie5e8470a2013-09-27 10:38:44 +00001353 #ifdef KMP_DEBUG
1354 {
1355 const char * buff;
1356 // create format specifiers before the debug output
1357 buff = __kmp_str_format(
1358 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1359 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1360 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1361 __kmp_str_free( &buff );
1362 }
1363 #endif
1364
1365 if ( team -> t.t_serialized ) {
1366 /* NOTE: serialize this dispatch becase we are not at the active level */
1367 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1368 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1369 KMP_DEBUG_ASSERT( pr );
1370
1371 if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1372 *p_lb = 0;
1373 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001374// if ( p_last != NULL )
1375// *p_last = 0;
1376 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001377 *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001378 if ( __kmp_env_consistency_check ) {
1379 if ( pr->pushed_ws != ct_none ) {
1380 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1381 }
1382 }
1383 } else if ( pr->nomerge ) {
1384 kmp_int32 last;
1385 T start;
1386 UT limit, trip, init;
1387 ST incr;
1388 T chunk = pr->u.p.parm1;
1389
1390 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1391
1392 init = chunk * pr->u.p.count++;
1393 trip = pr->u.p.tc - 1;
1394
1395 if ( (status = (init <= trip)) == 0 ) {
1396 *p_lb = 0;
1397 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001398// if ( p_last != NULL )
1399// *p_last = 0;
1400 if ( p_st != NULL )
1401 *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001402 if ( __kmp_env_consistency_check ) {
1403 if ( pr->pushed_ws != ct_none ) {
1404 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1405 }
1406 }
1407 } else {
1408 start = pr->u.p.lb;
1409 limit = chunk + init - 1;
1410 incr = pr->u.p.st;
1411
1412 if ( (last = (limit >= trip)) != 0 ) {
1413 limit = trip;
1414 #if KMP_OS_WINDOWS
1415 pr->u.p.last_upper = pr->u.p.ub;
1416 #endif /* KMP_OS_WINDOWS */
1417 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001418 if ( p_last != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001419 *p_last = last;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001420 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001421 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001422 if ( incr == 1 ) {
1423 *p_lb = start + init;
1424 *p_ub = start + limit;
1425 } else {
1426 *p_lb = start + init * incr;
1427 *p_ub = start + limit * incr;
1428 }
1429
1430 if ( pr->ordered ) {
1431 pr->u.p.ordered_lower = init;
1432 pr->u.p.ordered_upper = limit;
1433 #ifdef KMP_DEBUG
1434 {
1435 const char * buff;
1436 // create format specifiers before the debug output
1437 buff = __kmp_str_format(
1438 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1439 traits_t< UT >::spec, traits_t< UT >::spec );
1440 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1441 __kmp_str_free( &buff );
1442 }
1443 #endif
1444 } // if
1445 } // if
1446 } else {
1447 pr->u.p.tc = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001448 *p_lb = pr->u.p.lb;
1449 *p_ub = pr->u.p.ub;
1450 #if KMP_OS_WINDOWS
1451 pr->u.p.last_upper = *p_ub;
1452 #endif /* KMP_OS_WINDOWS */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001453 if ( p_last != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001454 *p_last = TRUE;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001455 if ( p_st != NULL )
1456 *p_st = pr->u.p.st;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001457 } // if
1458 #ifdef KMP_DEBUG
1459 {
1460 const char * buff;
1461 // create format specifiers before the debug output
1462 buff = __kmp_str_format(
1463 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001464 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
Jim Cownie5e8470a2013-09-27 10:38:44 +00001465 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001466 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
Jim Cownie5e8470a2013-09-27 10:38:44 +00001467 __kmp_str_free( &buff );
1468 }
1469 #endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001470#if INCLUDE_SSC_MARKS
1471 SSC_MARK_DISPATCH_NEXT();
1472#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +00001473 return status;
1474 } else {
1475 kmp_int32 last = 0;
1476 dispatch_shared_info_template< UT > *sh;
1477 T start;
1478 ST incr;
1479 UT limit, trip, init;
1480
1481 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1482 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1483
1484 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1485 ( th->th.th_dispatch->th_dispatch_pr_current );
1486 KMP_DEBUG_ASSERT( pr );
1487 sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1488 ( th->th.th_dispatch->th_dispatch_sh_current );
1489 KMP_DEBUG_ASSERT( sh );
1490
1491 if ( pr->u.p.tc == 0 ) {
1492 // zero trip count
1493 status = 0;
1494 } else {
1495 switch (pr->schedule) {
1496 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1497 case kmp_sch_static_steal:
1498 {
1499 T chunk = pr->u.p.parm1;
1500
1501 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1502
1503 trip = pr->u.p.tc - 1;
1504
1505 if ( ___kmp_size_type > 4 ) {
1506 // Other threads do not look into the data of this thread,
1507 // so it's not necessary to make volatile casting.
1508 init = ( pr->u.p.count )++;
1509 status = ( init < (UT)pr->u.p.ub );
1510 } else {
1511 typedef union {
1512 struct {
1513 UT count;
1514 T ub;
1515 } p;
1516 kmp_int64 b;
1517 } union_i4;
1518 // All operations on 'count' or 'ub' must be combined atomically together.
1519 // stealing implemented only for 4-byte indexes
1520 {
1521 union_i4 vold, vnew;
1522 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1523 vnew = vold;
1524 vnew.p.count++;
1525 while( ! KMP_COMPARE_AND_STORE_ACQ64(
1526 ( volatile kmp_int64* )&pr->u.p.count,
1527 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1528 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1529 KMP_CPU_PAUSE();
1530 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1531 vnew = vold;
1532 vnew.p.count++;
1533 }
1534 vnew = vold;
1535 init = vnew.p.count;
1536 status = ( init < (UT)vnew.p.ub ) ;
1537 }
1538
1539 if( !status ) {
1540 kmp_info_t **other_threads = team->t.t_threads;
1541 int while_limit = 10;
1542 int while_index = 0;
1543
1544 // TODO: algorithm of searching for a victim
1545 // should be cleaned up and measured
1546 while ( ( !status ) && ( while_limit != ++while_index ) ) {
1547 union_i4 vold, vnew;
1548 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1549 T victimIdx = pr->u.p.parm4;
1550 T oldVictimIdx = victimIdx;
1551 dispatch_private_info_template< T > * victim;
1552
1553 do {
1554 if( !victimIdx ) {
1555 victimIdx = team->t.t_nproc - 1;
1556 } else {
1557 --victimIdx;
1558 }
1559 victim = reinterpret_cast< dispatch_private_info_template< T >* >
1560 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1561 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1562 // TODO: think about a proper place of this test
1563 if ( ( !victim ) ||
1564 ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1565 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1566 // TODO: delay would be nice
1567 continue;
1568 // the victim is not ready yet to participate in stealing
1569 // because the victim is still in kmp_init_dispatch
1570 }
1571 if ( oldVictimIdx == victimIdx ) {
1572 break;
1573 }
1574 pr->u.p.parm4 = victimIdx;
1575
1576 while( 1 ) {
1577 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1578 vnew = vold;
1579
1580 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1581 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1582 break;
1583 }
1584 vnew.p.ub -= (remaining >> 2);
1585 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1586 #pragma warning( push )
1587 // disable warning on pointless comparison of unsigned with 0
1588 #pragma warning( disable: 186 )
1589 KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1590 #pragma warning( pop )
1591 // TODO: Should this be acquire or release?
1592 if ( KMP_COMPARE_AND_STORE_ACQ64(
1593 ( volatile kmp_int64 * )&victim->u.p.count,
1594 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1595 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1596 status = 1;
1597 while_index = 0;
1598 // now update own count and ub
1599 #if KMP_ARCH_X86
1600 // stealing executed on non-KMP_ARCH_X86 only
1601 // Atomic 64-bit write on ia32 is
1602 // unavailable, so we do this in steps.
1603 // This code is not tested.
1604 init = vold.p.count;
1605 pr->u.p.ub = 0;
1606 pr->u.p.count = init + 1;
1607 pr->u.p.ub = vnew.p.count;
1608 #else
1609 init = vnew.p.ub;
1610 vold.p.count = init + 1;
1611 // TODO: is it safe and enough?
1612 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1613 #endif // KMP_ARCH_X86
1614 break;
1615 } // if
1616 KMP_CPU_PAUSE();
1617 } // while (1)
1618 } // while
1619 } // if
1620 } // if
1621 if ( !status ) {
1622 *p_lb = 0;
1623 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001624 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001625 } else {
1626 start = pr->u.p.parm2;
1627 init *= chunk;
1628 limit = chunk + init - 1;
1629 incr = pr->u.p.st;
1630
1631 KMP_DEBUG_ASSERT(init <= trip);
1632 if ( (last = (limit >= trip)) != 0 )
1633 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001634 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001635
1636 if ( incr == 1 ) {
1637 *p_lb = start + init;
1638 *p_ub = start + limit;
1639 } else {
1640 *p_lb = start + init * incr;
1641 *p_ub = start + limit * incr;
1642 }
1643
1644 if ( pr->ordered ) {
1645 pr->u.p.ordered_lower = init;
1646 pr->u.p.ordered_upper = limit;
1647 #ifdef KMP_DEBUG
1648 {
1649 const char * buff;
1650 // create format specifiers before the debug output
1651 buff = __kmp_str_format(
1652 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1653 traits_t< UT >::spec, traits_t< UT >::spec );
1654 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1655 __kmp_str_free( &buff );
1656 }
1657 #endif
1658 } // if
1659 } // if
1660 break;
1661 } // case
1662 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1663 case kmp_sch_static_balanced:
1664 {
1665 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1666 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1667 pr->u.p.count = 1;
1668 *p_lb = pr->u.p.lb;
1669 *p_ub = pr->u.p.ub;
1670 last = pr->u.p.parm1;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001671 if ( p_st != NULL )
Jim Cownie5e8470a2013-09-27 10:38:44 +00001672 *p_st = pr->u.p.st;
1673 } else { /* no iterations to do */
1674 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1675 }
1676 if ( pr->ordered ) {
1677 #ifdef KMP_DEBUG
1678 {
1679 const char * buff;
1680 // create format specifiers before the debug output
1681 buff = __kmp_str_format(
1682 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1683 traits_t< UT >::spec, traits_t< UT >::spec );
1684 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1685 __kmp_str_free( &buff );
1686 }
1687 #endif
1688 } // if
1689 } // case
1690 break;
1691 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1692 case kmp_sch_static_chunked:
1693 {
1694 T parm1;
1695
1696 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[affinity|chunked] case\n",
1697 gtid ) );
1698 parm1 = pr->u.p.parm1;
1699
1700 trip = pr->u.p.tc - 1;
1701 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
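                // Worked example (illustrative values): with nproc = 4 and chunk parm1 = 10,
                // thread 2 first computes init = 10 * (0 + 2) = 20 and takes iterations 20..29;
                // after count += nproc its next call yields 60..69, i.e. chunks are assigned
                // round-robin with a stride of nproc * chunk iterations.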
1702
1703 if ( (status = (init <= trip)) != 0 ) {
1704 start = pr->u.p.lb;
1705 incr = pr->u.p.st;
1706 limit = parm1 + init - 1;
1707
1708 if ( (last = (limit >= trip)) != 0 )
1709 limit = trip;
1710
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001711 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001712
1713 pr->u.p.count += team->t.t_nproc;
1714
1715 if ( incr == 1 ) {
1716 *p_lb = start + init;
1717 *p_ub = start + limit;
1718 }
1719 else {
1720 *p_lb = start + init * incr;
1721 *p_ub = start + limit * incr;
1722 }
1723
1724 if ( pr->ordered ) {
1725 pr->u.p.ordered_lower = init;
1726 pr->u.p.ordered_upper = limit;
1727 #ifdef KMP_DEBUG
1728 {
1729 const char * buff;
1730 // create format specifiers before the debug output
1731 buff = __kmp_str_format(
1732 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1733 traits_t< UT >::spec, traits_t< UT >::spec );
1734 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1735 __kmp_str_free( &buff );
1736 }
1737 #endif
1738 } // if
1739 } // if
1740 } // case
1741 break;
1742
1743 case kmp_sch_dynamic_chunked:
1744 {
1745 T chunk = pr->u.p.parm1;
1746
1747 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1748 gtid ) );
1749
1750 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1751 trip = pr->u.p.tc - 1;
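            // Illustrative note: sh->u.s.iteration is a shared chunk counter, so each
            // fetch-and-increment claims the next chunk; e.g. with chunk = 4, successive
            // claims (by whichever threads get there first) cover 0..3, 4..7, 8..11, ...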
1752
1753 if ( (status = (init <= trip)) == 0 ) {
1754 *p_lb = 0;
1755 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001756 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001757 } else {
1758 start = pr->u.p.lb;
1759 limit = chunk + init - 1;
1760 incr = pr->u.p.st;
1761
1762 if ( (last = (limit >= trip)) != 0 )
1763 limit = trip;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001764
1765 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001766
1767 if ( incr == 1 ) {
1768 *p_lb = start + init;
1769 *p_ub = start + limit;
1770 } else {
1771 *p_lb = start + init * incr;
1772 *p_ub = start + limit * incr;
1773 }
1774
1775 if ( pr->ordered ) {
1776 pr->u.p.ordered_lower = init;
1777 pr->u.p.ordered_upper = limit;
1778 #ifdef KMP_DEBUG
1779 {
1780 const char * buff;
1781 // create format specifiers before the debug output
1782 buff = __kmp_str_format(
1783 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1784 traits_t< UT >::spec, traits_t< UT >::spec );
1785 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1786 __kmp_str_free( &buff );
1787 }
1788 #endif
1789 } // if
1790 } // if
1791 } // case
1792 break;
1793
1794 case kmp_sch_guided_iterative_chunked:
1795 {
1796 T chunkspec = pr->u.p.parm1;
1797 KD_TRACE(100,
1798 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1799 trip = pr->u.p.tc;
1800 // Start atomic part of calculations
1801 while(1) {
1802 ST remaining; // signed, because can be < 0
1803 init = sh->u.s.iteration; // shared value
1804 remaining = trip - init;
1805 if ( remaining <= 0 ) { // AC: need to compare with 0 first
1806 // nothing to do, don't try atomic op
1807 status = 0;
1808 break;
1809 }
1810 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1811                    // use dynamic-style schedule
1812                    // atomically increment iterations, get old value
1813 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1814 remaining = trip - init;
1815 if (remaining <= 0) {
1816 status = 0; // all iterations got by other threads
1817 } else {
1818 // got some iterations to work on
1819 status = 1;
1820 if ( (T)remaining > chunkspec ) {
1821 limit = init + chunkspec - 1;
1822 } else {
1823 last = 1; // the last chunk
1824 limit = init + remaining - 1;
1825 } // if
1826 } // if
1827 break;
1828 } // if
1829 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
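                // Illustrative note (hypothetical values): parm3 stores roughly 1/(K*nproc),
                // so with nproc = 4 and the default K = 2, a thread seeing remaining = 800
                // attempts to claim about 800/8 = 100 iterations via the compare_and_swap below.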
1830 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1831 // CAS was successful, chunk obtained
1832 status = 1;
1833 --limit;
1834 break;
1835 } // if
1836 } // while
1837 if ( status != 0 ) {
1838 start = pr->u.p.lb;
1839 incr = pr->u.p.st;
1840 if ( p_st != NULL )
1841 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001842 *p_lb = start + init * incr;
1843 *p_ub = start + limit * incr;
1844 if ( pr->ordered ) {
1845 pr->u.p.ordered_lower = init;
1846 pr->u.p.ordered_upper = limit;
1847 #ifdef KMP_DEBUG
1848 {
1849 const char * buff;
1850 // create format specifiers before the debug output
1851 buff = __kmp_str_format(
1852 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1853 traits_t< UT >::spec, traits_t< UT >::spec );
1854 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1855 __kmp_str_free( &buff );
1856 }
1857 #endif
1858 } // if
1859 } else {
1860 *p_lb = 0;
1861 *p_ub = 0;
1862 if ( p_st != NULL )
1863 *p_st = 0;
1864 } // if
1865 } // case
1866 break;
1867
1868 case kmp_sch_guided_analytical_chunked:
1869 {
1870 T chunkspec = pr->u.p.parm1;
1871 UT chunkIdx;
1872 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1873 /* for storing original FPCW value for Windows* OS on
1874 IA-32 architecture 8-byte version */
1875 unsigned int oldFpcw;
Jim Cownie181b4bb2013-12-23 17:28:57 +00001876 unsigned int fpcwSet = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001877 #endif
1878 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1879 gtid ) );
1880
1881 trip = pr->u.p.tc;
1882
1883 KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1884 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1885
1886 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1887 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1888 if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1889 --trip;
1890 /* use dynamic-style scheduling */
1891 init = chunkIdx * chunkspec + pr->u.p.count;
1892 /* need to verify init > 0 in case of overflow in the above calculation */
1893 if ( (status = (init > 0 && init <= trip)) != 0 ) {
1894 limit = init + chunkspec -1;
1895
1896 if ( (last = (limit >= trip)) != 0 )
1897 limit = trip;
1898 }
1899 break;
1900 } else {
1901 /* use exponential-style scheduling */
1902                    /* The following check is to work around the lack of long double precision on Windows* OS.
1903 This check works around the possible effect that init != 0 for chunkIdx == 0.
1904 */
1905 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1906 /* If we haven't already done so, save original
1907 FPCW and set precision to 64-bit, as Windows* OS
1908 on IA-32 architecture defaults to 53-bit */
1909 if ( !fpcwSet ) {
Jim Cownie181b4bb2013-12-23 17:28:57 +00001910 oldFpcw = _control87(0,0);
1911 _control87(_PC_64,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001912 fpcwSet = 0x30000;
1913 }
1914 #endif
1915 if ( chunkIdx ) {
1916 init = __kmp_dispatch_guided_remaining< T >(
1917 trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1918 KMP_DEBUG_ASSERT(init);
1919 init = trip - init;
1920 } else
1921 init = 0;
1922 limit = trip - __kmp_dispatch_guided_remaining< T >(
1923 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
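                        // Descriptive note: writing remaining(k) for the iterations still left
                        // when chunk k starts, chunk chunkIdx spans
                        // [trip - remaining(chunkIdx), trip - remaining(chunkIdx+1) - 1];
                        // init is the first endpoint and limit (after --limit below) the second.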
1924 KMP_ASSERT(init <= limit);
1925 if ( init < limit ) {
1926 KMP_DEBUG_ASSERT(limit <= trip);
1927 --limit;
1928 status = 1;
1929 break;
1930 } // if
1931 } // if
1932 } // while (1)
1933 #if KMP_OS_WINDOWS && KMP_ARCH_X86
Jim Cownie181b4bb2013-12-23 17:28:57 +00001934 /* restore FPCW if necessary
1935 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1936 */
1937 if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1938 _control87(oldFpcw,_MCW_PC);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001939 #endif
1940 if ( status != 0 ) {
1941 start = pr->u.p.lb;
1942 incr = pr->u.p.st;
1943 if ( p_st != NULL )
1944 *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001945 *p_lb = start + init * incr;
1946 *p_ub = start + limit * incr;
1947 if ( pr->ordered ) {
1948 pr->u.p.ordered_lower = init;
1949 pr->u.p.ordered_upper = limit;
1950 #ifdef KMP_DEBUG
1951 {
1952 const char * buff;
1953 // create format specifiers before the debug output
1954 buff = __kmp_str_format(
1955 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1956 traits_t< UT >::spec, traits_t< UT >::spec );
1957 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1958 __kmp_str_free( &buff );
1959 }
1960 #endif
1961 }
1962 } else {
1963 *p_lb = 0;
1964 *p_ub = 0;
1965 if ( p_st != NULL )
1966 *p_st = 0;
1967 }
1968 } // case
1969 break;
1970
1971 case kmp_sch_trapezoidal:
1972 {
1973 UT index;
1974 T parm2 = pr->u.p.parm2;
1975 T parm3 = pr->u.p.parm3;
1976 T parm4 = pr->u.p.parm4;
1977 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
1978 gtid ) );
1979
1980 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
1981
1982 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
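            // Illustrative note: chunk sizes form a decreasing arithmetic series starting at
            // parm2 and shrinking by parm4 per chunk, so the start of chunk 'index' is the
            // partial sum index*parm2 - parm4*index*(index-1)/2, which the expression above
            // computes in its folded form.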
1983 trip = pr->u.p.tc - 1;
1984
1985 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
1986 *p_lb = 0;
1987 *p_ub = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001988 if ( p_st != NULL ) *p_st = 0;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001989 } else {
1990 start = pr->u.p.lb;
1991 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
1992 incr = pr->u.p.st;
1993
1994 if ( (last = (limit >= trip)) != 0 )
1995 limit = trip;
1996
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001997 if ( p_st != NULL ) *p_st = incr;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001998
1999 if ( incr == 1 ) {
2000 *p_lb = start + init;
2001 *p_ub = start + limit;
2002 } else {
2003 *p_lb = start + init * incr;
2004 *p_ub = start + limit * incr;
2005 }
2006
2007 if ( pr->ordered ) {
2008 pr->u.p.ordered_lower = init;
2009 pr->u.p.ordered_upper = limit;
2010 #ifdef KMP_DEBUG
2011 {
2012 const char * buff;
2013 // create format specifiers before the debug output
2014 buff = __kmp_str_format(
2015 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2016 traits_t< UT >::spec, traits_t< UT >::spec );
2017 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2018 __kmp_str_free( &buff );
2019 }
2020 #endif
2021 } // if
2022 } // if
2023 } // case
2024 break;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002025 default:
2026 {
2027 status = 0; // to avoid complaints on uninitialized variable use
2028 __kmp_msg(
2029 kmp_ms_fatal, // Severity
2030 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2031 KMP_HNT( GetNewerLibrary ), // Hint
2032 __kmp_msg_null // Variadic argument list terminator
2033 );
2034 }
2035 break;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002036 } // switch
2037 } // if tc == 0;
2038
2039 if ( status == 0 ) {
2040 UT num_done;
2041
2042 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2043 #ifdef KMP_DEBUG
2044 {
2045 const char * buff;
2046 // create format specifiers before the debug output
2047 buff = __kmp_str_format(
2048 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2049 traits_t< UT >::spec );
2050 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2051 __kmp_str_free( &buff );
2052 }
2053 #endif
2054
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002055 if ( (ST)num_done == team->t.t_nproc-1 ) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00002056 /* NOTE: release this buffer to be reused */
2057
2058 KMP_MB(); /* Flush all pending memory write invalidates. */
2059
2060 sh->u.s.num_done = 0;
2061 sh->u.s.iteration = 0;
2062
2063 /* TODO replace with general release procedure? */
2064 if ( pr->ordered ) {
2065 sh->u.s.ordered_iteration = 0;
2066 }
2067
2068 KMP_MB(); /* Flush all pending memory write invalidates. */
2069
2070 sh -> buffer_index += KMP_MAX_DISP_BUF;
2071 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2072 gtid, sh->buffer_index) );
2073
2074 KMP_MB(); /* Flush all pending memory write invalidates. */
2075
2076 } // if
2077 if ( __kmp_env_consistency_check ) {
2078 if ( pr->pushed_ws != ct_none ) {
2079 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2080 }
2081 }
2082
2083 th -> th.th_dispatch -> th_deo_fcn = NULL;
2084 th -> th.th_dispatch -> th_dxo_fcn = NULL;
2085 th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2086 th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2087 } // if (status == 0)
2088#if KMP_OS_WINDOWS
2089 else if ( last ) {
2090 pr->u.p.last_upper = pr->u.p.ub;
2091 }
2092#endif /* KMP_OS_WINDOWS */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002093 if ( p_last != NULL && status != 0 )
2094 *p_last = last;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002095 } // if
2096
2097 #ifdef KMP_DEBUG
2098 {
2099 const char * buff;
2100 // create format specifiers before the debug output
2101 buff = __kmp_str_format(
2102 "__kmp_dispatch_next: T#%%d normal case: " \
2103 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2104 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2105 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2106 __kmp_str_free( &buff );
2107 }
2108 #endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002109#if INCLUDE_SSC_MARKS
2110 SSC_MARK_DISPATCH_NEXT();
2111#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +00002112 return status;
2113}
2114
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002115template< typename T >
2116static void
2117__kmp_dist_get_bounds(
2118 ident_t *loc,
2119 kmp_int32 gtid,
2120 kmp_int32 *plastiter,
2121 T *plower,
2122 T *pupper,
2123 typename traits_t< T >::signed_t incr
2124) {
2125 KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
2126 typedef typename traits_t< T >::unsigned_t UT;
2127 typedef typename traits_t< T >::signed_t ST;
2128 register kmp_uint32 team_id;
2129 register kmp_uint32 nteams;
2130 register UT trip_count;
2131 register kmp_team_t *team;
2132 kmp_info_t * th;
2133
2134 KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2135 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2136 #ifdef KMP_DEBUG
2137 {
2138 const char * buff;
2139 // create format specifiers before the debug output
2140 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2141 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2142 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2143 traits_t< T >::spec );
2144 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2145 __kmp_str_free( &buff );
2146 }
2147 #endif
2148
2149 if( __kmp_env_consistency_check ) {
2150 if( incr == 0 ) {
2151 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2152 }
2153 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2154 // The loop is illegal.
2155 // Some zero-trip loops maintained by compiler, e.g.:
2156 // for(i=10;i<0;++i) // lower >= upper - run-time check
2157 // for(i=0;i>10;--i) // lower <= upper - run-time check
2158 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2159 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2160 // Compiler does not check the following illegal loops:
2161 // for(i=0;i<10;i+=incr) // where incr<0
2162 // for(i=10;i>0;i-=incr) // where incr<0
2163 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2164 }
2165 }
2166 th = __kmp_threads[gtid];
2167 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2168 team = th->th.th_team;
2169 #if OMP_40_ENABLED
2170 nteams = th->th.th_teams_size.nteams;
2171 #endif
2172 team_id = team->t.t_master_tid;
2173 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2174
2175 // compute global trip count
2176 if( incr == 1 ) {
2177 trip_count = *pupper - *plower + 1;
2178 } else if(incr == -1) {
2179 trip_count = *plower - *pupper + 1;
2180 } else {
2181 trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2182 }
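    // Worked example (illustrative): for *plower = 0, *pupper = 9, incr = 2 the loop visits
    // 0, 2, 4, 6, 8, and the general branch above gives trip_count = (9 - 0) / 2 + 1 = 5.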
2183 if( trip_count <= nteams ) {
2184 KMP_DEBUG_ASSERT(
2185 __kmp_static == kmp_sch_static_greedy || \
2186 __kmp_static == kmp_sch_static_balanced
2187 ); // Unknown static scheduling type.
2188 // only some teams get single iteration, others get nothing
2189        // only some teams get a single iteration, others get nothing
2190 *pupper = *plower = *plower + team_id * incr;
2191 } else {
2192 *plower = *pupper + incr; // zero-trip loop
2193 }
2194 if( plastiter != NULL )
2195 *plastiter = ( team_id == trip_count - 1 );
2196 } else {
2197 if( __kmp_static == kmp_sch_static_balanced ) {
2198 register UT chunk = trip_count / nteams;
2199 register UT extras = trip_count % nteams;
2200 *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2201 *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
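            // Worked example (illustrative): trip_count = 10, nteams = 4 gives chunk = 2 and
            // extras = 2, so teams 0 and 1 each cover 3 iterations while teams 2 and 3 cover
            // 2 apiece, accounting for all 10 iterations.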
2202 if( plastiter != NULL )
2203 *plastiter = ( team_id == nteams - 1 );
2204 } else {
2205 register T chunk_inc_count =
2206 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2207 register T upper = *pupper;
2208 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2209 // Unknown static scheduling type.
2210 *plower += team_id * chunk_inc_count;
2211 *pupper = *plower + chunk_inc_count - incr;
2212 // Check/correct bounds if needed
2213 if( incr > 0 ) {
2214 if( *pupper < *plower )
2215 *pupper = i_maxmin< T >::mx;
2216 if( plastiter != NULL )
2217 *plastiter = *plower <= upper && *pupper > upper - incr;
2218 if( *pupper > upper )
2219 *pupper = upper; // tracker C73258
2220 } else {
2221 if( *pupper > *plower )
2222 *pupper = i_maxmin< T >::mn;
2223 if( plastiter != NULL )
2224 *plastiter = *plower >= upper && *pupper < upper - incr;
2225 if( *pupper < upper )
2226 *pupper = upper; // tracker C73258
2227 }
2228 }
2229 }
2230}
2231
Jim Cownie5e8470a2013-09-27 10:38:44 +00002232//-----------------------------------------------------------------------------------------
2233// Dispatch routines
2234// Transfer call to template< typename T >
2235// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2236// T lb, T ub, ST st, ST chunk )
2237extern "C" {
2238
2239/*!
2240@ingroup WORK_SHARING
2241@{
2242@param loc Source location
2243@param gtid Global thread id
2244@param schedule Schedule type
2245@param lb Lower bound
2246@param ub Upper bound
2247@param st Step (or increment if you prefer)
2248@param chunk The chunk size to block with
2249
2250This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2251These functions are all identical apart from the types of the arguments.
2252*/
2253
2254void
2255__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2256 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2257{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002258 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002259 KMP_DEBUG_ASSERT( __kmp_init_serial );
2260 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2261}
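/*
   Rough usage sketch (illustrative only, not taken from any compiler's output): how a
   dynamically scheduled loop over 0..n-1 with chunk 4 could be lowered onto these entry
   points; 'loc', 'gtid' and 'n' are assumed to be supplied by the surrounding code.

       kmp_int32 lb, ub, st, last;
       __kmpc_dispatch_init_4( loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, 4 );
       while ( __kmpc_dispatch_next_4( loc, gtid, &last, &lb, &ub, &st ) ) {
           for ( kmp_int32 i = lb; i <= ub; i += st ) {
               ; // loop body
           }
       }
*/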
2262/*!
2263See @ref __kmpc_dispatch_init_4
2264*/
2265void
2266__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2267 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2268{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002269 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002270 KMP_DEBUG_ASSERT( __kmp_init_serial );
2271 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2272}
2273
2274/*!
2275See @ref __kmpc_dispatch_init_4
2276*/
2277void
2278__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2279 kmp_int64 lb, kmp_int64 ub,
2280 kmp_int64 st, kmp_int64 chunk )
2281{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002282 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002283 KMP_DEBUG_ASSERT( __kmp_init_serial );
2284 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2285}
2286
2287/*!
2288See @ref __kmpc_dispatch_init_4
2289*/
2290void
2291__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2292 kmp_uint64 lb, kmp_uint64 ub,
2293 kmp_int64 st, kmp_int64 chunk )
2294{
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002295 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002296 KMP_DEBUG_ASSERT( __kmp_init_serial );
2297 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2298}
2299
2300/*!
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002301See @ref __kmpc_dispatch_init_4
2302
2303These functions differ from the __kmpc_dispatch_init set in that they are
2304called for the composite distribute parallel for construct. Thus, before
2305dispatching the regular iterations, the per-team iteration space must be computed.
2306
2307These functions are all identical apart from the types of the arguments.
2308*/
2309void
2310__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2311 kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2312{
2313 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2314 KMP_DEBUG_ASSERT( __kmp_init_serial );
2315 __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2316 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2317}
2318
2319void
2320__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2321 kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2322{
2323 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2324 KMP_DEBUG_ASSERT( __kmp_init_serial );
2325 __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2326 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2327}
2328
2329void
2330__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2331 kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2332{
2333 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2334 KMP_DEBUG_ASSERT( __kmp_init_serial );
2335 __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2336 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2337}
2338
2339void
2340__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2341 kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2342{
2343 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2344 KMP_DEBUG_ASSERT( __kmp_init_serial );
2345 __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2346 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2347}
2348
2349/*!
Jim Cownie5e8470a2013-09-27 10:38:44 +00002350@param loc Source code location
2351@param gtid Global thread id
2352@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2353@param p_lb Pointer to the lower bound for the next chunk of work
2354@param p_ub Pointer to the upper bound for the next chunk of work
2355@param p_st Pointer to the stride for the next chunk of work
2356@return one if there is work to be done, zero otherwise
2357
2358Get the next dynamically allocated chunk of work for this thread.
2359If there is no more work, then the lb, ub and stride need not be modified.
2360*/
2361int
2362__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2363 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2364{
2365 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2366}
2367
2368/*!
2369See @ref __kmpc_dispatch_next_4
2370*/
2371int
2372__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2373 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2374{
2375 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2376}
2377
2378/*!
2379See @ref __kmpc_dispatch_next_4
2380*/
2381int
2382__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2383 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2384{
2385 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2386}
2387
2388/*!
2389See @ref __kmpc_dispatch_next_4
2390*/
2391int
2392__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2393 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2394{
2395 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2396}
2397
2398/*!
2399@param loc Source code location
2400@param gtid Global thread id
2401
2402Mark the end of a dynamic loop.
2403*/
2404void
2405__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2406{
2407 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2408}
2409
2410/*!
2411See @ref __kmpc_dispatch_fini_4
2412*/
2413void
2414__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2415{
2416 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2417}
2418
2419/*!
2420See @ref __kmpc_dispatch_fini_4
2421*/
2422void
2423__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2424{
2425 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2426}
2427
2428/*!
2429See @ref __kmpc_dispatch_fini_4
2430*/
2431void
2432__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2433{
2434 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2435}
2436/*! @} */
2437
2438//-----------------------------------------------------------------------------------------
2439//Non-template routines from kmp_dispatch.c used in other sources
2440
2441kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2442 return value == checker;
2443}
2444
2445kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2446 return value != checker;
2447}
2448
2449kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2450 return value < checker;
2451}
2452
2453kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2454 return value >= checker;
2455}
2456
2457kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2458 return value <= checker;
2459}
2460kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2461 return value == checker;
2462}
2463
2464kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2465 return value != checker;
2466}
2467
2468kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2469 return value < checker;
2470}
2471
2472kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2473 return value >= checker;
2474}
2475
2476kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2477 return value <= checker;
2478}
2479
2480kmp_uint32
2481__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2482 kmp_uint32 checker,
2483 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2484 , void * obj // Higher-level synchronization object, or NULL.
2485 )
2486{
2487 // note: we may not belong to a team at this point
2488 register volatile kmp_uint32 * spin = spinner;
2489 register kmp_uint32 check = checker;
2490 register kmp_uint32 spins;
2491 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2492 register kmp_uint32 r;
2493
2494 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2495 KMP_INIT_YIELD( spins );
2496 // main wait spin loop
2497 while(!f(r = TCR_4(*spin), check)) {
2498 KMP_FSYNC_SPIN_PREPARE( obj );
2499 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2500 It causes problems with infinite recursion because of exit lock */
2501 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2502 __kmp_abort_thread(); */
2503
Jim Cownie5e8470a2013-09-27 10:38:44 +00002504 /* if we have waited a bit, or are oversubscribed, yield */
2505 /* pause is in the following code */
2506 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2507 KMP_YIELD_SPIN( spins );
2508 }
2509 KMP_FSYNC_SPIN_ACQUIRED( obj );
2510 return r;
2511}
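/*
   Illustrative use (hypothetical caller): spin until a shared 32-bit flag reaches a value,
   using one of the predicate helpers defined just above.

       volatile kmp_uint32 flag = 0;
       // ... another thread eventually stores a nonzero value to 'flag' ...
       __kmp_wait_yield_4( &flag, 1, __kmp_ge_4, NULL );   // returns once flag >= 1
*/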
2512
2513kmp_uint64
2514__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2515 kmp_uint64 checker,
2516 kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2517 , void * obj // Higher-level synchronization object, or NULL.
2518 )
2519{
2520 // note: we may not belong to a team at this point
2521 register volatile kmp_uint64 * spin = spinner;
2522 register kmp_uint64 check = checker;
2523 register kmp_uint32 spins;
2524 register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2525 register kmp_uint64 r;
2526
2527 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2528 KMP_INIT_YIELD( spins );
2529 // main wait spin loop
2530 while(!f(r = *spin, check))
2531 {
2532 KMP_FSYNC_SPIN_PREPARE( obj );
2533 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2534 It causes problems with infinite recursion because of exit lock */
2535 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2536 __kmp_abort_thread(); */
2537
Jim Cownie5e8470a2013-09-27 10:38:44 +00002538 // if we are oversubscribed,
2539        // or have waited a bit (and KMP_LIBRARY=throughput), then yield
2540 // pause is in the following code
2541 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2542 KMP_YIELD_SPIN( spins );
2543 }
2544 KMP_FSYNC_SPIN_ACQUIRED( obj );
2545 return r;
2546}
2547
2548} // extern "C"
2549
2550#ifdef KMP_GOMP_COMPAT
2551
2552void
2553__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2554 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2555 kmp_int32 chunk, int push_ws )
2556{
2557 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2558 push_ws );
2559}
2560
2561void
2562__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2563 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2564 kmp_int32 chunk, int push_ws )
2565{
2566 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2567 push_ws );
2568}
2569
2570void
2571__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2572 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2573 kmp_int64 chunk, int push_ws )
2574{
2575 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2576 push_ws );
2577}
2578
2579void
2580__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2581 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2582 kmp_int64 chunk, int push_ws )
2583{
2584 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2585 push_ws );
2586}
2587
2588void
2589__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2590{
2591 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2592}
2593
2594void
2595__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2596{
2597 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2598}
2599
2600void
2601__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2602{
2603 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2604}
2605
2606void
2607__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2608{
2609 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2610}
2611
2612#endif /* KMP_GOMP_COMPAT */
2613
2614/* ------------------------------------------------------------------------ */
2615/* ------------------------------------------------------------------------ */
2616