1/*
2 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
3 */
4
5
6//===----------------------------------------------------------------------===//
7//
8// The LLVM Compiler Infrastructure
9//
10// This file is dual licensed under the MIT and the University of Illinois Open
11// Source Licenses. See LICENSE.txt for details.
12//
13//===----------------------------------------------------------------------===//
14
15
16/*
17 * Dynamic scheduling initialization and dispatch.
18 *
19 * NOTE: __kmp_nth is a constant inside any dispatch loop; however,
20 * it may change values between parallel regions. __kmp_max_nth
21 * is the largest value __kmp_nth may take, 1 is the smallest.
22 *
23 */
24
25/* ------------------------------------------------------------------------ */
26/* ------------------------------------------------------------------------ */
27
28#include "kmp.h"
29#include "kmp_i18n.h"
30#include "kmp_itt.h"
31#include "kmp_str.h"
32#include "kmp_error.h"
33#include "kmp_stats.h"
34#if KMP_OS_WINDOWS && KMP_ARCH_X86
35 #include <float.h>
36#endif
37
38/* ------------------------------------------------------------------------ */
39/* ------------------------------------------------------------------------ */
40
41// template for type limits
42template< typename T >
43struct i_maxmin {
44 static const T mx;
45 static const T mn;
46};
47template<>
48struct i_maxmin< int > {
49 static const int mx = 0x7fffffff;
50 static const int mn = 0x80000000;
51};
52template<>
53struct i_maxmin< unsigned int > {
54 static const unsigned int mx = 0xffffffff;
55 static const unsigned int mn = 0x00000000;
56};
57template<>
58struct i_maxmin< long long > {
59 static const long long mx = 0x7fffffffffffffffLL;
60 static const long long mn = 0x8000000000000000LL;
61};
62template<>
63struct i_maxmin< unsigned long long > {
64 static const unsigned long long mx = 0xffffffffffffffffLL;
65 static const unsigned long long mn = 0x0000000000000000LL;
66};
67//-------------------------------------------------------------------------
68
69#ifdef KMP_STATIC_STEAL_ENABLED
70
71 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
72 template< typename T >
73 struct dispatch_private_infoXX_template {
74 typedef typename traits_t< T >::unsigned_t UT;
75 typedef typename traits_t< T >::signed_t ST;
76 UT count; // unsigned
77 T ub;
78 /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
79 T lb;
80 ST st; // signed
81 UT tc; // unsigned
82 T static_steal_counter; // for static_steal only; maybe better to put after ub
83
84 /* parm[1-4] are used in different ways by different scheduling algorithms */
85
86 // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
87 // a) parm3 is properly aligned and
88 // b) all parm1-4 are in the same cache line.
89 // Because parm1-4 are used together, performance seems to be better
90 // if they are in the same line (not measured though).
91
92 struct KMP_ALIGN( 32 ) { // compiler does not accept sizeof(T)*4
93 T parm1;
94 T parm2;
95 T parm3;
96 T parm4;
97 };
98
99 UT ordered_lower; // unsigned
100 UT ordered_upper; // unsigned
101 #if KMP_OS_WINDOWS
102 T last_upper;
103 #endif /* KMP_OS_WINDOWS */
104 };
105
106#else /* KMP_STATIC_STEAL_ENABLED */
107
108 // replaces dispatch_private_info{32,64} structures and dispatch_private_info{32,64}_t types
109 template< typename T >
110 struct dispatch_private_infoXX_template {
111 typedef typename traits_t< T >::unsigned_t UT;
112 typedef typename traits_t< T >::signed_t ST;
113 T lb;
114 T ub;
115 ST st; // signed
116 UT tc; // unsigned
117
118 T parm1;
119 T parm2;
120 T parm3;
121 T parm4;
122
123 UT count; // unsigned
124
125 UT ordered_lower; // unsigned
126 UT ordered_upper; // unsigned
127 #if KMP_OS_WINDOWS
128 T last_upper;
129 #endif /* KMP_OS_WINDOWS */
130 };
131
132#endif /* KMP_STATIC_STEAL_ENABLED */
133
134// replaces dispatch_private_info structure and dispatch_private_info_t type
135template< typename T >
136struct KMP_ALIGN_CACHE dispatch_private_info_template {
137 // duplicate alignment here, otherwise size of structure is not correct in our compiler
138 union KMP_ALIGN_CACHE private_info_tmpl {
139 dispatch_private_infoXX_template< T > p;
140 dispatch_private_info64_t p64;
141 } u;
142 enum sched_type schedule; /* scheduling algorithm */
143 kmp_uint32 ordered; /* ordered clause specified */
144 kmp_uint32 ordered_bumped;
145 kmp_int32 ordered_dummy[KMP_MAX_ORDERED-3]; // to retain the structure size after making ordered_iteration scalar
146 dispatch_private_info * next; /* stack of buffers for nest of serial regions */
147 kmp_uint32 nomerge; /* don't merge iters if serialized */
148 kmp_uint32 type_size;
149 enum cons_type pushed_ws;
150};
151
152
153// replaces dispatch_shared_info{32,64} structures and dispatch_shared_info{32,64}_t types
154template< typename UT >
155struct dispatch_shared_infoXX_template {
156 /* chunk index under dynamic, number of idle threads under static-steal;
157 iteration index otherwise */
158 volatile UT iteration;
159 volatile UT num_done;
160 volatile UT ordered_iteration;
161 UT ordered_dummy[KMP_MAX_ORDERED-1]; // to retain the structure size making ordered_iteration scalar
162};
163
164// replaces dispatch_shared_info structure and dispatch_shared_info_t type
165template< typename UT >
166struct dispatch_shared_info_template {
167 // we need union here to keep the structure size
168 union shared_info_tmpl {
169 dispatch_shared_infoXX_template< UT > s;
170 dispatch_shared_info64_t s64;
171 } u;
172 volatile kmp_uint32 buffer_index;
173};
174
175/* ------------------------------------------------------------------------ */
176/* ------------------------------------------------------------------------ */
177
178#undef USE_TEST_LOCKS
179
180// test_then_add template (general template should NOT be used)
181template< typename T >
182static __forceinline T
183test_then_add( volatile T *p, T d ) { KMP_ASSERT(0); };
184
185template<>
186__forceinline kmp_int32
187test_then_add< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 d )
188{
189 kmp_int32 r;
190 r = KMP_TEST_THEN_ADD32( p, d );
191 return r;
192}
193
194template<>
195__forceinline kmp_int64
196test_then_add< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 d )
197{
198 kmp_int64 r;
199 r = KMP_TEST_THEN_ADD64( p, d );
200 return r;
201}
202
203// test_then_inc_acq template (general template should NOT be used)
204template< typename T >
205static __forceinline T
206test_then_inc_acq( volatile T *p ) { KMP_ASSERT(0); };
207
208template<>
209__forceinline kmp_int32
210test_then_inc_acq< kmp_int32 >( volatile kmp_int32 *p )
211{
212 kmp_int32 r;
213 r = KMP_TEST_THEN_INC_ACQ32( p );
214 return r;
215}
216
217template<>
218__forceinline kmp_int64
219test_then_inc_acq< kmp_int64 >( volatile kmp_int64 *p )
220{
221 kmp_int64 r;
222 r = KMP_TEST_THEN_INC_ACQ64( p );
223 return r;
224}
225
226// test_then_inc template (general template should NOT be used)
227template< typename T >
228static __forceinline T
229test_then_inc( volatile T *p ) { KMP_ASSERT(0); };
230
231template<>
232__forceinline kmp_int32
233test_then_inc< kmp_int32 >( volatile kmp_int32 *p )
234{
235 kmp_int32 r;
236 r = KMP_TEST_THEN_INC32( p );
237 return r;
238}
239
240template<>
241__forceinline kmp_int64
242test_then_inc< kmp_int64 >( volatile kmp_int64 *p )
243{
244 kmp_int64 r;
245 r = KMP_TEST_THEN_INC64( p );
246 return r;
247}
248
249// compare_and_swap template (general template should NOT be used)
250template< typename T >
251static __forceinline kmp_int32
252compare_and_swap( volatile T *p, T c, T s ) { KMP_ASSERT(0); };
253
254template<>
255__forceinline kmp_int32
256compare_and_swap< kmp_int32 >( volatile kmp_int32 *p, kmp_int32 c, kmp_int32 s )
257{
258 return KMP_COMPARE_AND_STORE_REL32( p, c, s );
259}
260
261template<>
262__forceinline kmp_int32
263compare_and_swap< kmp_int64 >( volatile kmp_int64 *p, kmp_int64 c, kmp_int64 s )
264{
265 return KMP_COMPARE_AND_STORE_REL64( p, c, s );
266}
267
268/*
269 Spin wait loop that first does pause, then yield.
270 Waits until function returns non-zero when called with *spinner and check.
271 Does NOT put threads to sleep.
272#if USE_ITT_BUILD
273 Arguments:
274 obj -- the higher-level synchronization object to report to ittnotify. It is used to report
275 locks consistently. For example, if a lock is acquired immediately, its address is
276 reported to ittnotify via KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
277 immediately and the lock routine calls KMP_WAIT_YIELD(), the latter should report the same
278 address, not the address of the low-level spinner.
279#endif // USE_ITT_BUILD
280*/
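/*
   A minimal usage sketch (illustrative only, not part of the build): callers spin on a
   shared counter until the predicate holds, e.g. the ordered-iteration handshake used
   later in this file:

       __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
                               USE_ITT_BUILD_ARG( NULL ) );

   The predicate is one of the comparison helpers defined below (__kmp_eq, __kmp_ge, ...)
   and is invoked as pred( *spinner, checker ).
*/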
281template< typename UT >
282// ToDo: make inline function (move to header file for icl)
283static UT // unsigned 4- or 8-byte type
284__kmp_wait_yield( volatile UT * spinner,
285 UT checker,
286 kmp_uint32 (* pred)( UT, UT )
287 USE_ITT_BUILD_ARG(void * obj) // Higher-level synchronization object, or NULL.
288 )
289{
290 // note: we may not belong to a team at this point
291 register volatile UT * spin = spinner;
292 register UT check = checker;
293 register kmp_uint32 spins;
294 register kmp_uint32 (*f) ( UT, UT ) = pred;
295 register UT r;
296
297 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
298 KMP_INIT_YIELD( spins );
299 // main wait spin loop
300 while(!f(r = *spin, check))
301 {
302 KMP_FSYNC_SPIN_PREPARE( obj );
303 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
304 It causes problems with infinite recursion because of exit lock */
305 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
306 __kmp_abort_thread(); */
307
308 // if we are oversubscribed,
309 // or have waited a bit (and KMP_LIBRARY=throughput), then yield;
310 // the pause is in the following code
311 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
312 KMP_YIELD_SPIN( spins );
313 }
314 KMP_FSYNC_SPIN_ACQUIRED( obj );
315 return r;
316}
317
318template< typename UT >
319static kmp_uint32 __kmp_eq( UT value, UT checker) {
320 return value == checker;
321}
322
323template< typename UT >
324static kmp_uint32 __kmp_neq( UT value, UT checker) {
325 return value != checker;
326}
327
328template< typename UT >
329static kmp_uint32 __kmp_lt( UT value, UT checker) {
330 return value < checker;
331}
332
333template< typename UT >
334static kmp_uint32 __kmp_ge( UT value, UT checker) {
335 return value >= checker;
336}
337
338template< typename UT >
339static kmp_uint32 __kmp_le( UT value, UT checker) {
340 return value <= checker;
341}
342
343
344/* ------------------------------------------------------------------------ */
345/* ------------------------------------------------------------------------ */
346
347static void
348__kmp_dispatch_deo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
349{
350 kmp_info_t *th;
351
352 KMP_DEBUG_ASSERT( gtid_ref );
353
354 if ( __kmp_env_consistency_check ) {
355 th = __kmp_threads[*gtid_ref];
356 if ( th -> th.th_root -> r.r_active
357 && ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) ) {
358 __kmp_push_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref, NULL );
359 }
360 }
361}
362
363template< typename UT >
364static void
365__kmp_dispatch_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
366{
367 typedef typename traits_t< UT >::signed_t ST;
368 dispatch_private_info_template< UT > * pr;
369
370 int gtid = *gtid_ref;
371// int cid = *cid_ref;
372 kmp_info_t *th = __kmp_threads[ gtid ];
373 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
374
375 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid ) );
376 if ( __kmp_env_consistency_check ) {
377 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
378 ( th -> th.th_dispatch -> th_dispatch_pr_current );
379 if ( pr -> pushed_ws != ct_none ) {
380 __kmp_push_sync( gtid, ct_ordered_in_pdo, loc_ref, NULL );
381 }
382 }
383
384 if ( ! th -> th.th_team -> t.t_serialized ) {
385 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
386 ( th -> th.th_dispatch -> th_dispatch_sh_current );
387 UT lower;
388
389 if ( ! __kmp_env_consistency_check ) {
390 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
391 ( th -> th.th_dispatch -> th_dispatch_pr_current );
392 }
393 lower = pr->u.p.ordered_lower;
394
395 #if ! defined( KMP_GOMP_COMPAT )
396 if ( __kmp_env_consistency_check ) {
397 if ( pr->ordered_bumped ) {
398 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
399 __kmp_error_construct2(
400 kmp_i18n_msg_CnsMultipleNesting,
401 ct_ordered_in_pdo, loc_ref,
402 & p->stack_data[ p->w_top ]
403 );
404 }
405 }
406 #endif /* !defined(KMP_GOMP_COMPAT) */
407
408 KMP_MB();
409 #ifdef KMP_DEBUG
410 {
411 const char * buff;
412 // create format specifiers before the debug output
413 buff = __kmp_str_format(
414 "__kmp_dispatch_deo: T#%%d before wait: ordered_iter:%%%s lower:%%%s\n",
415 traits_t< UT >::spec, traits_t< UT >::spec );
416 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
417 __kmp_str_free( &buff );
418 }
419 #endif
420
421 __kmp_wait_yield< UT >( &sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
422 USE_ITT_BUILD_ARG( NULL )
423 );
424 KMP_MB(); /* is this necessary? */
425 #ifdef KMP_DEBUG
426 {
427 const char * buff;
428 // create format specifiers before the debug output
429 buff = __kmp_str_format(
430 "__kmp_dispatch_deo: T#%%d after wait: ordered_iter:%%%s lower:%%%s\n",
431 traits_t< UT >::spec, traits_t< UT >::spec );
432 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
433 __kmp_str_free( &buff );
434 }
435 #endif
436 }
437 KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid ) );
438}
439
440static void
441__kmp_dispatch_dxo_error( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
442{
443 kmp_info_t *th;
444
445 if ( __kmp_env_consistency_check ) {
446 th = __kmp_threads[*gtid_ref];
447 if ( th -> th.th_dispatch -> th_dispatch_pr_current -> pushed_ws != ct_none ) {
448 __kmp_pop_sync( *gtid_ref, ct_ordered_in_pdo, loc_ref );
449 }
450 }
451}
452
453template< typename UT >
454static void
455__kmp_dispatch_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
456{
457 typedef typename traits_t< UT >::signed_t ST;
458 dispatch_private_info_template< UT > * pr;
459
460 int gtid = *gtid_ref;
461// int cid = *cid_ref;
462 kmp_info_t *th = __kmp_threads[ gtid ];
463 KMP_DEBUG_ASSERT( th -> th.th_dispatch );
464
465 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid ) );
466 if ( __kmp_env_consistency_check ) {
467 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
468 ( th -> th.th_dispatch -> th_dispatch_pr_current );
469 if ( pr -> pushed_ws != ct_none ) {
470 __kmp_pop_sync( gtid, ct_ordered_in_pdo, loc_ref );
471 }
472 }
473
474 if ( ! th -> th.th_team -> t.t_serialized ) {
475 dispatch_shared_info_template< UT > * sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
476 ( th -> th.th_dispatch -> th_dispatch_sh_current );
477
478 if ( ! __kmp_env_consistency_check ) {
479 pr = reinterpret_cast< dispatch_private_info_template< UT >* >
480 ( th -> th.th_dispatch -> th_dispatch_pr_current );
481 }
482
483 KMP_FSYNC_RELEASING( & sh->u.s.ordered_iteration );
484 #if ! defined( KMP_GOMP_COMPAT )
485 if ( __kmp_env_consistency_check ) {
486 if ( pr->ordered_bumped != 0 ) {
487 struct cons_header *p = __kmp_threads[ gtid ]->th.th_cons;
488 /* How to test it? - OM */
489 __kmp_error_construct2(
490 kmp_i18n_msg_CnsMultipleNesting,
491 ct_ordered_in_pdo, loc_ref,
492 & p->stack_data[ p->w_top ]
493 );
494 }
495 }
496 #endif /* !defined(KMP_GOMP_COMPAT) */
497
498 KMP_MB(); /* Flush all pending memory write invalidates. */
499
500 pr->ordered_bumped += 1;
501
502 KD_TRACE(1000, ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
503 gtid, pr->ordered_bumped ) );
504
505 KMP_MB(); /* Flush all pending memory write invalidates. */
506
507 /* TODO use general release procedure? */
508 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
509
510 KMP_MB(); /* Flush all pending memory write invalidates. */
511 }
512 KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid ) );
513}
514
515/* Computes and returns x to the power of y, where y must be a non-negative integer */
516template< typename UT >
517static __forceinline long double
518__kmp_pow(long double x, UT y) {
519 long double s=1.0L;
520
521 KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
522 //KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
523 while(y) {
524 if ( y & 1 )
525 s *= x;
526 x *= x;
527 y >>= 1;
528 }
529 return s;
530}
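/* Worked example (illustrative only, values assumed): __kmp_pow(0.5, 5) walks the bits of
   y = 5 = 0b101 from least to most significant:
       y = 5 (odd)  : s = 1.0 * 0.5    = 0.5     ; x -> 0.25
       y = 2 (even) :                              x -> 0.0625
       y = 1 (odd)  : s = 0.5 * 0.0625 = 0.03125 ; x -> x*x
   and returns 0.03125 == 0.5^5, i.e. binary exponentiation in O(log y) multiplications. */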
531
532/* Computes and returns the number of unassigned iterations after idx chunks have been assigned
533 (the total number of unassigned iterations in chunks with index greater than or equal to idx).
534 __forceinline seems to be broken here: if this function is __forceinline'd, the behavior is wrong
535 (one of the unit tests, sch_guided_analytical_basic.cpp, fails)
536*/
537template< typename T >
538static __inline typename traits_t< T >::unsigned_t
539__kmp_dispatch_guided_remaining(
540 T tc,
541 typename traits_t< T >::floating_t base,
542 typename traits_t< T >::unsigned_t idx
543) {
544 /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at
545 least for ICL 8.1, long double arithmetic may not really have
546 long double precision, even with /Qlong_double. Currently, we
547 workaround that in the caller code, by manipulating the FPCW for
548 Windows* OS on IA-32 architecture. The lack of precision is not
549 expected to be a correctness issue, though.
550 */
551 typedef typename traits_t< T >::unsigned_t UT;
552
553 long double x = tc * __kmp_pow< UT >(base, idx);
554 UT r = (UT) x;
555 if ( x == r )
556 return r;
557 return r + 1;
558}
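/* Worked example (illustrative only, values assumed): with tc = 1000 total iterations,
   nproc = 4 and base = 1.0 - 0.5/nproc = 0.875 (see the guided parameters below), after
   idx = 2 chunks have been handed out:
       tc * base^idx = 1000 * 0.765625 = 765.625  ->  returns 766 (rounded up),
   i.e. the number of iterations still unassigned in chunks idx, idx+1, ... */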
559
560// Parameters of the guided-iterative algorithm:
561// p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
562// p3 = 1 / ( n * nproc ) // remaining iterations multiplier
563// by default n = 2. For example, with n = 3 the chunk distribution will be flatter.
564// With n = 1 the first chunk is the same as for the static schedule, i.e. trip / nproc.
565static int guided_int_param = 2;
566static double guided_flt_param = 0.5;// = 1.0 / guided_int_param;
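/* Worked example (illustrative only, values assumed): with the default n = 2, nproc = 4
   and chunk = 7:
       p2 = 2 * 4 * (7 + 1) = 64      // switch to dynamic once fewer than 64 iterations remain
       p3 = 1.0 / (2 * 4)   = 0.125   // each grab takes ~12.5% of the remaining iterations
   These values land in pr->u.p.parm2 and pr->u.p.parm3 in the
   kmp_sch_guided_iterative_chunked case of __kmp_dispatch_init() below. */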
567
568// UT - unsigned flavor of T, ST - signed flavor of T,
569// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
570template< typename T >
571static void
572__kmp_dispatch_init(
573 ident_t * loc,
574 int gtid,
575 enum sched_type schedule,
576 T lb,
577 T ub,
578 typename traits_t< T >::signed_t st,
579 typename traits_t< T >::signed_t chunk,
580 int push_ws
581) {
582 typedef typename traits_t< T >::unsigned_t UT;
583 typedef typename traits_t< T >::signed_t ST;
584 typedef typename traits_t< T >::floating_t DBL;
585 static const int ___kmp_size_type = sizeof( UT );
586
587 int active;
588 T tc;
589 kmp_info_t * th;
590 kmp_team_t * team;
591 kmp_uint32 my_buffer_index;
592 dispatch_private_info_template< T > * pr;
593 dispatch_shared_info_template< UT > volatile * sh;
594
595 KMP_BUILD_ASSERT( sizeof( dispatch_private_info_template< T > ) == sizeof( dispatch_private_info ) );
596 KMP_BUILD_ASSERT( sizeof( dispatch_shared_info_template< UT > ) == sizeof( dispatch_shared_info ) );
597
598 if ( ! TCR_4( __kmp_init_parallel ) )
599 __kmp_parallel_initialize();
600
601#if INCLUDE_SSC_MARKS
602 SSC_MARK_DISPATCH_INIT();
603#endif
604 #ifdef KMP_DEBUG
605 {
606 const char * buff;
607 // create format specifiers before the debug output
608 buff = __kmp_str_format(
609 "__kmp_dispatch_init: T#%%d called: schedule:%%d chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
610 traits_t< ST >::spec, traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
611 KD_TRACE(10, ( buff, gtid, schedule, chunk, lb, ub, st ) );
612 __kmp_str_free( &buff );
613 }
614 #endif
615 /* setup data */
616 th = __kmp_threads[ gtid ];
617 team = th -> th.th_team;
618 active = ! team -> t.t_serialized;
619 th->th.th_ident = loc;
620
621#if USE_ITT_BUILD
622 kmp_uint64 cur_chunk = chunk;
623#endif
624 if ( ! active ) {
625 pr = reinterpret_cast< dispatch_private_info_template< T >* >
626 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
627 } else {
628 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
629 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
630
631 my_buffer_index = th->th.th_dispatch->th_disp_index ++;
632
633 /* What happens when number of threads changes, need to resize buffer? */
634 pr = reinterpret_cast< dispatch_private_info_template< T > * >
635 ( &th -> th.th_dispatch -> th_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
636 sh = reinterpret_cast< dispatch_shared_info_template< UT > volatile * >
637 ( &team -> t.t_disp_buffer[ my_buffer_index % KMP_MAX_DISP_BUF ] );
638 }
639
640 /* Pick up the nomerge/ordered bits from the scheduling type */
641 if ( (schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper) ) {
642 pr->nomerge = TRUE;
643 schedule = (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
644 } else {
645 pr->nomerge = FALSE;
646 }
647 pr->type_size = ___kmp_size_type; // remember the size of variables
648 if ( kmp_ord_lower & schedule ) {
649 pr->ordered = TRUE;
650 schedule = (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
651 } else {
652 pr->ordered = FALSE;
653 }
654 if ( schedule == kmp_sch_static ) {
655 schedule = __kmp_static;
656 } else {
657 if ( schedule == kmp_sch_runtime ) {
658 // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if not specified)
659 schedule = team -> t.t_sched.r_sched_type;
660 // Detail the schedule if needed (global controls are differentiated appropriately)
661 if ( schedule == kmp_sch_guided_chunked ) {
662 schedule = __kmp_guided;
663 } else if ( schedule == kmp_sch_static ) {
664 schedule = __kmp_static;
665 }
666 // Use the chunk size specified by OMP_SCHEDULE (or default if not specified)
667 chunk = team -> t.t_sched.chunk;
668
669 #ifdef KMP_DEBUG
670 {
671 const char * buff;
672 // create format specifiers before the debug output
673 buff = __kmp_str_format(
674 "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
675 traits_t< ST >::spec );
676 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
677 __kmp_str_free( &buff );
678 }
679 #endif
680 } else {
681 if ( schedule == kmp_sch_guided_chunked ) {
682 schedule = __kmp_guided;
683 }
684 if ( chunk <= 0 ) {
685 chunk = KMP_DEFAULT_CHUNK;
686 }
687 }
688
689 if ( schedule == kmp_sch_auto ) {
690 // mapping and differentiation: in the __kmp_do_serial_initialize()
691 schedule = __kmp_auto;
692 #ifdef KMP_DEBUG
693 {
694 const char * buff;
695 // create format specifiers before the debug output
696 buff = __kmp_str_format(
697 "__kmp_dispatch_init: kmp_sch_auto: T#%%d new: schedule:%%d chunk:%%%s\n",
698 traits_t< ST >::spec );
699 KD_TRACE(10, ( buff, gtid, schedule, chunk ) );
700 __kmp_str_free( &buff );
701 }
702 #endif
703 }
704
705 /* guided analytical not safe for too many threads */
706 if ( team->t.t_nproc > 1<<20 && schedule == kmp_sch_guided_analytical_chunked ) {
707 schedule = kmp_sch_guided_iterative_chunked;
708 KMP_WARNING( DispatchManyThreads );
709 }
710 pr->u.p.parm1 = chunk;
711 }
712 KMP_ASSERT2( (kmp_sch_lower < schedule && schedule < kmp_sch_upper),
713 "unknown scheduling type" );
714
715 pr->u.p.count = 0;
716
717 if ( __kmp_env_consistency_check ) {
718 if ( st == 0 ) {
719 __kmp_error_construct(
720 kmp_i18n_msg_CnsLoopIncrZeroProhibited,
721 ( pr->ordered ? ct_pdo_ordered : ct_pdo ), loc
722 );
723 }
724 }
725
726 tc = ( ub - lb + st );
727 if ( st != 1 ) {
728 if ( st < 0 ) {
729 if ( lb < ub ) {
730 tc = 0; // zero-trip
731 } else { // lb >= ub
732 tc = (ST)tc / st; // convert to signed division
733 }
734 } else { // st > 0
735 if ( ub < lb ) {
736 tc = 0; // zero-trip
737 } else { // lb >= ub
738 tc /= st;
739 }
740 }
741 } else if ( ub < lb ) { // st == 1
742 tc = 0; // zero-trip
743 }
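    /* Worked examples of the trip-count computation above (illustrative only, values assumed):
           lb = 0,  ub = 9, st = 2  : tc = (9 - 0 + 2) / 2    = 5  -> iterations 0, 2, 4, 6, 8
           lb = 10, ub = 1, st = -3 : tc = (1 - 10 - 3) / -3  = 4  -> iterations 10, 7, 4, 1
       For unsigned types T, the negative-stride case relies on the (ST) cast above to make
       the division signed. */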
744
745 pr->u.p.lb = lb;
746 pr->u.p.ub = ub;
747 pr->u.p.st = st;
748 pr->u.p.tc = tc;
749
750 #if KMP_OS_WINDOWS
751 pr->u.p.last_upper = ub + st;
752 #endif /* KMP_OS_WINDOWS */
753
754 /* NOTE: only the active parallel region(s) have active ordered sections */
755
756 if ( active ) {
757 if ( pr->ordered == 0 ) {
758 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo_error;
759 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo_error;
760 } else {
761 pr->ordered_bumped = 0;
762
763 pr->u.p.ordered_lower = 1;
764 pr->u.p.ordered_upper = 0;
765
766 th -> th.th_dispatch -> th_deo_fcn = __kmp_dispatch_deo< UT >;
767 th -> th.th_dispatch -> th_dxo_fcn = __kmp_dispatch_dxo< UT >;
768 }
769 }
770
771 if ( __kmp_env_consistency_check ) {
772 enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
773 if ( push_ws ) {
774 __kmp_push_workshare( gtid, ws, loc );
775 pr->pushed_ws = ws;
776 } else {
777 __kmp_check_workshare( gtid, ws, loc );
778 pr->pushed_ws = ct_none;
779 }
780 }
781
782 switch ( schedule ) {
783 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
784 case kmp_sch_static_steal:
785 {
786 T nproc = team->t.t_nproc;
787 T ntc, init;
788
789 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid ) );
790
791 ntc = (tc % chunk ? 1 : 0) + tc / chunk;
792 if ( nproc > 1 && ntc >= nproc ) {
793 T id = __kmp_tid_from_gtid(gtid);
794 T small_chunk, extras;
795
796 small_chunk = ntc / nproc;
797 extras = ntc % nproc;
798
799 init = id * small_chunk + ( id < extras ? id : extras );
800 pr->u.p.count = init;
801 pr->u.p.ub = init + small_chunk + ( id < extras ? 1 : 0 );
802
803 pr->u.p.parm2 = lb;
804 //pr->pfields.parm3 = 0; // it's not used in static_steal
805 pr->u.p.parm4 = id;
806 pr->u.p.st = st;
807 break;
808 } else {
809 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_balanced\n",
810 gtid ) );
811 schedule = kmp_sch_static_balanced;
812 /* too few iterations: fall-through to kmp_sch_static_balanced */
813 } // if
814 /* FALL-THROUGH to static balanced */
815 } // case
816 #endif
817 case kmp_sch_static_balanced:
818 {
819 T nproc = team->t.t_nproc;
820 T init, limit;
821
822 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
823 gtid ) );
824
825 if ( nproc > 1 ) {
826 T id = __kmp_tid_from_gtid(gtid);
827
828 if ( tc < nproc ) {
829 if ( id < tc ) {
830 init = id;
831 limit = id;
832 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
833 } else {
834 pr->u.p.count = 1; /* means no more chunks to execute */
835 pr->u.p.parm1 = FALSE;
836 break;
837 }
838 } else {
839 T small_chunk = tc / nproc;
840 T extras = tc % nproc;
841 init = id * small_chunk + (id < extras ? id : extras);
842 limit = init + small_chunk - (id < extras ? 0 : 1);
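                    // Worked example (illustrative only, values assumed): tc = 10, nproc = 4
                    //   small_chunk = 2, extras = 2
                    //   id 0: init = 0, limit = 2    id 1: init = 3, limit = 5
                    //   id 2: init = 6, limit = 7    id 3: init = 8, limit = 9
                    // i.e. the first 'extras' threads each take one extra iteration.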
843 pr->u.p.parm1 = (id == nproc - 1);
844 }
845 } else {
846 if ( tc > 0 ) {
847 init = 0;
848 limit = tc - 1;
849 pr->u.p.parm1 = TRUE;
850 } else {
851 // zero trip count
852 pr->u.p.count = 1; /* means no more chunks to execute */
853 pr->u.p.parm1 = FALSE;
854 break;
855 }
856 }
857#if USE_ITT_BUILD
858 // Calculate chunk for metadata report
859 if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) {
860 cur_chunk = limit - init + 1;
861 }
862#endif
863 if ( st == 1 ) {
864 pr->u.p.lb = lb + init;
865 pr->u.p.ub = lb + limit;
866 } else {
867 T ub_tmp = lb + limit * st; // calculated upper bound, "ub" is user-defined upper bound
868 pr->u.p.lb = lb + init * st;
869 // adjust upper bound to "ub" if needed, so that MS lastprivate will match it exactly
870 if ( st > 0 ) {
871 pr->u.p.ub = ( ub_tmp + st > ub ? ub : ub_tmp );
872 } else {
873 pr->u.p.ub = ( ub_tmp + st < ub ? ub : ub_tmp );
874 }
875 }
876 if ( pr->ordered ) {
877 pr->u.p.ordered_lower = init;
878 pr->u.p.ordered_upper = limit;
879 }
880 break;
881 } // case
882 case kmp_sch_guided_iterative_chunked :
883 {
884 T nproc = team->t.t_nproc;
885 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked case\n",gtid));
886
887 if ( nproc > 1 ) {
888 if ( (2L * chunk + 1 ) * nproc >= tc ) {
889 /* chunk size too large, switch to dynamic */
890 schedule = kmp_sch_dynamic_chunked;
891 } else {
892 // when remaining iters become less than parm2 - switch to dynamic
893 pr->u.p.parm2 = guided_int_param * nproc * ( chunk + 1 );
894 *(double*)&pr->u.p.parm3 = guided_flt_param / nproc; // may occupy parm3 and parm4
895 }
896 } else {
897 KD_TRACE(100,("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",gtid));
898 schedule = kmp_sch_static_greedy;
899 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
900 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
901 pr->u.p.parm1 = tc;
902 } // if
903 } // case
904 break;
905 case kmp_sch_guided_analytical_chunked:
906 {
907 T nproc = team->t.t_nproc;
908 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked case\n", gtid));
909
910 if ( nproc > 1 ) {
911 if ( (2L * chunk + 1 ) * nproc >= tc ) {
912 /* chunk size too large, switch to dynamic */
913 schedule = kmp_sch_dynamic_chunked;
914 } else {
915 /* commonly used term: (2 nproc - 1)/(2 nproc) */
916 DBL x;
917
918 #if KMP_OS_WINDOWS && KMP_ARCH_X86
919 /* Linux* OS already has 64-bit computation by default for
920 long double, and on Windows* OS on Intel(R) 64,
921 /Qlong_double doesn't work. On Windows* OS
922 on IA-32 architecture, we need to set precision to
923 64-bit instead of the default 53-bit. Even though long
924 double doesn't work on Windows* OS on Intel(R) 64, the
925 resulting lack of precision is not expected to impact
926 the correctness of the algorithm, but this has not been
927 mathematically proven.
928 */
929 // save original FPCW and set precision to 64-bit, as
930 // Windows* OS on IA-32 architecture defaults to 53-bit
931 unsigned int oldFpcw = _control87(0,0);
932 _control87(_PC_64,_MCW_PC); // 0,0x30000
933 #endif
934 /* value used for comparison in solver for cross-over point */
935 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
936
937 /* crossover point--chunk indexes equal to or greater than
938 this point switch to dynamic-style scheduling */
939 UT cross;
940
941 /* commonly used term: (2 nproc - 1)/(2 nproc) */
942 x = (long double)1.0 - (long double)0.5 / nproc;
943
944 #ifdef KMP_DEBUG
945 { // test natural alignment
946 struct _test_a {
947 char a;
948 union {
949 char b;
950 DBL d;
951 };
952 } t;
953 ptrdiff_t natural_alignment = (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
954 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long long)natural_alignment );
955 KMP_DEBUG_ASSERT( ( ( (ptrdiff_t)&pr->u.p.parm3 ) & ( natural_alignment ) ) == 0 );
956 }
957 #endif // KMP_DEBUG
958
959 /* save the term in thread private dispatch structure */
960 *(DBL*)&pr->u.p.parm3 = x;
961
962 /* solve for the crossover point to the nearest integer i for which C_i <= chunk */
963 {
964 UT left, right, mid;
965 long double p;
966
967 /* estimate initial upper and lower bound */
968
969 /* doesn't matter what value right is as long as it is positive, but
970 it affects performance of the solver
971 */
972 right = 229;
973 p = __kmp_pow< UT >(x,right);
974 if ( p > target ) {
975 do{
976 p *= p;
977 right <<= 1;
978 } while(p>target && right < (1<<27));
979 left = right >> 1; /* lower bound is previous (failed) estimate of upper bound */
980 } else {
981 left = 0;
982 }
983
984 /* bisection root-finding method */
985 while ( left + 1 < right ) {
986 mid = (left + right) / 2;
987 if ( __kmp_pow< UT >(x,mid) > target ) {
988 left = mid;
989 } else {
990 right = mid;
991 }
992 } // while
993 cross = right;
994 }
995 /* assert sanity of computed crossover point */
996 KMP_ASSERT(cross && __kmp_pow< UT >(x, cross - 1) > target && __kmp_pow< UT >(x, cross) <= target);
997
998 /* save the crossover point in thread private dispatch structure */
999 pr->u.p.parm2 = cross;
1000
1001 // C75803
1002 #if ( ( KMP_OS_LINUX || KMP_OS_WINDOWS ) && KMP_ARCH_X86 ) && ( ! defined( KMP_I8 ) )
1003 #define GUIDED_ANALYTICAL_WORKAROUND (*( DBL * )&pr->u.p.parm3)
1004 #else
1005 #define GUIDED_ANALYTICAL_WORKAROUND (x)
1006 #endif
1007 /* dynamic-style scheduling offset */
1008 pr->u.p.count = tc - __kmp_dispatch_guided_remaining(tc, GUIDED_ANALYTICAL_WORKAROUND, cross) - cross * chunk;
1009 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1010 // restore FPCW
1011 _control87(oldFpcw,_MCW_PC);
1012 #endif
1013 } // if
1014 } else {
1015 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to kmp_sch_static_greedy\n",
1016 gtid ) );
1017 schedule = kmp_sch_static_greedy;
1018 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
1019 pr->u.p.parm1 = tc;
1020 } // if
1021 } // case
1022 break;
1023 case kmp_sch_static_greedy:
1024 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",gtid));
1025 pr->u.p.parm1 = ( team -> t.t_nproc > 1 ) ?
1026 ( tc + team->t.t_nproc - 1 ) / team->t.t_nproc :
1027 tc;
1028 break;
1029 case kmp_sch_static_chunked :
1030 case kmp_sch_dynamic_chunked :
1031 KD_TRACE(100,("__kmp_dispatch_init: T#%d kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n", gtid));
1032 break;
1033 case kmp_sch_trapezoidal :
1034 {
1035 /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */
1036
1037 T parm1, parm2, parm3, parm4;
1038 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid ) );
1039
1040 parm1 = chunk;
1041
1042 /* F : size of the first cycle */
1043 parm2 = ( tc / (2 * team->t.t_nproc) );
1044
1045 if ( parm2 < 1 ) {
1046 parm2 = 1;
1047 }
1048
1049 /* L : size of the last cycle. Make sure the last cycle
1050 * is not larger than the first cycle.
1051 */
1052 if ( parm1 < 1 ) {
1053 parm1 = 1;
1054 } else if ( parm1 > parm2 ) {
1055 parm1 = parm2;
1056 }
1057
1058 /* N : number of cycles */
1059 parm3 = ( parm2 + parm1 );
1060 parm3 = ( 2 * tc + parm3 - 1) / parm3;
1061
1062 if ( parm3 < 2 ) {
1063 parm3 = 2;
1064 }
1065
1066 /* sigma : decreasing incr of the trapezoid */
1067 parm4 = ( parm3 - 1 );
1068 parm4 = ( parm2 - parm1 ) / parm4;
1069
1070 // pointless check, because parm4 >= 0 always
1071 //if ( parm4 < 0 ) {
1072 // parm4 = 0;
1073 //}
1074
1075 pr->u.p.parm1 = parm1;
1076 pr->u.p.parm2 = parm2;
1077 pr->u.p.parm3 = parm3;
1078 pr->u.p.parm4 = parm4;
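            /* Worked example (illustrative only, values assumed): tc = 200, nproc = 4, chunk = 2:
                   parm1 (last/minimum chunk)  = 2
                   parm2 (first chunk)         = 200 / (2*4)           = 25
                   parm3 (number of cycles)    = (2*200 + 27 - 1) / 27 = 15
                   parm4 (per-cycle decrement) = (25 - 2) / 14         = 1
               so successive chunks are 25, 24, 23, ... shrinking by parm4 each cycle. */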
1079 } // case
1080 break;
1081
1082 default:
1083 {
1084 __kmp_msg(
1085 kmp_ms_fatal, // Severity
1086 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
1087 KMP_HNT( GetNewerLibrary ), // Hint
1088 __kmp_msg_null // Variadic argument list terminator
1089 );
1090 }
1091 break;
1092 } // switch
1093 pr->schedule = schedule;
1094 if ( active ) {
1095 /* The name of this buffer should be my_buffer_index when it's free to use it */
1096
1097 KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d sh->buffer_index:%d\n",
1098 gtid, my_buffer_index, sh->buffer_index) );
1099 __kmp_wait_yield< kmp_uint32 >( & sh->buffer_index, my_buffer_index, __kmp_eq< kmp_uint32 >
1100 USE_ITT_BUILD_ARG( NULL )
1101 );
1102 // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and my_buffer_index are
1103 // *always* 32-bit integers.
1104 KMP_MB(); /* is this necessary? */
1105 KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d sh->buffer_index:%d\n",
1106 gtid, my_buffer_index, sh->buffer_index) );
1107
1108 th -> th.th_dispatch -> th_dispatch_pr_current = (dispatch_private_info_t*) pr;
1109 th -> th.th_dispatch -> th_dispatch_sh_current = (dispatch_shared_info_t*) sh;
1110#if USE_ITT_BUILD
1111 if ( pr->ordered ) {
1112 __kmp_itt_ordered_init( gtid );
1113 }; // if
1114#endif /* USE_ITT_BUILD */
1115 }; // if
1116
1117#if USE_ITT_BUILD
1118 // Report loop metadata
1119 if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) {
1120 kmp_uint32 tid = __kmp_tid_from_gtid( gtid );
1121 if (KMP_MASTER_TID(tid)) {
1122 kmp_uint64 schedtype = 0;
1123
1124 switch ( schedule ) {
1125 case kmp_sch_static_chunked:
1126 case kmp_sch_static_balanced:// Chunk is calculated in the switch above
1127 break;
1128 case kmp_sch_static_greedy:
1129 cur_chunk = pr->u.p.parm1;
1130 break;
1131 case kmp_sch_dynamic_chunked:
1132 schedtype = 1;
1133 break;
1134 case kmp_sch_guided_iterative_chunked:
1135 case kmp_sch_guided_analytical_chunked:
1136 schedtype = 2;
1137 break;
1138 default:
1139// Should we put this case under "static"?
1140// case kmp_sch_static_steal:
1141 schedtype = 3;
1142 break;
1143 }
1144 __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
1145 }
1146 }
1147#endif /* USE_ITT_BUILD */
1148
1149 #ifdef KMP_DEBUG
1150 {
1151 const char * buff;
1152 // create format specifiers before the debug output
1153 buff = __kmp_str_format(
1154 "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s lb:%%%s ub:%%%s" \
1155 " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s" \
1156 " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
1157 traits_t< UT >::spec, traits_t< T >::spec, traits_t< T >::spec,
1158 traits_t< ST >::spec, traits_t< UT >::spec, traits_t< UT >::spec,
1159 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< T >::spec,
1160 traits_t< T >::spec, traits_t< T >::spec, traits_t< T >::spec );
1161 KD_TRACE(10, ( buff,
1162 gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
1163 pr->u.p.st, pr->u.p.tc, pr->u.p.count,
1164 pr->u.p.ordered_lower, pr->u.p.ordered_upper, pr->u.p.parm1,
1165 pr->u.p.parm2, pr->u.p.parm3, pr->u.p.parm4 ) );
1166 __kmp_str_free( &buff );
1167 }
1168 #endif
1169 #if ( KMP_STATIC_STEAL_ENABLED )
1170 if ( ___kmp_size_type < 8 ) {
1171 // It cannot be guaranteed that after execution of a loop with some other schedule kind
1172 // all the parm3 variables will contain the same value.
1173 // Even if all parm3 values were the same, there would still be a bad case, such as using 0 and 1
1174 // rather than a program-lifetime increment.
1175 // So a dedicated variable is required; the 'static_steal_counter' is used.
1176 if( schedule == kmp_sch_static_steal ) {
1177 // Other threads will inspect this variable when searching for a victim.
1178 // This is a flag showing that other threads may steal from this thread since then.
1179 volatile T * p = &pr->u.p.static_steal_counter;
1180 *p = *p + 1;
1181 }
1182 }
1183 #endif // ( KMP_STATIC_STEAL_ENABLED )
1184}
1185
1186/*
1187 * For ordered loops, either __kmp_dispatch_finish() should be called after
1188 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
1189 * every chunk of iterations. If the ordered section(s) were not executed
1190 * for this iteration (or every iteration in this chunk), we need to set the
1191 * ordered iteration counters so that the next thread can proceed.
1192 */
1193template< typename UT >
1194static void
1195__kmp_dispatch_finish( int gtid, ident_t *loc )
1196{
1197 typedef typename traits_t< UT >::signed_t ST;
1198 kmp_info_t *th = __kmp_threads[ gtid ];
1199
1200 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid ) );
1201 if ( ! th -> th.th_team -> t.t_serialized ) {
1202
1203 dispatch_private_info_template< UT > * pr =
1204 reinterpret_cast< dispatch_private_info_template< UT >* >
1205 ( th->th.th_dispatch->th_dispatch_pr_current );
1206 dispatch_shared_info_template< UT > volatile * sh =
1207 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1208 ( th->th.th_dispatch->th_dispatch_sh_current );
1209 KMP_DEBUG_ASSERT( pr );
1210 KMP_DEBUG_ASSERT( sh );
1211 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1212 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1213
1214 if ( pr->ordered_bumped ) {
1215 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1216 gtid ) );
1217 pr->ordered_bumped = 0;
1218 } else {
1219 UT lower = pr->u.p.ordered_lower;
1220
1221 #ifdef KMP_DEBUG
1222 {
1223 const char * buff;
1224 // create format specifiers before the debug output
1225 buff = __kmp_str_format(
1226 "__kmp_dispatch_finish: T#%%d before wait: ordered_iteration:%%%s lower:%%%s\n",
1227 traits_t< UT >::spec, traits_t< UT >::spec );
1228 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1229 __kmp_str_free( &buff );
1230 }
1231 #endif
1232
1233 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1234 USE_ITT_BUILD_ARG(NULL)
1235 );
1236 KMP_MB(); /* is this necessary? */
1237 #ifdef KMP_DEBUG
1238 {
1239 const char * buff;
1240 // create format specifiers before the debug output
1241 buff = __kmp_str_format(
1242 "__kmp_dispatch_finish: T#%%d after wait: ordered_iteration:%%%s lower:%%%s\n",
1243 traits_t< UT >::spec, traits_t< UT >::spec );
1244 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower ) );
1245 __kmp_str_free( &buff );
1246 }
1247 #endif
1248
1249 test_then_inc< ST >( (volatile ST *) & sh->u.s.ordered_iteration );
1250 } // if
1251 } // if
1252 KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid ) );
1253}
1254
1255#ifdef KMP_GOMP_COMPAT
1256
1257template< typename UT >
1258static void
1259__kmp_dispatch_finish_chunk( int gtid, ident_t *loc )
1260{
1261 typedef typename traits_t< UT >::signed_t ST;
1262 kmp_info_t *th = __kmp_threads[ gtid ];
1263
1264 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid ) );
1265 if ( ! th -> th.th_team -> t.t_serialized ) {
1266// int cid;
1267 dispatch_private_info_template< UT > * pr =
1268 reinterpret_cast< dispatch_private_info_template< UT >* >
1269 ( th->th.th_dispatch->th_dispatch_pr_current );
1270 dispatch_shared_info_template< UT > volatile * sh =
1271 reinterpret_cast< dispatch_shared_info_template< UT >volatile* >
1272 ( th->th.th_dispatch->th_dispatch_sh_current );
1273 KMP_DEBUG_ASSERT( pr );
1274 KMP_DEBUG_ASSERT( sh );
1275 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1276 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1277
1278// for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
1279 UT lower = pr->u.p.ordered_lower;
1280 UT upper = pr->u.p.ordered_upper;
1281 UT inc = upper - lower + 1;
1282
1283 if ( pr->ordered_bumped == inc ) {
1284 KD_TRACE(1000, ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
1285 gtid ) );
1286 pr->ordered_bumped = 0;
1287 } else {
1288 inc -= pr->ordered_bumped;
1289
1290 #ifdef KMP_DEBUG
1291 {
1292 const char * buff;
1293 // create format specifiers before the debug output
1294 buff = __kmp_str_format(
1295 "__kmp_dispatch_finish_chunk: T#%%d before wait: " \
1296 "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
1297 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1298 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, lower, upper ) );
1299 __kmp_str_free( &buff );
1300 }
1301 #endif
1302
1303 __kmp_wait_yield< UT >(&sh->u.s.ordered_iteration, lower, __kmp_ge< UT >
1304 USE_ITT_BUILD_ARG(NULL)
1305 );
1306
1307 KMP_MB(); /* is this necessary? */
1308 KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting ordered_bumped to zero\n",
1309 gtid ) );
1310 pr->ordered_bumped = 0;
1311//!!!!! TODO check if the inc should be unsigned, or signed???
1312 #ifdef KMP_DEBUG
1313 {
1314 const char * buff;
1315 // create format specifiers before the debug output
1316 buff = __kmp_str_format(
1317 "__kmp_dispatch_finish_chunk: T#%%d after wait: " \
1318 "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
1319 traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec, traits_t< UT >::spec );
1320 KD_TRACE(1000, ( buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper ) );
1321 __kmp_str_free( &buff );
1322 }
1323 #endif
1324
1325 test_then_add< ST >( (volatile ST *) & sh->u.s.ordered_iteration, inc);
1326 }
1327// }
1328 }
1329 KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid ) );
1330}
1331
1332#endif /* KMP_GOMP_COMPAT */
1333
1334template< typename T >
1335static int
1336__kmp_dispatch_next(
1337 ident_t *loc, int gtid, kmp_int32 *p_last, T *p_lb, T *p_ub, typename traits_t< T >::signed_t *p_st
1338) {
1339
1340 typedef typename traits_t< T >::unsigned_t UT;
1341 typedef typename traits_t< T >::signed_t ST;
1342 typedef typename traits_t< T >::floating_t DBL;
1343 static const int ___kmp_size_type = sizeof( UT );
1344
1345 int status;
1346 dispatch_private_info_template< T > * pr;
1347 kmp_info_t * th = __kmp_threads[ gtid ];
1348 kmp_team_t * team = th -> th.th_team;
1349
1350 KMP_DEBUG_ASSERT( p_last && p_lb && p_ub && p_st ); // AC: these cannot be NULL
1351 #ifdef KMP_DEBUG
1352 {
1353 const char * buff;
1354 // create format specifiers before the debug output
1355 buff = __kmp_str_format(
1356 "__kmp_dispatch_next: T#%%d called p_lb:%%%s p_ub:%%%s p_st:%%%s p_last: %%p\n",
1357 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1358 KD_TRACE(1000, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last ) );
1359 __kmp_str_free( &buff );
1360 }
1361 #endif
1362
1363 if ( team -> t.t_serialized ) {
1364 /* NOTE: serialize this dispatch because we are not at the active level */
1365 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1366 ( th -> th.th_dispatch -> th_disp_buffer ); /* top of the stack */
1367 KMP_DEBUG_ASSERT( pr );
1368
1369 if ( (status = (pr->u.p.tc != 0)) == 0 ) {
1370 *p_lb = 0;
1371 *p_ub = 0;
1372// if ( p_last != NULL )
1373// *p_last = 0;
1374 if ( p_st != NULL )
1375 *p_st = 0;
1376 if ( __kmp_env_consistency_check ) {
1377 if ( pr->pushed_ws != ct_none ) {
1378 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1379 }
1380 }
1381 } else if ( pr->nomerge ) {
1382 kmp_int32 last;
1383 T start;
1384 UT limit, trip, init;
1385 ST incr;
1386 T chunk = pr->u.p.parm1;
1387
1388 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid ) );
1389
1390 init = chunk * pr->u.p.count++;
1391 trip = pr->u.p.tc - 1;
1392
1393 if ( (status = (init <= trip)) == 0 ) {
1394 *p_lb = 0;
1395 *p_ub = 0;
1396// if ( p_last != NULL )
1397// *p_last = 0;
1398 if ( p_st != NULL )
1399 *p_st = 0;
1400 if ( __kmp_env_consistency_check ) {
1401 if ( pr->pushed_ws != ct_none ) {
1402 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
1403 }
1404 }
1405 } else {
1406 start = pr->u.p.lb;
1407 limit = chunk + init - 1;
1408 incr = pr->u.p.st;
1409
1410 if ( (last = (limit >= trip)) != 0 ) {
1411 limit = trip;
1412 #if KMP_OS_WINDOWS
1413 pr->u.p.last_upper = pr->u.p.ub;
1414 #endif /* KMP_OS_WINDOWS */
1415 }
1416 if ( p_last != NULL )
1417 *p_last = last;
1418 if ( p_st != NULL )
1419 *p_st = incr;
1420 if ( incr == 1 ) {
1421 *p_lb = start + init;
1422 *p_ub = start + limit;
1423 } else {
1424 *p_lb = start + init * incr;
1425 *p_ub = start + limit * incr;
1426 }
1427
1428 if ( pr->ordered ) {
1429 pr->u.p.ordered_lower = init;
1430 pr->u.p.ordered_upper = limit;
1431 #ifdef KMP_DEBUG
1432 {
1433 const char * buff;
1434 // create format specifiers before the debug output
1435 buff = __kmp_str_format(
1436 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1437 traits_t< UT >::spec, traits_t< UT >::spec );
1438 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1439 __kmp_str_free( &buff );
1440 }
1441 #endif
1442 } // if
1443 } // if
1444 } else {
1445 pr->u.p.tc = 0;
1446 *p_lb = pr->u.p.lb;
1447 *p_ub = pr->u.p.ub;
1448 #if KMP_OS_WINDOWS
1449 pr->u.p.last_upper = *p_ub;
1450 #endif /* KMP_OS_WINDOWS */
1451 if ( p_last != NULL )
1452 *p_last = TRUE;
1453 if ( p_st != NULL )
1454 *p_st = pr->u.p.st;
1455 } // if
1456 #ifdef KMP_DEBUG
1457 {
1458 const char * buff;
1459 // create format specifiers before the debug output
1460 buff = __kmp_str_format(
1461 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s " \
1462 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1463 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
1464 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status) );
1465 __kmp_str_free( &buff );
1466 }
1467 #endif
1468#if INCLUDE_SSC_MARKS
1469 SSC_MARK_DISPATCH_NEXT();
1470#endif
1471 return status;
1472 } else {
1473 kmp_int32 last = 0;
1474 dispatch_shared_info_template< UT > *sh;
1475 T start;
1476 ST incr;
1477 UT limit, trip, init;
1478
1479 KMP_DEBUG_ASSERT( th->th.th_dispatch ==
1480 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid] );
1481
1482 pr = reinterpret_cast< dispatch_private_info_template< T >* >
1483 ( th->th.th_dispatch->th_dispatch_pr_current );
1484 KMP_DEBUG_ASSERT( pr );
1485 sh = reinterpret_cast< dispatch_shared_info_template< UT >* >
1486 ( th->th.th_dispatch->th_dispatch_sh_current );
1487 KMP_DEBUG_ASSERT( sh );
1488
1489 if ( pr->u.p.tc == 0 ) {
1490 // zero trip count
1491 status = 0;
1492 } else {
1493 switch (pr->schedule) {
1494 #if ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1495 case kmp_sch_static_steal:
1496 {
1497 T chunk = pr->u.p.parm1;
1498
1499 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n", gtid) );
1500
1501 trip = pr->u.p.tc - 1;
1502
1503 if ( ___kmp_size_type > 4 ) {
1504 // Other threads do not look into the data of this thread,
1505 // so a volatile cast is not necessary.
1506 init = ( pr->u.p.count )++;
1507 status = ( init < (UT)pr->u.p.ub );
1508 } else {
1509 typedef union {
1510 struct {
1511 UT count;
1512 T ub;
1513 } p;
1514 kmp_int64 b;
1515 } union_i4;
1516 // All operations on 'count' or 'ub' must be combined atomically together.
1517 // stealing implemented only for 4-byte indexes
1518 {
1519 union_i4 vold, vnew;
1520 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1521 vnew = vold;
1522 vnew.p.count++;
1523 while( ! KMP_COMPARE_AND_STORE_ACQ64(
1524 ( volatile kmp_int64* )&pr->u.p.count,
1525 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1526 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1527 KMP_CPU_PAUSE();
1528 vold.b = *( volatile kmp_int64 * )(&pr->u.p.count);
1529 vnew = vold;
1530 vnew.p.count++;
1531 }
1532 vnew = vold;
1533 init = vnew.p.count;
1534 status = ( init < (UT)vnew.p.ub ) ;
1535 }
1536
1537 if( !status ) {
1538 kmp_info_t **other_threads = team->t.t_threads;
1539 int while_limit = 10;
1540 int while_index = 0;
1541
1542 // TODO: algorithm of searching for a victim
1543 // should be cleaned up and measured
1544 while ( ( !status ) && ( while_limit != ++while_index ) ) {
1545 union_i4 vold, vnew;
1546 kmp_int32 remaining; // kmp_int32 because KMP_I4 only
1547 T victimIdx = pr->u.p.parm4;
1548 T oldVictimIdx = victimIdx;
1549 dispatch_private_info_template< T > * victim;
1550
1551 do {
1552 if( !victimIdx ) {
1553 victimIdx = team->t.t_nproc - 1;
1554 } else {
1555 --victimIdx;
1556 }
1557 victim = reinterpret_cast< dispatch_private_info_template< T >* >
1558 ( other_threads[victimIdx]->th.th_dispatch->th_dispatch_pr_current );
1559 } while ( (victim == NULL || victim == pr) && oldVictimIdx != victimIdx );
1560 // TODO: think about a proper place of this test
1561 if ( ( !victim ) ||
1562 ( (*( volatile T * )&victim->u.p.static_steal_counter) !=
1563 (*( volatile T * )&pr->u.p.static_steal_counter) ) ) {
1564 // TODO: delay would be nice
1565 continue;
1566 // the victim is not ready yet to participate in stealing
1567 // because the victim is still in kmp_init_dispatch
1568 }
1569 if ( oldVictimIdx == victimIdx ) {
1570 break;
1571 }
1572 pr->u.p.parm4 = victimIdx;
1573
1574 while( 1 ) {
1575 vold.b = *( volatile kmp_int64 * )( &victim->u.p.count );
1576 vnew = vold;
1577
1578 KMP_DEBUG_ASSERT( (vnew.p.ub - 1) * (UT)chunk <= trip );
1579 if ( vnew.p.count >= (UT)vnew.p.ub || (remaining = vnew.p.ub - vnew.p.count) < 4 ) {
1580 break;
1581 }
1582 vnew.p.ub -= (remaining >> 2);
1583 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1584 #pragma warning( push )
1585 // disable warning on pointless comparison of unsigned with 0
1586 #pragma warning( disable: 186 )
1587 KMP_DEBUG_ASSERT(vnew.p.ub >= 0);
1588 #pragma warning( pop )
1589 // TODO: Should this be acquire or release?
1590 if ( KMP_COMPARE_AND_STORE_ACQ64(
1591 ( volatile kmp_int64 * )&victim->u.p.count,
1592 *VOLATILE_CAST(kmp_int64 *)&vold.b,
1593 *VOLATILE_CAST(kmp_int64 *)&vnew.b ) ) {
1594 status = 1;
1595 while_index = 0;
1596 // now update own count and ub
1597 #if KMP_ARCH_X86
1598 // stealing executed on non-KMP_ARCH_X86 only
1599 // Atomic 64-bit write on ia32 is
1600 // unavailable, so we do this in steps.
1601 // This code is not tested.
1602 init = vold.p.count;
1603 pr->u.p.ub = 0;
1604 pr->u.p.count = init + 1;
1605 pr->u.p.ub = vnew.p.count;
1606 #else
1607 init = vnew.p.ub;
1608 vold.p.count = init + 1;
1609 // TODO: is it safe and enough?
1610 *( volatile kmp_int64 * )(&pr->u.p.count) = vold.b;
1611 #endif // KMP_ARCH_X86
1612 break;
1613 } // if
1614 KMP_CPU_PAUSE();
1615 } // while (1)
1616 } // while
1617 } // if
1618 } // if
1619 if ( !status ) {
1620 *p_lb = 0;
1621 *p_ub = 0;
1622 if ( p_st != NULL ) *p_st = 0;
1623 } else {
1624 start = pr->u.p.parm2;
1625 init *= chunk;
1626 limit = chunk + init - 1;
1627 incr = pr->u.p.st;
1628
1629 KMP_DEBUG_ASSERT(init <= trip);
1630 if ( (last = (limit >= trip)) != 0 )
1631 limit = trip;
1632 if ( p_st != NULL ) *p_st = incr;
1633
1634 if ( incr == 1 ) {
1635 *p_lb = start + init;
1636 *p_ub = start + limit;
1637 } else {
1638 *p_lb = start + init * incr;
1639 *p_ub = start + limit * incr;
1640 }
1641
1642 if ( pr->ordered ) {
1643 pr->u.p.ordered_lower = init;
1644 pr->u.p.ordered_upper = limit;
1645 #ifdef KMP_DEBUG
1646 {
1647 const char * buff;
1648 // create format specifiers before the debug output
1649 buff = __kmp_str_format(
1650 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1651 traits_t< UT >::spec, traits_t< UT >::spec );
1652 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1653 __kmp_str_free( &buff );
1654 }
1655 #endif
1656 } // if
1657 } // if
1658 break;
1659 } // case
1660 #endif // ( KMP_STATIC_STEAL_ENABLED && KMP_ARCH_X86_64 )
1661 case kmp_sch_static_balanced:
1662 {
1663 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid) );
1664 if ( (status = !pr->u.p.count) != 0 ) { /* check if thread has any iteration to do */
1665 pr->u.p.count = 1;
1666 *p_lb = pr->u.p.lb;
1667 *p_ub = pr->u.p.ub;
1668 last = pr->u.p.parm1;
1669 if ( p_st != NULL )
1670 *p_st = pr->u.p.st;
1671 } else { /* no iterations to do */
1672 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
1673 }
1674 if ( pr->ordered ) {
1675 #ifdef KMP_DEBUG
1676 {
1677 const char * buff;
1678 // create format specifiers before the debug output
1679 buff = __kmp_str_format(
1680 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1681 traits_t< UT >::spec, traits_t< UT >::spec );
1682 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1683 __kmp_str_free( &buff );
1684 }
1685 #endif
1686 } // if
1687 } // case
1688 break;
1689 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was merged here */
1690 case kmp_sch_static_chunked:
1691 {
1692 T parm1;
1693
1694            KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_[greedy|chunked] case\n",
1695 gtid ) );
1696 parm1 = pr->u.p.parm1;
1697
1698 trip = pr->u.p.tc - 1;
1699 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1700
1701 if ( (status = (init <= trip)) != 0 ) {
1702 start = pr->u.p.lb;
1703 incr = pr->u.p.st;
1704 limit = parm1 + init - 1;
1705
1706 if ( (last = (limit >= trip)) != 0 )
1707 limit = trip;
1708
1709                if ( p_st != NULL ) *p_st = incr;
1710
1711 pr->u.p.count += team->t.t_nproc;
1712
1713 if ( incr == 1 ) {
1714 *p_lb = start + init;
1715 *p_ub = start + limit;
1716 }
1717 else {
1718 *p_lb = start + init * incr;
1719 *p_ub = start + limit * incr;
1720 }
1721
1722 if ( pr->ordered ) {
1723 pr->u.p.ordered_lower = init;
1724 pr->u.p.ordered_upper = limit;
1725 #ifdef KMP_DEBUG
1726 {
1727 const char * buff;
1728 // create format specifiers before the debug output
1729 buff = __kmp_str_format(
1730 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1731 traits_t< UT >::spec, traits_t< UT >::spec );
1732 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1733 __kmp_str_free( &buff );
1734 }
1735 #endif
1736 } // if
1737 } // if
1738 } // case
1739 break;
1740
1741 case kmp_sch_dynamic_chunked:
1742 {
1743 T chunk = pr->u.p.parm1;
1744
1745 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1746 gtid ) );
1747
1748 init = chunk * test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1749 trip = pr->u.p.tc - 1;
1750
1751 if ( (status = (init <= trip)) == 0 ) {
1752 *p_lb = 0;
1753 *p_ub = 0;
1754                if ( p_st != NULL ) *p_st = 0;
1755            } else {
1756 start = pr->u.p.lb;
1757 limit = chunk + init - 1;
1758 incr = pr->u.p.st;
1759
1760 if ( (last = (limit >= trip)) != 0 )
1761 limit = trip;
1762
1763                if ( p_st != NULL ) *p_st = incr;
1764
1765 if ( incr == 1 ) {
1766 *p_lb = start + init;
1767 *p_ub = start + limit;
1768 } else {
1769 *p_lb = start + init * incr;
1770 *p_ub = start + limit * incr;
1771 }
1772
1773 if ( pr->ordered ) {
1774 pr->u.p.ordered_lower = init;
1775 pr->u.p.ordered_upper = limit;
1776 #ifdef KMP_DEBUG
1777 {
1778 const char * buff;
1779 // create format specifiers before the debug output
1780 buff = __kmp_str_format(
1781 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1782 traits_t< UT >::spec, traits_t< UT >::spec );
1783 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1784 __kmp_str_free( &buff );
1785 }
1786 #endif
1787 } // if
1788 } // if
1789 } // case
1790 break;
1791
1792 case kmp_sch_guided_iterative_chunked:
1793 {
1794 T chunkspec = pr->u.p.parm1;
1795 KD_TRACE(100,
1796 ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked iterative case\n",gtid));
1797 trip = pr->u.p.tc;
1798 // Start atomic part of calculations
1799 while(1) {
1800 ST remaining; // signed, because can be < 0
1801 init = sh->u.s.iteration; // shared value
1802 remaining = trip - init;
1803 if ( remaining <= 0 ) { // AC: need to compare with 0 first
1804 // nothing to do, don't try atomic op
1805 status = 0;
1806 break;
1807 }
1808 if ( (T)remaining < pr->u.p.parm2 ) { // compare with K*nproc*(chunk+1), K=2 by default
1809                    // use dynamic-style schedule
1810                    // atomically increment iterations, get old value
1811 init = test_then_add<ST>( (ST*)&sh->u.s.iteration, (ST)chunkspec );
1812 remaining = trip - init;
1813 if (remaining <= 0) {
1814 status = 0; // all iterations got by other threads
1815 } else {
1816 // got some iterations to work on
1817 status = 1;
1818 if ( (T)remaining > chunkspec ) {
1819 limit = init + chunkspec - 1;
1820 } else {
1821 last = 1; // the last chunk
1822 limit = init + remaining - 1;
1823 } // if
1824 } // if
1825 break;
1826 } // if
1827 limit = init + (UT)( remaining * *(double*)&pr->u.p.parm3 ); // divide by K*nproc
1828 if ( compare_and_swap<ST>( (ST*)&sh->u.s.iteration, (ST)init, (ST)limit ) ) {
1829 // CAS was successful, chunk obtained
1830 status = 1;
1831 --limit;
1832 break;
1833 } // if
1834 } // while
1835 if ( status != 0 ) {
1836 start = pr->u.p.lb;
1837 incr = pr->u.p.st;
1838 if ( p_st != NULL )
1839 *p_st = incr;
1840            *p_lb = start + init * incr;
1841 *p_ub = start + limit * incr;
1842 if ( pr->ordered ) {
1843 pr->u.p.ordered_lower = init;
1844 pr->u.p.ordered_upper = limit;
1845 #ifdef KMP_DEBUG
1846 {
1847 const char * buff;
1848 // create format specifiers before the debug output
1849 buff = __kmp_str_format(
1850 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1851 traits_t< UT >::spec, traits_t< UT >::spec );
1852 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1853 __kmp_str_free( &buff );
1854 }
1855 #endif
1856 } // if
1857 } else {
1858 *p_lb = 0;
1859 *p_ub = 0;
1860 if ( p_st != NULL )
1861 *p_st = 0;
1862 } // if
1863 } // case
1864 break;
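        /*
         * Reader's note (added commentary, not from the original source): per the
         * in-line comments above, each successful CAS in the iterative guided case
         * claims roughly
         *     chunk_i = remaining_i * parm3,   with parm3 ~ 1/(K*nproc), K = 2,
         * i.e. about 1/(2*nproc) of the iterations still left, and the code falls
         * back to plain dynamic-style chunks of size chunkspec once fewer than
         * parm2 = K*nproc*(chunk+1) iterations remain.
         */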
1865
1866 case kmp_sch_guided_analytical_chunked:
1867 {
1868 T chunkspec = pr->u.p.parm1;
1869 UT chunkIdx;
1870 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1871 /* for storing original FPCW value for Windows* OS on
1872 IA-32 architecture 8-byte version */
1873 unsigned int oldFpcw;
1874            unsigned int fpcwSet = 0;
1875        #endif
1876 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked analytical case\n",
1877 gtid ) );
1878
1879 trip = pr->u.p.tc;
1880
1881 KMP_DEBUG_ASSERT(team->t.t_nproc > 1);
1882 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)team->t.t_nproc < trip);
1883
1884 while(1) { /* this while loop is a safeguard against unexpected zero chunk sizes */
1885 chunkIdx = test_then_inc_acq< ST >((volatile ST *) & sh->u.s.iteration );
1886 if ( chunkIdx >= (UT)pr->u.p.parm2 ) {
1887 --trip;
1888 /* use dynamic-style scheduling */
1889 init = chunkIdx * chunkspec + pr->u.p.count;
1890 /* need to verify init > 0 in case of overflow in the above calculation */
1891 if ( (status = (init > 0 && init <= trip)) != 0 ) {
1892 limit = init + chunkspec -1;
1893
1894 if ( (last = (limit >= trip)) != 0 )
1895 limit = trip;
1896 }
1897 break;
1898 } else {
1899 /* use exponential-style scheduling */
1900                    /* The following check is to work around the lack of long double precision on Windows* OS.
1901 This check works around the possible effect that init != 0 for chunkIdx == 0.
1902 */
1903 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1904 /* If we haven't already done so, save original
1905 FPCW and set precision to 64-bit, as Windows* OS
1906 on IA-32 architecture defaults to 53-bit */
1907 if ( !fpcwSet ) {
1908                        oldFpcw = _control87(0,0);
1909                        _control87(_PC_64,_MCW_PC);
1910                        fpcwSet = 0x30000;
1911 }
1912 #endif
1913 if ( chunkIdx ) {
1914 init = __kmp_dispatch_guided_remaining< T >(
1915 trip, *( DBL * )&pr->u.p.parm3, chunkIdx );
1916 KMP_DEBUG_ASSERT(init);
1917 init = trip - init;
1918 } else
1919 init = 0;
1920 limit = trip - __kmp_dispatch_guided_remaining< T >(
1921 trip, *( DBL * )&pr->u.p.parm3, chunkIdx + 1 );
1922 KMP_ASSERT(init <= limit);
1923 if ( init < limit ) {
1924 KMP_DEBUG_ASSERT(limit <= trip);
1925 --limit;
1926 status = 1;
1927 break;
1928 } // if
1929 } // if
1930 } // while (1)
1931 #if KMP_OS_WINDOWS && KMP_ARCH_X86
1932            /* restore FPCW if necessary
1933 AC: check fpcwSet flag first because oldFpcw can be uninitialized here
1934 */
1935 if ( fpcwSet && ( oldFpcw & fpcwSet ) )
1936 _control87(oldFpcw,_MCW_PC);
1937        #endif
1938 if ( status != 0 ) {
1939 start = pr->u.p.lb;
1940 incr = pr->u.p.st;
1941 if ( p_st != NULL )
1942 *p_st = incr;
1943            *p_lb = start + init * incr;
1944 *p_ub = start + limit * incr;
1945 if ( pr->ordered ) {
1946 pr->u.p.ordered_lower = init;
1947 pr->u.p.ordered_upper = limit;
1948 #ifdef KMP_DEBUG
1949 {
1950 const char * buff;
1951 // create format specifiers before the debug output
1952 buff = __kmp_str_format(
1953 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
1954 traits_t< UT >::spec, traits_t< UT >::spec );
1955 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
1956 __kmp_str_free( &buff );
1957 }
1958 #endif
1959 }
1960 } else {
1961 *p_lb = 0;
1962 *p_ub = 0;
1963 if ( p_st != NULL )
1964 *p_st = 0;
1965 }
1966 } // case
1967 break;
1968
1969 case kmp_sch_trapezoidal:
1970 {
1971 UT index;
1972 T parm2 = pr->u.p.parm2;
1973 T parm3 = pr->u.p.parm3;
1974 T parm4 = pr->u.p.parm4;
1975 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
1976 gtid ) );
1977
1978 index = test_then_inc< ST >( (volatile ST *) & sh->u.s.iteration );
1979
1980 init = ( index * ( (2*parm2) - (index-1)*parm4 ) ) / 2;
1981 trip = pr->u.p.tc - 1;
1982
1983 if ( (status = ((T)index < parm3 && init <= trip)) == 0 ) {
1984 *p_lb = 0;
1985 *p_ub = 0;
1986                if ( p_st != NULL ) *p_st = 0;
1987            } else {
1988 start = pr->u.p.lb;
1989 limit = ( (index+1) * ( 2*parm2 - index*parm4 ) ) / 2 - 1;
1990 incr = pr->u.p.st;
1991
1992 if ( (last = (limit >= trip)) != 0 )
1993 limit = trip;
1994
1995                if ( p_st != NULL ) *p_st = incr;
1996
1997 if ( incr == 1 ) {
1998 *p_lb = start + init;
1999 *p_ub = start + limit;
2000 } else {
2001 *p_lb = start + init * incr;
2002 *p_ub = start + limit * incr;
2003 }
2004
2005 if ( pr->ordered ) {
2006 pr->u.p.ordered_lower = init;
2007 pr->u.p.ordered_upper = limit;
2008 #ifdef KMP_DEBUG
2009 {
2010 const char * buff;
2011 // create format specifiers before the debug output
2012 buff = __kmp_str_format(
2013 "__kmp_dispatch_next: T#%%d ordered_lower:%%%s ordered_upper:%%%s\n",
2014 traits_t< UT >::spec, traits_t< UT >::spec );
2015 KD_TRACE(1000, ( buff, gtid, pr->u.p.ordered_lower, pr->u.p.ordered_upper ) );
2016 __kmp_str_free( &buff );
2017 }
2018 #endif
2019 } // if
2020 } // if
2021 } // case
2022 break;
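        /*
         * Reader's note (added commentary, not from the original source): the init
         * and limit expressions above are partial sums of an arithmetic series of
         * chunk sizes that shrink by parm4 per chunk, since
         *     sum_{k=0}^{index-1} (parm2 - k*parm4) = index*(2*parm2 - (index-1)*parm4)/2,
         * which is the linearly decreasing chunk pattern of trapezoid self-scheduling.
         */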
2023        default:
2024 {
2025 status = 0; // to avoid complaints on uninitialized variable use
2026 __kmp_msg(
2027 kmp_ms_fatal, // Severity
2028 KMP_MSG( UnknownSchedTypeDetected ), // Primary message
2029 KMP_HNT( GetNewerLibrary ), // Hint
2030 __kmp_msg_null // Variadic argument list terminator
2031 );
2032 }
2033 break;
2034        } // switch
2035 } // if tc == 0;
2036
2037 if ( status == 0 ) {
2038 UT num_done;
2039
2040 num_done = test_then_inc< ST >( (volatile ST *) & sh->u.s.num_done );
2041 #ifdef KMP_DEBUG
2042 {
2043 const char * buff;
2044 // create format specifiers before the debug output
2045 buff = __kmp_str_format(
2046 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2047 traits_t< UT >::spec );
2048 KD_TRACE(100, ( buff, gtid, sh->u.s.num_done ) );
2049 __kmp_str_free( &buff );
2050 }
2051 #endif
2052
2053        if ( (ST)num_done == team->t.t_nproc-1 ) {
2054            /* NOTE: release this buffer to be reused */
2055
2056 KMP_MB(); /* Flush all pending memory write invalidates. */
2057
2058 sh->u.s.num_done = 0;
2059 sh->u.s.iteration = 0;
2060
2061 /* TODO replace with general release procedure? */
2062 if ( pr->ordered ) {
2063 sh->u.s.ordered_iteration = 0;
2064 }
2065
2066 KMP_MB(); /* Flush all pending memory write invalidates. */
2067
2068 sh -> buffer_index += KMP_MAX_DISP_BUF;
2069 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2070 gtid, sh->buffer_index) );
2071
2072 KMP_MB(); /* Flush all pending memory write invalidates. */
2073
2074 } // if
2075 if ( __kmp_env_consistency_check ) {
2076 if ( pr->pushed_ws != ct_none ) {
2077 pr->pushed_ws = __kmp_pop_workshare( gtid, pr->pushed_ws, loc );
2078 }
2079 }
2080
2081 th -> th.th_dispatch -> th_deo_fcn = NULL;
2082 th -> th.th_dispatch -> th_dxo_fcn = NULL;
2083 th -> th.th_dispatch -> th_dispatch_sh_current = NULL;
2084 th -> th.th_dispatch -> th_dispatch_pr_current = NULL;
2085 } // if (status == 0)
2086#if KMP_OS_WINDOWS
2087 else if ( last ) {
2088 pr->u.p.last_upper = pr->u.p.ub;
2089 }
2090#endif /* KMP_OS_WINDOWS */
2091        if ( p_last != NULL && status != 0 )
2092            *p_last = last;
2093    } // if
2094
2095 #ifdef KMP_DEBUG
2096 {
2097 const char * buff;
2098 // create format specifiers before the debug output
2099 buff = __kmp_str_format(
2100 "__kmp_dispatch_next: T#%%d normal case: " \
2101 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2102 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec );
2103 KD_TRACE(10, ( buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status ) );
2104 __kmp_str_free( &buff );
2105 }
2106 #endif
2107#if INCLUDE_SSC_MARKS
2108 SSC_MARK_DISPATCH_NEXT();
2109#endif
2110    return status;
2111}
2112
2113template< typename T >
2114static void
2115__kmp_dist_get_bounds(
2116 ident_t *loc,
2117 kmp_int32 gtid,
2118 kmp_int32 *plastiter,
2119 T *plower,
2120 T *pupper,
2121 typename traits_t< T >::signed_t incr
2122) {
2123 KMP_COUNT_BLOCK(OMP_DISTR_FOR_dynamic);
2124 typedef typename traits_t< T >::unsigned_t UT;
2125 typedef typename traits_t< T >::signed_t ST;
2126 register kmp_uint32 team_id;
2127 register kmp_uint32 nteams;
2128 register UT trip_count;
2129 register kmp_team_t *team;
2130 kmp_info_t * th;
2131
2132 KMP_DEBUG_ASSERT( plastiter && plower && pupper );
2133 KE_TRACE( 10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2134 #ifdef KMP_DEBUG
2135 {
2136 const char * buff;
2137 // create format specifiers before the debug output
2138 buff = __kmp_str_format( "__kmpc_dist_get_bounds: T#%%d liter=%%d "\
2139 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2140 traits_t< T >::spec, traits_t< T >::spec, traits_t< ST >::spec,
2141 traits_t< T >::spec );
2142 KD_TRACE(100, ( buff, gtid, *plastiter, *plower, *pupper, incr ) );
2143 __kmp_str_free( &buff );
2144 }
2145 #endif
2146
2147 if( __kmp_env_consistency_check ) {
2148 if( incr == 0 ) {
2149 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo, loc );
2150 }
2151 if( incr > 0 ? (*pupper < *plower) : (*plower < *pupper) ) {
2152 // The loop is illegal.
2153            // Some zero-trip loops are maintained by the compiler, e.g.:
2154 // for(i=10;i<0;++i) // lower >= upper - run-time check
2155 // for(i=0;i>10;--i) // lower <= upper - run-time check
2156 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2157 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2158 // Compiler does not check the following illegal loops:
2159 // for(i=0;i<10;i+=incr) // where incr<0
2160 // for(i=10;i>0;i-=incr) // where incr<0
2161 __kmp_error_construct( kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc );
2162 }
2163 }
2164 th = __kmp_threads[gtid];
2165 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2166 team = th->th.th_team;
2167 #if OMP_40_ENABLED
2168 nteams = th->th.th_teams_size.nteams;
2169 #endif
2170 team_id = team->t.t_master_tid;
2171 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2172
2173 // compute global trip count
2174 if( incr == 1 ) {
2175 trip_count = *pupper - *plower + 1;
2176 } else if(incr == -1) {
2177 trip_count = *plower - *pupper + 1;
2178 } else {
2179 trip_count = (ST)(*pupper - *plower) / incr + 1; // cast to signed to cover incr<0 case
2180 }
2181 if( trip_count <= nteams ) {
2182 KMP_DEBUG_ASSERT(
2183 __kmp_static == kmp_sch_static_greedy || \
2184 __kmp_static == kmp_sch_static_balanced
2185 ); // Unknown static scheduling type.
2186 // only some teams get single iteration, others get nothing
2187 if( team_id < trip_count ) {
2188 *pupper = *plower = *plower + team_id * incr;
2189 } else {
2190 *plower = *pupper + incr; // zero-trip loop
2191 }
2192 if( plastiter != NULL )
2193 *plastiter = ( team_id == trip_count - 1 );
2194 } else {
2195 if( __kmp_static == kmp_sch_static_balanced ) {
2196 register UT chunk = trip_count / nteams;
2197 register UT extras = trip_count % nteams;
2198 *plower += incr * ( team_id * chunk + ( team_id < extras ? team_id : extras ) );
2199 *pupper = *plower + chunk * incr - ( team_id < extras ? 0 : incr );
2200 if( plastiter != NULL )
2201 *plastiter = ( team_id == nteams - 1 );
2202 } else {
2203 register T chunk_inc_count =
2204 ( trip_count / nteams + ( ( trip_count % nteams ) ? 1 : 0) ) * incr;
2205 register T upper = *pupper;
2206 KMP_DEBUG_ASSERT( __kmp_static == kmp_sch_static_greedy );
2207 // Unknown static scheduling type.
2208 *plower += team_id * chunk_inc_count;
2209 *pupper = *plower + chunk_inc_count - incr;
2210 // Check/correct bounds if needed
2211 if( incr > 0 ) {
2212 if( *pupper < *plower )
2213 *pupper = i_maxmin< T >::mx;
2214 if( plastiter != NULL )
2215 *plastiter = *plower <= upper && *pupper > upper - incr;
2216 if( *pupper > upper )
2217 *pupper = upper; // tracker C73258
2218 } else {
2219 if( *pupper > *plower )
2220 *pupper = i_maxmin< T >::mn;
2221 if( plastiter != NULL )
2222 *plastiter = *plower >= upper && *pupper < upper - incr;
2223 if( *pupper < upper )
2224 *pupper = upper; // tracker C73258
2225 }
2226 }
2227 }
2228}
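/*
 * Worked example (added for illustration, not from the original source),
 * assuming __kmp_static == kmp_sch_static_balanced, incr == 1, *plower == 0,
 * *pupper == 9 and nteams == 3: trip_count = 10, chunk = 3, extras = 1, so
 * team 0 is assigned [0,3], team 1 gets [4,6], team 2 gets [7,9], and only
 * team 2 (the last team) has *plastiter set.
 */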
2229
2230//-----------------------------------------------------------------------------------------
2231// Dispatch routines
2232// Transfer call to template< type T >
2233// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2234// T lb, T ub, ST st, ST chunk )
2235extern "C" {
2236
2237/*!
2238@ingroup WORK_SHARING
2239@{
2240@param loc Source location
2241@param gtid Global thread id
2242@param schedule Schedule type
2243@param lb Lower bound
2244@param ub Upper bound
2245@param st Step (or increment if you prefer)
2246@param chunk The chunk size to block with
2247
2248This function prepares the runtime to start a dynamically scheduled for loop, saving the loop arguments.
2249These functions are all identical apart from the types of the arguments.
2250*/
2251
2252void
2253__kmpc_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2254 kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2255{
2256    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2257    KMP_DEBUG_ASSERT( __kmp_init_serial );
2258 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2259}
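/*
 * Illustrative sketch (added commentary, not from the original source) of the
 * call sequence a compiler might emit for "#pragma omp for schedule(dynamic, 4)"
 * over 0 <= i < n, using the entry points in this file; 'loc', 'gtid', 'n' and
 * 'body' are assumed to be provided by the surrounding code generation:
 *
 *   kmp_int32 lb, ub, st, last;
 *   __kmpc_dispatch_init_4( &loc, gtid, kmp_sch_dynamic_chunked, 0, n - 1, 1, 4 );
 *   while ( __kmpc_dispatch_next_4( &loc, gtid, &last, &lb, &ub, &st ) ) {
 *       for ( kmp_int32 i = lb; i <= ub; i += st )
 *           body( i );
 *   }
 */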
2260/*!
2261See @ref __kmpc_dispatch_init_4
2262*/
2263void
2264__kmpc_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2265 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2266{
2267    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2268    KMP_DEBUG_ASSERT( __kmp_init_serial );
2269 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2270}
2271
2272/*!
2273See @ref __kmpc_dispatch_init_4
2274*/
2275void
2276__kmpc_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2277 kmp_int64 lb, kmp_int64 ub,
2278 kmp_int64 st, kmp_int64 chunk )
2279{
2280    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2281    KMP_DEBUG_ASSERT( __kmp_init_serial );
2282 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2283}
2284
2285/*!
2286See @ref __kmpc_dispatch_init_4
2287*/
2288void
2289__kmpc_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2290 kmp_uint64 lb, kmp_uint64 ub,
2291 kmp_int64 st, kmp_int64 chunk )
2292{
2293    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2294    KMP_DEBUG_ASSERT( __kmp_init_serial );
2295 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2296}
2297
2298/*!
2299See @ref __kmpc_dispatch_init_4
2300
2301These functions differ from the __kmpc_dispatch_init set in that they are called
2302for the composite distribute parallel for construct; thus, before dispatching the
2303regular iterations, the per-team iteration space must be computed.
2304
2305These functions are all identical apart from the types of the arguments.
2306*/
2307void
2308__kmpc_dist_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2309 kmp_int32 *p_last, kmp_int32 lb, kmp_int32 ub, kmp_int32 st, kmp_int32 chunk )
2310{
2311 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2312 KMP_DEBUG_ASSERT( __kmp_init_serial );
2313 __kmp_dist_get_bounds< kmp_int32 >( loc, gtid, p_last, &lb, &ub, st );
2314 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2315}
2316
2317void
2318__kmpc_dist_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2319 kmp_int32 *p_last, kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk )
2320{
2321 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2322 KMP_DEBUG_ASSERT( __kmp_init_serial );
2323 __kmp_dist_get_bounds< kmp_uint32 >( loc, gtid, p_last, &lb, &ub, st );
2324 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2325}
2326
2327void
2328__kmpc_dist_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2329 kmp_int32 *p_last, kmp_int64 lb, kmp_int64 ub, kmp_int64 st, kmp_int64 chunk )
2330{
2331 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2332 KMP_DEBUG_ASSERT( __kmp_init_serial );
2333 __kmp_dist_get_bounds< kmp_int64 >( loc, gtid, p_last, &lb, &ub, st );
2334 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2335}
2336
2337void
2338__kmpc_dist_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2339 kmp_int32 *p_last, kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk )
2340{
2341 KMP_COUNT_BLOCK(OMP_FOR_dynamic);
2342 KMP_DEBUG_ASSERT( __kmp_init_serial );
2343 __kmp_dist_get_bounds< kmp_uint64 >( loc, gtid, p_last, &lb, &ub, st );
2344 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk, true );
2345}
2346
2347/*!
2348@param loc Source code location
2349@param gtid Global thread id
2350@param p_last Pointer to a flag set to one if this is the last chunk or zero otherwise
2351@param p_lb Pointer to the lower bound for the next chunk of work
2352@param p_ub Pointer to the upper bound for the next chunk of work
2353@param p_st Pointer to the stride for the next chunk of work
2354@return one if there is work to be done, zero otherwise
2355
2356Get the next dynamically allocated chunk of work for this thread.
2357If there is no more work, then the lb,ub and stride need not be modified.
2358*/
2359int
2360__kmpc_dispatch_next_4( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2361 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st )
2362{
2363 return __kmp_dispatch_next< kmp_int32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2364}
2365
2366/*!
2367See @ref __kmpc_dispatch_next_4
2368*/
2369int
2370__kmpc_dispatch_next_4u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2371 kmp_uint32 *p_lb, kmp_uint32 *p_ub, kmp_int32 *p_st )
2372{
2373 return __kmp_dispatch_next< kmp_uint32 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2374}
2375
2376/*!
2377See @ref __kmpc_dispatch_next_4
2378*/
2379int
2380__kmpc_dispatch_next_8( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2381 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st )
2382{
2383 return __kmp_dispatch_next< kmp_int64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2384}
2385
2386/*!
2387See @ref __kmpc_dispatch_next_4
2388*/
2389int
2390__kmpc_dispatch_next_8u( ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2391 kmp_uint64 *p_lb, kmp_uint64 *p_ub, kmp_int64 *p_st )
2392{
2393 return __kmp_dispatch_next< kmp_uint64 >( loc, gtid, p_last, p_lb, p_ub, p_st );
2394}
2395
2396/*!
2397@param loc Source code location
2398@param gtid Global thread id
2399
2400Mark the end of a dynamic loop.
2401*/
2402void
2403__kmpc_dispatch_fini_4( ident_t *loc, kmp_int32 gtid )
2404{
2405 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2406}
2407
2408/*!
2409See @ref __kmpc_dispatch_fini_4
2410*/
2411void
2412__kmpc_dispatch_fini_8( ident_t *loc, kmp_int32 gtid )
2413{
2414 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2415}
2416
2417/*!
2418See @ref __kmpc_dispatch_fini_4
2419*/
2420void
2421__kmpc_dispatch_fini_4u( ident_t *loc, kmp_int32 gtid )
2422{
2423 __kmp_dispatch_finish< kmp_uint32 >( gtid, loc );
2424}
2425
2426/*!
2427See @ref __kmpc_dispatch_fini_4
2428*/
2429void
2430__kmpc_dispatch_fini_8u( ident_t *loc, kmp_int32 gtid )
2431{
2432 __kmp_dispatch_finish< kmp_uint64 >( gtid, loc );
2433}
2434/*! @} */
2435
2436//-----------------------------------------------------------------------------------------
2437// Non-template routines from kmp_dispatch.cpp used in other sources
2438
2439kmp_uint32 __kmp_eq_4( kmp_uint32 value, kmp_uint32 checker) {
2440 return value == checker;
2441}
2442
2443kmp_uint32 __kmp_neq_4( kmp_uint32 value, kmp_uint32 checker) {
2444 return value != checker;
2445}
2446
2447kmp_uint32 __kmp_lt_4( kmp_uint32 value, kmp_uint32 checker) {
2448 return value < checker;
2449}
2450
2451kmp_uint32 __kmp_ge_4( kmp_uint32 value, kmp_uint32 checker) {
2452 return value >= checker;
2453}
2454
2455kmp_uint32 __kmp_le_4( kmp_uint32 value, kmp_uint32 checker) {
2456 return value <= checker;
2457}
2458kmp_uint32 __kmp_eq_8( kmp_uint64 value, kmp_uint64 checker) {
2459 return value == checker;
2460}
2461
2462kmp_uint32 __kmp_neq_8( kmp_uint64 value, kmp_uint64 checker) {
2463 return value != checker;
2464}
2465
2466kmp_uint32 __kmp_lt_8( kmp_uint64 value, kmp_uint64 checker) {
2467 return value < checker;
2468}
2469
2470kmp_uint32 __kmp_ge_8( kmp_uint64 value, kmp_uint64 checker) {
2471 return value >= checker;
2472}
2473
2474kmp_uint32 __kmp_le_8( kmp_uint64 value, kmp_uint64 checker) {
2475 return value <= checker;
2476}
2477
2478kmp_uint32
2479__kmp_wait_yield_4(volatile kmp_uint32 * spinner,
2480 kmp_uint32 checker,
2481 kmp_uint32 (* pred)( kmp_uint32, kmp_uint32 )
2482 , void * obj // Higher-level synchronization object, or NULL.
2483 )
2484{
2485 // note: we may not belong to a team at this point
2486 register volatile kmp_uint32 * spin = spinner;
2487 register kmp_uint32 check = checker;
2488 register kmp_uint32 spins;
2489 register kmp_uint32 (*f) ( kmp_uint32, kmp_uint32 ) = pred;
2490 register kmp_uint32 r;
2491
2492 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2493 KMP_INIT_YIELD( spins );
2494 // main wait spin loop
2495 while(!f(r = TCR_4(*spin), check)) {
2496 KMP_FSYNC_SPIN_PREPARE( obj );
2497 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2498 It causes problems with infinite recursion because of exit lock */
2499 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2500 __kmp_abort_thread(); */
2501
2502        /* if we have waited a bit, or are oversubscribed, yield */
2503 /* pause is in the following code */
2504 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2505 KMP_YIELD_SPIN( spins );
2506 }
2507 KMP_FSYNC_SPIN_ACQUIRED( obj );
2508 return r;
2509}
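/*
 * Illustrative use (added commentary, not from the original source): spin until
 * another thread advances a 32-bit counter to at least an expected value,
 * re-using one of the predicate helpers above; 'flag' and 'expected' are
 * hypothetical names:
 *
 *   kmp_uint32 seen = __kmp_wait_yield_4( &flag, expected, __kmp_ge_4, NULL );
 */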
2510
2511kmp_uint64
2512__kmp_wait_yield_8( volatile kmp_uint64 * spinner,
2513 kmp_uint64 checker,
2514 kmp_uint32 (* pred)( kmp_uint64, kmp_uint64 )
2515 , void * obj // Higher-level synchronization object, or NULL.
2516 )
2517{
2518 // note: we may not belong to a team at this point
2519 register volatile kmp_uint64 * spin = spinner;
2520 register kmp_uint64 check = checker;
2521 register kmp_uint32 spins;
2522 register kmp_uint32 (*f) ( kmp_uint64, kmp_uint64 ) = pred;
2523 register kmp_uint64 r;
2524
2525 KMP_FSYNC_SPIN_INIT( obj, (void*) spin );
2526 KMP_INIT_YIELD( spins );
2527 // main wait spin loop
2528 while(!f(r = *spin, check))
2529 {
2530 KMP_FSYNC_SPIN_PREPARE( obj );
2531 /* GEH - remove this since it was accidentally introduced when kmp_wait was split.
2532 It causes problems with infinite recursion because of exit lock */
2533 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2534 __kmp_abort_thread(); */
2535
2536        // if we are oversubscribed,
2537        // or have waited a bit (and KMP_LIBRARY=throughput), then yield
2538 // pause is in the following code
2539 KMP_YIELD( TCR_4(__kmp_nth) > __kmp_avail_proc );
2540 KMP_YIELD_SPIN( spins );
2541 }
2542 KMP_FSYNC_SPIN_ACQUIRED( obj );
2543 return r;
2544}
2545
2546} // extern "C"
2547
2548#ifdef KMP_GOMP_COMPAT
2549
2550void
2551__kmp_aux_dispatch_init_4( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2552 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2553 kmp_int32 chunk, int push_ws )
2554{
2555 __kmp_dispatch_init< kmp_int32 >( loc, gtid, schedule, lb, ub, st, chunk,
2556 push_ws );
2557}
2558
2559void
2560__kmp_aux_dispatch_init_4u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2561 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2562 kmp_int32 chunk, int push_ws )
2563{
2564 __kmp_dispatch_init< kmp_uint32 >( loc, gtid, schedule, lb, ub, st, chunk,
2565 push_ws );
2566}
2567
2568void
2569__kmp_aux_dispatch_init_8( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2570 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2571 kmp_int64 chunk, int push_ws )
2572{
2573 __kmp_dispatch_init< kmp_int64 >( loc, gtid, schedule, lb, ub, st, chunk,
2574 push_ws );
2575}
2576
2577void
2578__kmp_aux_dispatch_init_8u( ident_t *loc, kmp_int32 gtid, enum sched_type schedule,
2579 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2580 kmp_int64 chunk, int push_ws )
2581{
2582 __kmp_dispatch_init< kmp_uint64 >( loc, gtid, schedule, lb, ub, st, chunk,
2583 push_ws );
2584}
2585
2586void
2587__kmp_aux_dispatch_fini_chunk_4( ident_t *loc, kmp_int32 gtid )
2588{
2589 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2590}
2591
2592void
2593__kmp_aux_dispatch_fini_chunk_8( ident_t *loc, kmp_int32 gtid )
2594{
2595 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2596}
2597
2598void
2599__kmp_aux_dispatch_fini_chunk_4u( ident_t *loc, kmp_int32 gtid )
2600{
2601 __kmp_dispatch_finish_chunk< kmp_uint32 >( gtid, loc );
2602}
2603
2604void
2605__kmp_aux_dispatch_fini_chunk_8u( ident_t *loc, kmp_int32 gtid )
2606{
2607 __kmp_dispatch_finish_chunk< kmp_uint64 >( gtid, loc );
2608}
2609
2610#endif /* KMP_GOMP_COMPAT */
2611
2612/* ------------------------------------------------------------------------ */
2613/* ------------------------------------------------------------------------ */
2614