/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 */

// Need to raise Win version from XP to Vista here for support of
// InterlockedExchange64
#if defined(_WIN32_WINNT) && defined(_M_IX86)
#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0502
#endif

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  T static_steal_counter; // for static_steal only; maybe better to put after ub

  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
  //    a) parm3 is properly aligned and
  //    b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same line (not measured though).

  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#endif /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info structure and dispatch_private_info_t type
template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise size of structure is not correct in
  // our compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_uint32 ordered; /* ordered clause specified */
  kmp_uint32 ordered_bumped;
  // To retain the structure size after making ordered_iteration scalar
  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 nomerge; /* don't merge iters if serialized */
  kmp_uint32 type_size;
  enum cons_type pushed_ws;
};

// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename UT> struct dispatch_shared_infoXX_template {
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile UT num_done;
  volatile UT ordered_iteration;
  // to retain the structure size making ordered_iteration scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename UT> struct dispatch_shared_info_template {
  // we need union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
#if OMP_45_ENABLED
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(p, d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(p, d);
  return r;
}

// test_then_inc_acq template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(p);
  return r;
}

// test_then_inc template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(p);
  return r;
}

// compare_and_swap template (general template should NOT be used)
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}

/* Spin wait loop that first does pause, then yield.
   Waits until function returns non-zero when called with *spinner and check.
   Does NOT put threads to sleep.
#if USE_ITT_BUILD
   Arguments:
   obj -- is a higher-level synchronization object to report to ittnotify.
          It is used to report locks consistently. For example, if a lock is
          acquired immediately, its address is reported to ittnotify via
          KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
          immediately and the lock routine calls KMP_WAIT_YIELD(), the latter
          should report the same address, not the address of the low-level
          spinner.
#endif // USE_ITT_BUILD
*/
template <typename UT>
// ToDo: make inline function (move to header file for icl)
static UT // unsigned 4- or 8-byte type
    __kmp_wait_yield(
        volatile UT *spinner, UT checker,
        kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(
            void *obj) // Higher-level synchronization object, or NULL.
        ) {
  // note: we may not belong to a team at this point
  register volatile UT *spin = spinner;
  register UT check = checker;
  register kmp_uint32 spins;
  register kmp_uint32 (*f)(UT, UT) = pred;
  register UT r;

  KMP_FSYNC_SPIN_INIT(obj, (void *)spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split. It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
       __kmp_abort_thread(); */

    // if we are oversubscribed, or have waited a bit (and
    // KMP_LIBRARY=throughput), then yield. pause is in the following code
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}

template <typename UT> static kmp_uint32 __kmp_eq(UT value, UT checker) {
  return value == checker;
}

template <typename UT> static kmp_uint32 __kmp_neq(UT value, UT checker) {
  return value != checker;
}

template <typename UT> static kmp_uint32 __kmp_lt(UT value, UT checker) {
  return value < checker;
}

template <typename UT> static kmp_uint32 __kmp_ge(UT value, UT checker) {
  return value >= checker;
}

template <typename UT> static kmp_uint32 __kmp_le(UT value, UT checker) {
  return value <= checker;
}

/* ------------------------------------------------------------------------ */

static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref,
                                     ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}

template <typename UT>
static void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      const char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif

    __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                         __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      const char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}

static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref,
                                     ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}

template <typename UT>
static void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(&sh->u.s.ordered_iteration);
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates. */

    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}

// Computes and returns x to the power of y, where y must be a non-negative
// integer.
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}
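// A quick worked illustration of the square-and-multiply loop above (example
// values only): __kmp_pow(0.5, 3) sees y = 3 = 0b11, so s = 0.5 after the
// first bit, then x = 0.25 and s = 0.5 * 0.25 = 0.125, which is 0.5^3.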

/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks with
   index greater than or equal to idx). __forceinline seems to be broken so that
   if we __forceinline this function, the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails) */
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
     ICL 8.1, long double arithmetic may not really have long double precision,
     even with /Qlong_double. Currently, we workaround that in the caller code,
     by manipulating the FPCW for Windows* OS on IA-32 architecture. The lack
     of precision is not expected to be a correctness issue, though. */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r)
    return r;
  return r + 1;
}
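// Worked illustration (example values only): with tc = 100, base = 0.75
// (i.e. (2*nproc - 1)/(2*nproc) for nproc = 2) and idx = 10 chunks already
// assigned, 100 * 0.75^10 = 5.63..., so the function returns 6 still
// unassigned iterations.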

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 ) // point of switching to dynamic
//   p3 = 1 / ( n * nproc )         // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be
// flatter.
// With n = 1 the first chunk is the same as for a static schedule, i.e.
// trip / nproc.
static int guided_int_param = 2;
static double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
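// Worked illustration (example values only): with the default n = 2,
// nproc = 4 and chunk = 7, a thread switches to plain dynamic scheduling once
// fewer than p2 = 2 * 4 * (7 + 1) = 64 iterations remain, and each guided
// chunk is roughly p3 = 1 / (2 * 4) = 0.125 of the remaining iterations
// (never smaller than chunk).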

// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<UT> volatile *sh;

  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
  {
    const char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

#if USE_ITT_BUILD
  kmp_uint64 cur_chunk = chunk;
  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
                                    __kmp_forkjoin_frames_mode == 3 &&
                                    KMP_MASTER_GTID(gtid) &&
#if OMP_40_ENABLED
                                    th->th.th_teams_microtask == NULL &&
#endif
                                    team->t.t_active_level == 1;
#endif
  if (!active) {
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
  } else {
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    /* What happens when number of threads changes, need to resize buffer? */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        &th->th.th_dispatch
             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    sh = reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  }

#if (KMP_STATIC_STEAL_ENABLED)
  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    // AC: we now have only one implementation of stealing, so use it
    schedule = kmp_sch_static_steal;
  else
#endif
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->nomerge = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
  } else {
    pr->nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->ordered = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
  } else {
    pr->ordered = FALSE;
  }

  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else {
    if (schedule == kmp_sch_runtime) {
      // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
      // not specified)
      schedule = team->t.t_sched.r_sched_type;
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      } else if (schedule == kmp_sch_static) {
        schedule = __kmp_static;
      }
      // Use the chunk size specified by OMP_SCHEDULE (or default if not
      // specified)
      chunk = team->t.t_sched.chunk;
#if USE_ITT_BUILD
      cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    } else {
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      }
      if (chunk <= 0) {
        chunk = KMP_DEFAULT_CHUNK;
      }
    }

    if (schedule == kmp_sch_auto) {
      // mapping and differentiation: in the __kmp_do_serial_initialize()
      schedule = __kmp_auto;
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init: kmp_sch_auto: T#%%d new: "
                                "schedule:%%d chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }

    /* guided analytical not safe for too many threads */
    if (schedule == kmp_sch_guided_analytical_chunked &&
        th->th.th_team_nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");

  pr->u.p.count = 0;

  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
  // compute trip count
  if (st == 1) { // most common case
    if (ub >= lb) {
      tc = ub - lb + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  } else if (st < 0) {
    if (lb >= ub) {
      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(lb - ub) / (-st) + 1;
    } else { // lb < ub
      tc = 0; // zero-trip
    }
  } else { // st > 0
    if (ub >= lb) {
      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(ub - lb) / st + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  }
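  // Worked illustration (example values only): a loop with lb = 10, ub = 1 and
  // st = -2 runs i = 10, 8, 6, 4, 2, and the branch above computes
  // tc = (10 - 1) / 2 + 1 = 5.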

  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing when statistics are disabled.
  if (schedule == __kmp_static) {
    KMP_COUNT_BLOCK(OMP_FOR_static);
    KMP_COUNT_VALUE(FOR_static_iterations, tc);
  } else {
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
  }

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif /* KMP_OS_WINDOWS */

  /* NOTE: only the active parallel region(s) has active ordered sections */

  if (active) {
    if (pr->ordered == 0) {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
    } else {
      pr->ordered_bumped = 0;

      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;

      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
    }
  }

  if (__kmp_env_consistency_check) {
    enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
    if (push_ws) {
      __kmp_push_workshare(gtid, ws, loc);
      pr->pushed_ws = ws;
    } else {
      __kmp_check_workshare(gtid, ws, loc);
      pr->pushed_ws = ct_none;
    }
  }

  switch (schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T nproc = th->th.th_team_nproc;
    T ntc, init;

    KD_TRACE(100,
             ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_FOR_static_steal);
      T id = __kmp_tid_from_gtid(gtid);
      T small_chunk, extras;

      small_chunk = ntc / nproc;
      extras = ntc % nproc;

      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);

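      // Worked illustration (example values only): tc = 100 and chunk = 7 give
      // ntc = 15 chunks; with nproc = 3, small_chunk = 5 and extras = 0, so
      // thread id 1 starts at chunk index init = 5 and initially owns chunk
      // indices [5, 10) until another thread steals from it.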
791 pr->u.p.parm2 = lb;
792 // pr->pfields.parm3 = 0; // it's not used in static_steal
793 pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
794 pr->u.p.st = st;
795 if (traits_t<T>::type_size > 4) {
796 // AC: TODO: check if 16-byte CAS available and use it to
797 // improve performance (probably wait for explicit request
798 // before spending time on this).
799 // For now use dynamically allocated per-thread lock,
800 // free memory in __kmp_dispatch_next when status==0.
801 KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
802 th->th.th_dispatch->th_steal_lock =
803 (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
804 __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
805 }
806 break;
807 } else {
808 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
809 "kmp_sch_static_balanced\n",
810 gtid));
811 schedule = kmp_sch_static_balanced;
812 /* too few iterations: fall-through to kmp_sch_static_balanced */
813 } // if
814 /* FALL-THROUGH to static balanced */
815 } // case
816#endif
817 case kmp_sch_static_balanced: {
818 T nproc = th->th.th_team_nproc;
819 T init, limit;
820
821 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
822 gtid));
823
824 if (nproc > 1) {
825 T id = __kmp_tid_from_gtid(gtid);
826
827 if (tc < nproc) {
828 if (id < tc) {
829 init = id;
830 limit = id;
831 pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
832 } else {
833 pr->u.p.count = 1; /* means no more chunks to execute */
834 pr->u.p.parm1 = FALSE;
835 break;
836 }
837 } else {
838 T small_chunk = tc / nproc;
839 T extras = tc % nproc;
840 init = id * small_chunk + (id < extras ? id : extras);
841 limit = init + small_chunk - (id < extras ? 0 : 1);
842 pr->u.p.parm1 = (id == nproc - 1);
843 }
844 } else {
845 if (tc > 0) {
846 init = 0;
847 limit = tc - 1;
848 pr->u.p.parm1 = TRUE;
849 } else { // zero trip count
850 pr->u.p.count = 1; /* means no more chunks to execute */
851 pr->u.p.parm1 = FALSE;
852 break;
853 }
854 }
855#if USE_ITT_BUILD
856 // Calculate chunk for metadata report
857 if (itt_need_metadata_reporting)
858 cur_chunk = limit - init + 1;
859#endif
860 if (st == 1) {
861 pr->u.p.lb = lb + init;
862 pr->u.p.ub = lb + limit;
863 } else {
864 // calculated upper bound, "ub" is user-defined upper bound
865 T ub_tmp = lb + limit * st;
866 pr->u.p.lb = lb + init * st;
867 // adjust upper bound to "ub" if needed, so that MS lastprivate will match
868 // it exactly
869 if (st > 0) {
870 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
871 } else {
872 pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
873 }
874 }
875 if (pr->ordered) {
876 pr->u.p.ordered_lower = init;
877 pr->u.p.ordered_upper = limit;
878 }
879 break;
880 } // case
881 case kmp_sch_guided_iterative_chunked: {
882 T nproc = th->th.th_team_nproc;
883 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked"
884 " case\n",
885 gtid));
886
887 if (nproc > 1) {
888 if ((2L * chunk + 1) * nproc >= tc) {
889 /* chunk size too large, switch to dynamic */
890 schedule = kmp_sch_dynamic_chunked;
891 } else {
892 // when remaining iters become less than parm2 - switch to dynamic
893 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
894 *(double *)&pr->u.p.parm3 =
895 guided_flt_param / nproc; // may occupy parm3 and parm4
896 }
897 } else {
898 KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
899 "kmp_sch_static_greedy\n",
900 gtid));
901 schedule = kmp_sch_static_greedy;
902 /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
903 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",
904 gtid));
905 pr->u.p.parm1 = tc;
906 } // if
907 } // case
908 break;
909 case kmp_sch_guided_analytical_chunked: {
910 T nproc = th->th.th_team_nproc;
911 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked"
912 " case\n",
913 gtid));
914 if (nproc > 1) {
915 if ((2L * chunk + 1) * nproc >= tc) {
916 /* chunk size too large, switch to dynamic */
917 schedule = kmp_sch_dynamic_chunked;
918 } else {
919 /* commonly used term: (2 nproc - 1)/(2 nproc) */
920 DBL x;
921
922#if KMP_OS_WINDOWS && KMP_ARCH_X86
923 /* Linux* OS already has 64-bit computation by default for long double,
924 and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
925 Windows* OS on IA-32 architecture, we need to set precision to 64-bit
926 instead of the default 53-bit. Even though long double doesn't work
927 on Windows* OS on Intel(R) 64, the resulting lack of precision is not
928 expected to impact the correctness of the algorithm, but this has not
929 been mathematically proven. */
930 // save original FPCW and set precision to 64-bit, as
931 // Windows* OS on IA-32 architecture defaults to 53-bit
932 unsigned int oldFpcw = _control87(0, 0);
933 _control87(_PC_64, _MCW_PC); // 0,0x30000
934#endif
935 /* value used for comparison in solver for cross-over point */
936 long double target = ((long double)chunk * 2 + 1) * nproc / tc;
937
938 /* crossover point--chunk indexes equal to or greater than
939 this point switch to dynamic-style scheduling */
940 UT cross;
941
942 /* commonly used term: (2 nproc - 1)/(2 nproc) */
943 x = (long double)1.0 - (long double)0.5 / nproc;
944
945#ifdef KMP_DEBUG
946 { // test natural alignment
947 struct _test_a {
948 char a;
949 union {
950 char b;
951 DBL d;
952 };
953 } t;
954 ptrdiff_t natural_alignment =
955 (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
956 //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
957 // long)natural_alignment );
958 KMP_DEBUG_ASSERT(
959 (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
960 }
961#endif // KMP_DEBUG
962
963 /* save the term in thread private dispatch structure */
964 *(DBL *)&pr->u.p.parm3 = x;
965
966 /* solve for the crossover point to the nearest integer i for which C_i
967 <= chunk */
968 {
969 UT left, right, mid;
970 long double p;
971
972 /* estimate initial upper and lower bound */
973
974 /* doesn't matter what value right is as long as it is positive, but
975 it affects performance of the solver */
976 right = 229;
977 p = __kmp_pow<UT>(x, right);
978 if (p > target) {
979 do {
980 p *= p;
981 right <<= 1;
982 } while (p > target && right < (1 << 27));
983 /* lower bound is previous (failed) estimate of upper bound */
984 left = right >> 1;
985 } else {
986 left = 0;
987 }
988
989 /* bisection root-finding method */
990 while (left + 1 < right) {
991 mid = (left + right) / 2;
992 if (__kmp_pow<UT>(x, mid) > target) {
993 left = mid;
994 } else {
995 right = mid;
996 }
997 } // while
998 cross = right;
999 }
1000 /* assert sanity of computed crossover point */
1001 KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
1002 __kmp_pow<UT>(x, cross) <= target);
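        // Worked illustration (example values only): with nproc = 2, chunk = 1
        // and tc = 100, x = 1 - 0.5/2 = 0.75 and target = (2*1 + 1)*2/100 =
        // 0.06; since 0.75^9 ~ 0.075 > 0.06 >= 0.75^10 ~ 0.056, the solver
        // yields cross = 10, i.e. chunk indices >= 10 use dynamic-style
        // scheduling.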

        /* save the crossover point in thread private dispatch structure */
        pr->u.p.parm2 = cross;

// C75803
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
                                 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_OS_WINDOWS && KMP_ARCH_X86
        // restore FPCW
        _control87(oldFpcw, _MCW_PC);
#endif
      } // if
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_static_greedy:
    KD_TRACE(100,
             ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", gtid));
    pr->u.p.parm1 = (th->th.th_team_nproc > 1)
                        ? (tc + th->th.th_team_nproc - 1) / th->th.th_team_nproc
                        : tc;
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
    if (pr->u.p.parm1 <= 0) {
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    }
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */

    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid));

    parm1 = chunk;

    /* F : size of the first cycle */
    parm2 = (tc / (2 * th->th.th_team_nproc));

    if (parm2 < 1) {
      parm2 = 1;
    }

    /* L : size of the last cycle.  Make sure the last cycle is not larger
       than the first cycle. */
    if (parm1 < 1) {
      parm1 = 1;
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }

    /* N : number of cycles */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;

    if (parm3 < 2) {
      parm3 = 2;
    }

    /* sigma : decreasing incr of the trapezoid */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;

    // pointless check, because parm4 >= 0 always
    // if ( parm4 < 0 ) {
    //   parm4 = 0;
    // }

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
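    // Worked illustration (example values only): tc = 100, nproc = 2 and
    // chunk = 1 give parm2 = 100/(2*2) = 25 (first cycle), parm1 = 1 (last
    // cycle), parm3 = (200 + 26 - 1)/26 = 8 cycles and parm4 = (25 - 1)/7 = 3,
    // so successive cycles shrink roughly 25, 22, 19, ... toward the minimum.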
  } // case
  break;

  default: {
    __kmp_msg(kmp_ms_fatal, // Severity
              KMP_MSG(UnknownSchedTypeDetected), // Primary message
              KMP_HNT(GetNewerLibrary), // Hint
              __kmp_msg_null // Variadic argument list terminator
              );
  } break;
  } // switch
  pr->schedule = schedule;
  if (active) {
    /* The name of this buffer should be my_buffer_index when it's free to use
     * it */

    KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                                 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and
    // my_buffer_index are *always* 32-bit integers.
    KMP_MB(); /* is this necessary? */
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current = (dispatch_shared_info_t *)sh;
#if USE_ITT_BUILD
    if (pr->ordered) {
      __kmp_itt_ordered_init(gtid);
    }; // if
    // Report loop metadata
    if (itt_need_metadata_reporting) {
      // Only report metadata by master of active team at level 1
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
        schedtype = 2;
        break;
      default:
        // Should we put this case under "static"?
        // case kmp_sch_static_steal:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
    }
#endif /* USE_ITT_BUILD */
  }; // if

#ifdef KMP_DEBUG
  {
    const char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
                  pr->u.p.st, pr->u.p.tc, pr->u.p.count, pr->u.p.ordered_lower,
                  pr->u.p.ordered_upper, pr->u.p.parm1, pr->u.p.parm2,
                  pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 were the same, there could still be a bad case like using 0 and 1
  // rather than a program life-time increment. So a dedicated variable is
  // required; the 'static_steal_counter' is used.
  if (schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // This is a flag showing that other threads may steal from this thread
    // from now on.
    volatile T *p = &pr->u.p.static_steal_counter;
    *p = *p + 1;
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_TRACE
  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
    ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
        team_info->parallel_id, task_info->task_id, team_info->microtask);
  }
#endif
}

/* For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations.  If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed. */
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    } // if
  } // if
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}

#ifdef KMP_GOMP_COMPAT

template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    // int cid;
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;

    if (pr->ordered_bumped == inc) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
//!!!!! TODO check if the inc should be unsigned, or signed???
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
    // }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}

#endif /* KMP_GOMP_COMPAT */

/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
   work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
   is not called. */
#if OMPT_SUPPORT && OMPT_TRACE
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_end)) {   \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_taskinfo(0);                    \
      ompt_callbacks.ompt_callback(ompt_event_loop_end)(                       \
          team_info->parallel_id, task_info->task_id);                         \
    }                                                                          \
  }
#else
#define OMPT_LOOP_END // no-op
#endif

template <typename T>
static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
                               T *p_lb, T *p_ub,
                               typename traits_t<T>::signed_t *p_st) {

  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;

  // This is potentially slightly misleading, schedule(runtime) will appear here
  // even if the actual runtime schedule is static. (Which points out a
  // disadvantage of schedule(runtime): even when static scheduling is used it
  // costs more than a compile time choice to use static scheduling would.)
  KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);

  int status;
  dispatch_private_info_template<T> *pr;
  kmp_info_t *th = __kmp_threads[gtid];
  kmp_team_t *team = th->th.th_team;

  KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
#ifdef KMP_DEBUG
  {
    const char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_next: T#%%d called p_lb:%%%s "
                            "p_ub:%%%s p_st:%%%s p_last: %%p\n",
                            traits_t<T>::spec, traits_t<T>::spec,
                            traits_t<ST>::spec);
    KD_TRACE(1000, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last));
    __kmp_str_free(&buff);
  }
#endif

  if (team->t.t_serialized) {
    /* NOTE: serialize this dispatch because we are not at the active level */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
    KMP_DEBUG_ASSERT(pr);

    if ((status = (pr->u.p.tc != 0)) == 0) {
      *p_lb = 0;
      *p_ub = 0;
      // if ( p_last != NULL )
      //   *p_last = 0;
      if (p_st != NULL)
        *p_st = 0;
      if (__kmp_env_consistency_check) {
        if (pr->pushed_ws != ct_none) {
          pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
        }
      }
    } else if (pr->nomerge) {
      kmp_int32 last;
      T start;
      UT limit, trip, init;
      ST incr;
      T chunk = pr->u.p.parm1;

      KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
                     gtid));

      init = chunk * pr->u.p.count++;
      trip = pr->u.p.tc - 1;

      if ((status = (init <= trip)) == 0) {
        *p_lb = 0;
        *p_ub = 0;
        // if ( p_last != NULL )
        //   *p_last = 0;
        if (p_st != NULL)
          *p_st = 0;
        if (__kmp_env_consistency_check) {
          if (pr->pushed_ws != ct_none) {
            pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
          }
        }
      } else {
        start = pr->u.p.lb;
        limit = chunk + init - 1;
        incr = pr->u.p.st;

        if ((last = (limit >= trip)) != 0) {
          limit = trip;
#if KMP_OS_WINDOWS
          pr->u.p.last_upper = pr->u.p.ub;
#endif /* KMP_OS_WINDOWS */
        }
        if (p_last != NULL)
          *p_last = last;
        if (p_st != NULL)
          *p_st = incr;
        if (incr == 1) {
          *p_lb = start + init;
          *p_ub = start + limit;
        } else {
          *p_lb = start + init * incr;
          *p_ub = start + limit * incr;
        }

        if (pr->ordered) {
          pr->u.p.ordered_lower = init;
          pr->u.p.ordered_upper = limit;
#ifdef KMP_DEBUG
          {
            const char *buff;
            // create format specifiers before the debug output
            buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
                                    "ordered_lower:%%%s ordered_upper:%%%s\n",
                                    traits_t<UT>::spec, traits_t<UT>::spec);
            KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
                            pr->u.p.ordered_upper));
            __kmp_str_free(&buff);
          }
#endif
        } // if
      } // if
    } else {
      pr->u.p.tc = 0;
      *p_lb = pr->u.p.lb;
      *p_ub = pr->u.p.ub;
#if KMP_OS_WINDOWS
      pr->u.p.last_upper = *p_ub;
#endif /* KMP_OS_WINDOWS */
      if (p_last != NULL)
        *p_last = TRUE;
      if (p_st != NULL)
        *p_st = pr->u.p.st;
    } // if
#ifdef KMP_DEBUG
    {
      const char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format(
          "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
          "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
          traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
      KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
      __kmp_str_free(&buff);
    }
#endif
#if INCLUDE_SSC_MARKS
    SSC_MARK_DISPATCH_NEXT();
#endif
    OMPT_LOOP_END;
    return status;
  } else {
    kmp_int32 last = 0;
    dispatch_shared_info_template<UT> *sh;
    T start;
    ST incr;
    UT limit, trip, init;

    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    KMP_DEBUG_ASSERT(pr);
    sh = reinterpret_cast<dispatch_shared_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(sh);

    if (pr->u.p.tc == 0) {
      // zero trip count
      status = 0;
    } else {
      switch (pr->schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
1534 case kmp_sch_static_steal: {
1535 T chunk = pr->u.p.parm1;
1536 int nproc = th->th.th_team_nproc;
Jonathan Peyton45be4502015-08-11 21:36:41 +00001537
Jonathan Peyton30419822017-05-12 18:01:32 +00001538 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n",
1539 gtid));
1540
1541 trip = pr->u.p.tc - 1;
1542
1543 if (traits_t<T>::type_size > 4) {
1544 // use lock for 8-byte and CAS for 4-byte induction
1545 // variable. TODO (optional): check and use 16-byte CAS
1546 kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1547 KMP_DEBUG_ASSERT(lck != NULL);
1548 if (pr->u.p.count < (UT)pr->u.p.ub) {
1549 __kmp_acquire_lock(lck, gtid);
1550 // try to get own chunk of iterations
1551 init = (pr->u.p.count)++;
1552 status = (init < (UT)pr->u.p.ub);
1553 __kmp_release_lock(lck, gtid);
1554 } else {
1555 status = 0; // no own chunks
1556 }
1557 if (!status) { // try to steal
1558 kmp_info_t **other_threads = team->t.t_threads;
1559 int while_limit = nproc; // nproc attempts to find a victim
1560 int while_index = 0;
 1561 // TODO: the algorithm for searching for a victim
1562 // should be cleaned up and measured
1563 while ((!status) && (while_limit != ++while_index)) {
1564 T remaining;
1565 T victimIdx = pr->u.p.parm4;
1566 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1567 dispatch_private_info_template<T> *victim =
1568 reinterpret_cast<dispatch_private_info_template<T> *>(
1569 other_threads[victimIdx]
1570 ->th.th_dispatch->th_dispatch_pr_current);
1571 while ((victim == NULL || victim == pr ||
1572 (*(volatile T *)&victim->u.p.static_steal_counter !=
1573 *(volatile T *)&pr->u.p.static_steal_counter)) &&
1574 oldVictimIdx != victimIdx) {
1575 victimIdx = (victimIdx + 1) % nproc;
1576 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1577 other_threads[victimIdx]
1578 ->th.th_dispatch->th_dispatch_pr_current);
1579 };
1580 if (!victim ||
1581 (*(volatile T *)&victim->u.p.static_steal_counter !=
1582 *(volatile T *)&pr->u.p.static_steal_counter)) {
1583 continue; // try once more (nproc attempts in total)
1584 // no victim is ready yet to participate in stealing
1585 // because all victims are still in kmp_init_dispatch
1586 }
1587 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1588 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1589 continue; // not enough chunks to steal, goto next victim
1590 }
1591
1592 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1593 KMP_ASSERT(lck != NULL);
1594 __kmp_acquire_lock(lck, gtid);
1595 limit = victim->u.p.ub; // keep initial ub
1596 if (victim->u.p.count >= limit ||
1597 (remaining = limit - victim->u.p.count) < 2) {
1598 __kmp_release_lock(lck, gtid);
1599 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1600 continue; // not enough chunks to steal
1601 }
 1602 // stealing succeeded, reduce victim's ub by 1/4 of undone chunks
1603 // or by 1
1604 if (remaining > 3) {
1605 KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2);
1606 init = (victim->u.p.ub -=
1607 (remaining >> 2)); // steal 1/4 of remaining
1608 } else {
1609 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
1610 init =
1611 (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining
1612 }
1613 __kmp_release_lock(lck, gtid);
1614
1615 KMP_DEBUG_ASSERT(init + 1 <= limit);
1616 pr->u.p.parm4 = victimIdx; // remember victim to steal from
1617 status = 1;
1618 while_index = 0;
 1619 // now update own count and ub with the stolen range (minus the init chunk taken now)
1620 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1621 pr->u.p.count = init + 1;
1622 pr->u.p.ub = limit;
1623 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1624 } // while (search for victim)
1625 } // if (try to find victim and steal)
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001626 } else {
Jonathan Peyton30419822017-05-12 18:01:32 +00001627 // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1628 typedef union {
1629 struct {
1630 UT count;
1631 T ub;
1632 } p;
1633 kmp_int64 b;
1634 } union_i4;
1635 // All operations on 'count' or 'ub' must be combined atomically
1636 // together.
1637 {
1638 union_i4 vold, vnew;
1639 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1640 vnew = vold;
1641 vnew.p.count++;
1642 while (!KMP_COMPARE_AND_STORE_ACQ64(
1643 (volatile kmp_int64 *)&pr->u.p.count,
1644 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1645 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1646 KMP_CPU_PAUSE();
1647 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1648 vnew = vold;
1649 vnew.p.count++;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001650 }
Jonathan Peyton30419822017-05-12 18:01:32 +00001651 vnew = vold;
1652 init = vnew.p.count;
1653 status = (init < (UT)vnew.p.ub);
1654 }
1655
1656 if (!status) {
1657 kmp_info_t **other_threads = team->t.t_threads;
1658 int while_limit = nproc; // nproc attempts to find a victim
1659 int while_index = 0;
1660
 1661 // TODO: the algorithm for searching for a victim
1662 // should be cleaned up and measured
1663 while ((!status) && (while_limit != ++while_index)) {
1664 union_i4 vold, vnew;
1665 kmp_int32 remaining;
1666 T victimIdx = pr->u.p.parm4;
1667 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1668 dispatch_private_info_template<T> *victim =
1669 reinterpret_cast<dispatch_private_info_template<T> *>(
1670 other_threads[victimIdx]
1671 ->th.th_dispatch->th_dispatch_pr_current);
1672 while ((victim == NULL || victim == pr ||
1673 (*(volatile T *)&victim->u.p.static_steal_counter !=
1674 *(volatile T *)&pr->u.p.static_steal_counter)) &&
1675 oldVictimIdx != victimIdx) {
1676 victimIdx = (victimIdx + 1) % nproc;
1677 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1678 other_threads[victimIdx]
1679 ->th.th_dispatch->th_dispatch_pr_current);
1680 };
1681 if (!victim ||
1682 (*(volatile T *)&victim->u.p.static_steal_counter !=
1683 *(volatile T *)&pr->u.p.static_steal_counter)) {
1684 continue; // try once more (nproc attempts in total)
1685 // no victim is ready yet to participate in stealing
1686 // because all victims are still in kmp_init_dispatch
1687 }
1688 pr->u.p.parm4 = victimIdx; // new victim found
1689 while (1) { // CAS loop if victim has enough chunks to steal
1690 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1691 vnew = vold;
1692
1693 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1694 if (vnew.p.count >= (UT)vnew.p.ub ||
1695 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1696 pr->u.p.parm4 =
1697 (victimIdx + 1) % nproc; // shift start victim id
1698 break; // not enough chunks to steal, goto next victim
1699 }
1700 if (remaining > 3) {
1701 vnew.p.ub -= (remaining >> 2); // try to steal 1/4 remaining
1702 } else {
1703 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1704 }
1705 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1706 // TODO: Should this be acquire or release?
1707 if (KMP_COMPARE_AND_STORE_ACQ64(
1708 (volatile kmp_int64 *)&victim->u.p.count,
1709 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1710 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1711 // stealing succeeded
1712 KMP_COUNT_VALUE(FOR_static_steal_stolen,
1713 vold.p.ub - vnew.p.ub);
1714 status = 1;
1715 while_index = 0;
1716 // now update own count and ub
1717 init = vnew.p.ub;
1718 vold.p.count = init + 1;
1719#if KMP_ARCH_X86
1720 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count),
1721 vold.b);
1722#else
1723 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1724#endif
1725 break;
1726 } // if (check CAS result)
1727 KMP_CPU_PAUSE(); // CAS failed, repeat attempt
1728 } // while (try to steal from particular victim)
1729 } // while (search for victim)
1730 } // if (try to find victim and steal)
1731 } // if (4-byte induction variable)
1732 if (!status) {
1733 *p_lb = 0;
1734 *p_ub = 0;
1735 if (p_st != NULL)
1736 *p_st = 0;
1737 } else {
1738 start = pr->u.p.parm2;
1739 init *= chunk;
1740 limit = chunk + init - 1;
1741 incr = pr->u.p.st;
1742 KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);
1743
1744 KMP_DEBUG_ASSERT(init <= trip);
1745 if ((last = (limit >= trip)) != 0)
1746 limit = trip;
1747 if (p_st != NULL)
1748 *p_st = incr;
1749
1750 if (incr == 1) {
1751 *p_lb = start + init;
1752 *p_ub = start + limit;
1753 } else {
1754 *p_lb = start + init * incr;
1755 *p_ub = start + limit * incr;
1756 }
1757
1758 if (pr->ordered) {
1759 pr->u.p.ordered_lower = init;
1760 pr->u.p.ordered_upper = limit;
1761#ifdef KMP_DEBUG
1762 {
1763 const char *buff;
1764 // create format specifiers before the debug output
1765 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1766 "ordered_lower:%%%s ordered_upper:%%%s\n",
1767 traits_t<UT>::spec, traits_t<UT>::spec);
1768 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1769 pr->u.p.ordered_upper));
1770 __kmp_str_free(&buff);
1771 }
1772#endif
1773 } // if
1774 } // if
1775 break;
1776 } // case
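      // Informational sketch (not part of the runtime) of the 1/4-stealing
      // arithmetic above, assuming chunk-granular count/ub values:
      //   victim: count = 10, ub = 30  ->  remaining = 20 chunks undone
      //   thief:  init = (victim ub -= remaining >> 2) = 25
      // The thief returns chunk 25 immediately and records count = 26,
      // ub = 30, so chunks 26..29 are handed out by its later
      // __kmp_dispatch_next calls, while the victim keeps chunks up to 24.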
1777#endif // ( KMP_STATIC_STEAL_ENABLED )
1778 case kmp_sch_static_balanced: {
1779 KD_TRACE(
1780 100,
1781 ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid));
1782 if ((status = !pr->u.p.count) !=
1783 0) { /* check if thread has any iteration to do */
1784 pr->u.p.count = 1;
1785 *p_lb = pr->u.p.lb;
1786 *p_ub = pr->u.p.ub;
1787 last = pr->u.p.parm1;
1788 if (p_st != NULL)
1789 *p_st = pr->u.p.st;
1790 } else { /* no iterations to do */
1791 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001792 }
Jonathan Peyton30419822017-05-12 18:01:32 +00001793 if (pr->ordered) {
1794#ifdef KMP_DEBUG
1795 {
1796 const char *buff;
1797 // create format specifiers before the debug output
1798 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1799 "ordered_lower:%%%s ordered_upper:%%%s\n",
1800 traits_t<UT>::spec, traits_t<UT>::spec);
1801 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1802 pr->u.p.ordered_upper));
1803 __kmp_str_free(&buff);
1804 }
1805#endif
1806 } // if
1807 } // case
1808 break;
1809 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1810 merged here */
1811 case kmp_sch_static_chunked: {
1812 T parm1;
1813
1814 KD_TRACE(100, ("__kmp_dispatch_next: T#%d "
1815 "kmp_sch_static_[affinity|chunked] case\n",
1816 gtid));
1817 parm1 = pr->u.p.parm1;
1818
1819 trip = pr->u.p.tc - 1;
1820 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1821
1822 if ((status = (init <= trip)) != 0) {
1823 start = pr->u.p.lb;
1824 incr = pr->u.p.st;
1825 limit = parm1 + init - 1;
1826
1827 if ((last = (limit >= trip)) != 0)
1828 limit = trip;
1829
1830 if (p_st != NULL)
1831 *p_st = incr;
1832
1833 pr->u.p.count += th->th.th_team_nproc;
1834
1835 if (incr == 1) {
1836 *p_lb = start + init;
1837 *p_ub = start + limit;
1838 } else {
1839 *p_lb = start + init * incr;
1840 *p_ub = start + limit * incr;
1841 }
1842
1843 if (pr->ordered) {
1844 pr->u.p.ordered_lower = init;
1845 pr->u.p.ordered_upper = limit;
1846#ifdef KMP_DEBUG
1847 {
1848 const char *buff;
1849 // create format specifiers before the debug output
1850 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1851 "ordered_lower:%%%s ordered_upper:%%%s\n",
1852 traits_t<UT>::spec, traits_t<UT>::spec);
1853 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1854 pr->u.p.ordered_upper));
1855 __kmp_str_free(&buff);
1856 }
1857#endif
1858 } // if
1859 } // if
1860 } // case
1861 break;
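      // Informational example of the round-robin chunk assignment above,
      // assuming nproc = 4, chunk (parm1) = 10, tid = 2, lb = 0 and st = 1:
      //   1st call: init = 10 * (0 + 2) = 20  ->  iterations 20..29, count -> 4
      //   2nd call: init = 10 * (4 + 2) = 60  ->  iterations 60..69, count -> 8
      // i.e. thread tid executes chunks tid, tid + nproc, tid + 2*nproc, ...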
1862
1863 case kmp_sch_dynamic_chunked: {
1864 T chunk = pr->u.p.parm1;
1865
1866 KD_TRACE(
1867 100,
1868 ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid));
1869
1870 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1871 trip = pr->u.p.tc - 1;
1872
1873 if ((status = (init <= trip)) == 0) {
1874 *p_lb = 0;
1875 *p_ub = 0;
1876 if (p_st != NULL)
1877 *p_st = 0;
1878 } else {
1879 start = pr->u.p.lb;
1880 limit = chunk + init - 1;
1881 incr = pr->u.p.st;
1882
1883 if ((last = (limit >= trip)) != 0)
1884 limit = trip;
1885
1886 if (p_st != NULL)
1887 *p_st = incr;
1888
1889 if (incr == 1) {
1890 *p_lb = start + init;
1891 *p_ub = start + limit;
1892 } else {
1893 *p_lb = start + init * incr;
1894 *p_ub = start + limit * incr;
1895 }
1896
1897 if (pr->ordered) {
1898 pr->u.p.ordered_lower = init;
1899 pr->u.p.ordered_upper = limit;
1900#ifdef KMP_DEBUG
1901 {
1902 const char *buff;
1903 // create format specifiers before the debug output
1904 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1905 "ordered_lower:%%%s ordered_upper:%%%s\n",
1906 traits_t<UT>::spec, traits_t<UT>::spec);
1907 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1908 pr->u.p.ordered_upper));
1909 __kmp_str_free(&buff);
1910 }
1911#endif
1912 } // if
1913 } // if
1914 } // case
1915 break;
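      // Informational example of the shared-counter chunk grab above, assuming
      // chunk = 4, lb = 0 and st = 1: a thread whose test_then_inc_acq() on
      // sh->u.s.iteration returns 2 gets init = 8 and limit = 11, i.e.
      // iterations 8..11 (clipped to trip if this is the final chunk).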
1916
1917 case kmp_sch_guided_iterative_chunked: {
1918 T chunkspec = pr->u.p.parm1;
1919 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
1920 "iterative case\n",
1921 gtid));
1922 trip = pr->u.p.tc;
1923 // Start atomic part of calculations
1924 while (1) {
1925 ST remaining; // signed, because can be < 0
1926 init = sh->u.s.iteration; // shared value
1927 remaining = trip - init;
1928 if (remaining <= 0) { // AC: need to compare with 0 first
1929 // nothing to do, don't try atomic op
1930 status = 0;
1931 break;
1932 }
1933 if ((T)remaining <
1934 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
 1935 // use dynamic-style schedule
 1936 // atomically increment iterations, get old value
1937 init = test_then_add<ST>((ST *)&sh->u.s.iteration, (ST)chunkspec);
1938 remaining = trip - init;
1939 if (remaining <= 0) {
 1940 status = 0; // all iterations taken by other threads
1941 } else { // got some iterations to work on
1942 status = 1;
1943 if ((T)remaining > chunkspec) {
1944 limit = init + chunkspec - 1;
1945 } else {
1946 last = 1; // the last chunk
1947 limit = init + remaining - 1;
1948 } // if
1949 } // if
1950 break;
1951 } // if
1952 limit = init + (UT)(remaining *
1953 *(double *)&pr->u.p.parm3); // divide by K*nproc
1954 if (compare_and_swap<ST>((ST *)&sh->u.s.iteration, (ST)init,
1955 (ST)limit)) {
1956 // CAS was successful, chunk obtained
1957 status = 1;
1958 --limit;
1959 break;
1960 } // if
1961 } // while
1962 if (status != 0) {
1963 start = pr->u.p.lb;
1964 incr = pr->u.p.st;
1965 if (p_st != NULL)
1966 *p_st = incr;
1967 *p_lb = start + init * incr;
1968 *p_ub = start + limit * incr;
1969 if (pr->ordered) {
1970 pr->u.p.ordered_lower = init;
1971 pr->u.p.ordered_upper = limit;
1972#ifdef KMP_DEBUG
1973 {
1974 const char *buff;
1975 // create format specifiers before the debug output
1976 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1977 "ordered_lower:%%%s ordered_upper:%%%s\n",
1978 traits_t<UT>::spec, traits_t<UT>::spec);
1979 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1980 pr->u.p.ordered_upper));
1981 __kmp_str_free(&buff);
1982 }
1983#endif
1984 } // if
1985 } else {
1986 *p_lb = 0;
1987 *p_ub = 0;
1988 if (p_st != NULL)
1989 *p_st = 0;
1990 } // if
1991 } // case
1992 break;
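      // Informational sketch of the guided shrinkage above, assuming parm2 and
      // parm3 are set up as the comments indicate (K = 2): with nproc = 4,
      // chunk = 1 and trip = 1000, parm3 is about 1/8, so the first grab is
      // roughly 1000/8 = 125 iterations, the next roughly 875/8, and so on,
      // shrinking geometrically; once remaining falls below
      // parm2 = 2 * 4 * (1 + 1) = 16, threads switch to plain chunk-sized
      // atomic increments of sh->u.s.iteration.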
1993
1994 case kmp_sch_guided_analytical_chunked: {
1995 T chunkspec = pr->u.p.parm1;
1996 UT chunkIdx;
1997#if KMP_OS_WINDOWS && KMP_ARCH_X86
1998 /* for storing original FPCW value for Windows* OS on
1999 IA-32 architecture 8-byte version */
2000 unsigned int oldFpcw;
2001 unsigned int fpcwSet = 0;
2002#endif
2003 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
2004 "analytical case\n",
2005 gtid));
2006
2007 trip = pr->u.p.tc;
2008
2009 KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
2010 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc <
2011 trip);
2012
2013 while (1) { /* this while loop is a safeguard against unexpected zero
2014 chunk sizes */
2015 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
2016 if (chunkIdx >= (UT)pr->u.p.parm2) {
2017 --trip;
2018 /* use dynamic-style scheduling */
2019 init = chunkIdx * chunkspec + pr->u.p.count;
2020 /* need to verify init > 0 in case of overflow in the above
2021 * calculation */
2022 if ((status = (init > 0 && init <= trip)) != 0) {
2023 limit = init + chunkspec - 1;
2024
2025 if ((last = (limit >= trip)) != 0)
2026 limit = trip;
2027 }
2028 break;
2029 } else {
2030/* use exponential-style scheduling */
2031/* The following check is to workaround the lack of long double precision on
2032 Windows* OS.
2033 This check works around the possible effect that init != 0 for chunkIdx == 0.
2034 */
2035#if KMP_OS_WINDOWS && KMP_ARCH_X86
2036 /* If we haven't already done so, save original FPCW and set
2037 precision to 64-bit, as Windows* OS on IA-32 architecture
2038 defaults to 53-bit */
2039 if (!fpcwSet) {
2040 oldFpcw = _control87(0, 0);
2041 _control87(_PC_64, _MCW_PC);
2042 fpcwSet = 0x30000;
2043 }
2044#endif
2045 if (chunkIdx) {
2046 init = __kmp_dispatch_guided_remaining<T>(
2047 trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
2048 KMP_DEBUG_ASSERT(init);
2049 init = trip - init;
2050 } else
2051 init = 0;
2052 limit = trip - __kmp_dispatch_guided_remaining<T>(
2053 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
2054 KMP_ASSERT(init <= limit);
2055 if (init < limit) {
2056 KMP_DEBUG_ASSERT(limit <= trip);
2057 --limit;
2058 status = 1;
2059 break;
2060 } // if
2061 } // if
2062 } // while (1)
2063#if KMP_OS_WINDOWS && KMP_ARCH_X86
2064 /* restore FPCW if necessary
2065 AC: check fpcwSet flag first because oldFpcw can be uninitialized
2066 here */
2067 if (fpcwSet && (oldFpcw & fpcwSet))
2068 _control87(oldFpcw, _MCW_PC);
2069#endif
2070 if (status != 0) {
2071 start = pr->u.p.lb;
2072 incr = pr->u.p.st;
2073 if (p_st != NULL)
2074 *p_st = incr;
2075 *p_lb = start + init * incr;
2076 *p_ub = start + limit * incr;
2077 if (pr->ordered) {
2078 pr->u.p.ordered_lower = init;
2079 pr->u.p.ordered_upper = limit;
2080#ifdef KMP_DEBUG
2081 {
2082 const char *buff;
2083 // create format specifiers before the debug output
2084 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2085 "ordered_lower:%%%s ordered_upper:%%%s\n",
2086 traits_t<UT>::spec, traits_t<UT>::spec);
2087 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2088 pr->u.p.ordered_upper));
2089 __kmp_str_free(&buff);
2090 }
2091#endif
2092 }
2093 } else {
2094 *p_lb = 0;
2095 *p_ub = 0;
2096 if (p_st != NULL)
2097 *p_st = 0;
2098 }
2099 } // case
2100 break;
2101
2102 case kmp_sch_trapezoidal: {
2103 UT index;
2104 T parm2 = pr->u.p.parm2;
2105 T parm3 = pr->u.p.parm3;
2106 T parm4 = pr->u.p.parm4;
2107 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2108 gtid));
2109
2110 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
2111
2112 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
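      // Informational derivation: the formulas here are consistent with chunk
      // sizes forming the arithmetic sequence parm2, parm2 - parm4,
      // parm2 - 2*parm4, ...; the first iteration of chunk 'index' is then the
      // partial sum
      //   sum_{k=0}^{index-1} (parm2 - k*parm4)
      //     = (index * (2*parm2 - (index - 1)*parm4)) / 2,
      // which is exactly 'init' above, and 'limit' below is the corresponding
      // sum for index + 1 chunks, minus one.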
2113 trip = pr->u.p.tc - 1;
2114
2115 if ((status = ((T)index < parm3 && init <= trip)) == 0) {
2116 *p_lb = 0;
2117 *p_ub = 0;
2118 if (p_st != NULL)
2119 *p_st = 0;
2120 } else {
2121 start = pr->u.p.lb;
2122 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
2123 incr = pr->u.p.st;
2124
2125 if ((last = (limit >= trip)) != 0)
2126 limit = trip;
2127
2128 if (p_st != NULL)
2129 *p_st = incr;
2130
2131 if (incr == 1) {
2132 *p_lb = start + init;
2133 *p_ub = start + limit;
2134 } else {
2135 *p_lb = start + init * incr;
2136 *p_ub = start + limit * incr;
2137 }
2138
2139 if (pr->ordered) {
2140 pr->u.p.ordered_lower = init;
2141 pr->u.p.ordered_upper = limit;
2142#ifdef KMP_DEBUG
2143 {
2144 const char *buff;
2145 // create format specifiers before the debug output
2146 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2147 "ordered_lower:%%%s ordered_upper:%%%s\n",
2148 traits_t<UT>::spec, traits_t<UT>::spec);
2149 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2150 pr->u.p.ordered_upper));
2151 __kmp_str_free(&buff);
2152 }
2153#endif
2154 } // if
2155 } // if
2156 } // case
2157 break;
2158 default: {
 2159 status = 0; // to avoid complaints about uninitialized variable use
2160 __kmp_msg(kmp_ms_fatal, // Severity
2161 KMP_MSG(UnknownSchedTypeDetected), // Primary message
2162 KMP_HNT(GetNewerLibrary), // Hint
2163 __kmp_msg_null // Variadic argument list terminator
2164 );
2165 } break;
2166 } // switch
2167 } // if tc == 0;
2168
2169 if (status == 0) {
2170 UT num_done;
2171
2172 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2173#ifdef KMP_DEBUG
2174 {
2175 const char *buff;
2176 // create format specifiers before the debug output
2177 buff = __kmp_str_format(
2178 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2179 traits_t<UT>::spec);
2180 KD_TRACE(100, (buff, gtid, sh->u.s.num_done));
2181 __kmp_str_free(&buff);
2182 }
2183#endif
2184
2185 if ((ST)num_done == th->th.th_team_nproc - 1) {
2186#if (KMP_STATIC_STEAL_ENABLED)
2187 if (pr->schedule == kmp_sch_static_steal &&
2188 traits_t<T>::type_size > 4) {
2189 int i;
2190 kmp_info_t **other_threads = team->t.t_threads;
2191 // loop complete, safe to destroy locks used for stealing
2192 for (i = 0; i < th->th.th_team_nproc; ++i) {
2193 kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2194 KMP_ASSERT(lck != NULL);
2195 __kmp_destroy_lock(lck);
2196 __kmp_free(lck);
2197 other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2198 }
2199 }
2200#endif
2201 /* NOTE: release this buffer to be reused */
2202
2203 KMP_MB(); /* Flush all pending memory write invalidates. */
2204
2205 sh->u.s.num_done = 0;
2206 sh->u.s.iteration = 0;
2207
2208 /* TODO replace with general release procedure? */
2209 if (pr->ordered) {
2210 sh->u.s.ordered_iteration = 0;
2211 }
2212
2213 KMP_MB(); /* Flush all pending memory write invalidates. */
2214
2215 sh->buffer_index += __kmp_dispatch_num_buffers;
2216 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2217 gtid, sh->buffer_index));
2218
2219 KMP_MB(); /* Flush all pending memory write invalidates. */
2220
2221 } // if
2222 if (__kmp_env_consistency_check) {
2223 if (pr->pushed_ws != ct_none) {
2224 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2225 }
2226 }
2227
2228 th->th.th_dispatch->th_deo_fcn = NULL;
2229 th->th.th_dispatch->th_dxo_fcn = NULL;
2230 th->th.th_dispatch->th_dispatch_sh_current = NULL;
2231 th->th.th_dispatch->th_dispatch_pr_current = NULL;
2232 } // if (status == 0)
2233#if KMP_OS_WINDOWS
2234 else if (last) {
2235 pr->u.p.last_upper = pr->u.p.ub;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002236 }
Jonathan Peyton30419822017-05-12 18:01:32 +00002237#endif /* KMP_OS_WINDOWS */
2238 if (p_last != NULL && status != 0)
2239 *p_last = last;
2240 } // if
2241
2242#ifdef KMP_DEBUG
2243 {
2244 const char *buff;
2245 // create format specifiers before the debug output
2246 buff = __kmp_str_format(
2247 "__kmp_dispatch_next: T#%%d normal case: "
2248 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2249 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2250 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status));
2251 __kmp_str_free(&buff);
2252 }
2253#endif
2254#if INCLUDE_SSC_MARKS
2255 SSC_MARK_DISPATCH_NEXT();
2256#endif
2257 OMPT_LOOP_END;
2258 return status;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002259}
2260
Jonathan Peyton30419822017-05-12 18:01:32 +00002261template <typename T>
2262static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2263 kmp_int32 *plastiter, T *plower, T *pupper,
2264 typename traits_t<T>::signed_t incr) {
2265 typedef typename traits_t<T>::unsigned_t UT;
2266 typedef typename traits_t<T>::signed_t ST;
2267 register kmp_uint32 team_id;
2268 register kmp_uint32 nteams;
2269 register UT trip_count;
2270 register kmp_team_t *team;
2271 kmp_info_t *th;
2272
2273 KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2274 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2275#ifdef KMP_DEBUG
2276 {
2277 const char *buff;
2278 // create format specifiers before the debug output
2279 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2280 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2281 traits_t<T>::spec, traits_t<T>::spec,
2282 traits_t<ST>::spec, traits_t<T>::spec);
2283 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2284 __kmp_str_free(&buff);
2285 }
2286#endif
2287
2288 if (__kmp_env_consistency_check) {
2289 if (incr == 0) {
2290 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2291 loc);
2292 }
2293 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2294 // The loop is illegal.
 2295 // Some zero-trip loops are maintained by the compiler, e.g.:
2296 // for(i=10;i<0;++i) // lower >= upper - run-time check
2297 // for(i=0;i>10;--i) // lower <= upper - run-time check
2298 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2299 // for(i=10;i<0;--i) // incr < 0 - compile-time check
 2300 // The compiler does not check the following illegal loops:
2301 // for(i=0;i<10;i+=incr) // where incr<0
2302 // for(i=10;i>0;i-=incr) // where incr<0
2303 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2304 }
2305 }
2306 th = __kmp_threads[gtid];
2307 team = th->th.th_team;
2308#if OMP_40_ENABLED
2309 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2310 nteams = th->th.th_teams_size.nteams;
2311#endif
2312 team_id = team->t.t_master_tid;
2313 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2314
2315 // compute global trip count
2316 if (incr == 1) {
2317 trip_count = *pupper - *plower + 1;
2318 } else if (incr == -1) {
2319 trip_count = *plower - *pupper + 1;
2320 } else if (incr > 0) {
2321 // upper-lower can exceed the limit of signed type
2322 trip_count = (UT)(*pupper - *plower) / incr + 1;
2323 } else {
2324 trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2325 }
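  // Informational example of the trip-count formulas above: for lb = 0,
  // ub = 9 and incr = 3 the loop visits 0, 3, 6, 9, and
  // trip_count = (9 - 0) / 3 + 1 = 4.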
2326
2327 if (trip_count <= nteams) {
2328 KMP_DEBUG_ASSERT(
2329 __kmp_static == kmp_sch_static_greedy ||
2330 __kmp_static ==
2331 kmp_sch_static_balanced); // Unknown static scheduling type.
 2332 // only some teams get a single iteration, others get nothing
2333 if (team_id < trip_count) {
2334 *pupper = *plower = *plower + team_id * incr;
2335 } else {
2336 *plower = *pupper + incr; // zero-trip loop
2337 }
2338 if (plastiter != NULL)
2339 *plastiter = (team_id == trip_count - 1);
2340 } else {
2341 if (__kmp_static == kmp_sch_static_balanced) {
2342 register UT chunk = trip_count / nteams;
2343 register UT extras = trip_count % nteams;
2344 *plower +=
2345 incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2346 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2347 if (plastiter != NULL)
2348 *plastiter = (team_id == nteams - 1);
2349 } else {
2350 register T chunk_inc_count =
2351 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2352 register T upper = *pupper;
2353 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2354 // Unknown static scheduling type.
2355 *plower += team_id * chunk_inc_count;
2356 *pupper = *plower + chunk_inc_count - incr;
2357 // Check/correct bounds if needed
2358 if (incr > 0) {
2359 if (*pupper < *plower)
2360 *pupper = traits_t<T>::max_value;
2361 if (plastiter != NULL)
2362 *plastiter = *plower <= upper && *pupper > upper - incr;
2363 if (*pupper > upper)
2364 *pupper = upper; // tracker C73258
2365 } else {
2366 if (*pupper > *plower)
2367 *pupper = traits_t<T>::min_value;
2368 if (plastiter != NULL)
2369 *plastiter = *plower >= upper && *pupper < upper - incr;
2370 if (*pupper < upper)
2371 *pupper = upper; // tracker C73258
2372 }
2373 }
2374 }
2375}
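/* Informational example (not part of the runtime) of the balanced team split
   above: with trip_count = 10, nteams = 4 and incr = 1, chunk = 2 and
   extras = 2, so teams 0 and 1 receive 3 iterations each while teams 2 and 3
   receive 2 each; *plastiter is set only for the team owning the last
   iteration (team nteams - 1). */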
2376
2377//-----------------------------------------------------------------------------
Jim Cownie5e8470a2013-09-27 10:38:44 +00002378// Dispatch routines
2379// Transfer call to template< type T >
2380// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2381// T lb, T ub, ST st, ST chunk )
2382extern "C" {
2383
2384/*!
2385@ingroup WORK_SHARING
2386@{
2387@param loc Source location
2388@param gtid Global thread id
2389@param schedule Schedule type
2390@param lb Lower bound
2391@param ub Upper bound
2392@param st Step (or increment if you prefer)
2393@param chunk The chunk size to block with
2394
Jonathan Peyton30419822017-05-12 18:01:32 +00002395This function prepares the runtime to start a dynamically scheduled for loop,
2396saving the loop arguments.
Jim Cownie5e8470a2013-09-27 10:38:44 +00002397These functions are all identical apart from the types of the arguments.
2398*/
2399
Jonathan Peyton30419822017-05-12 18:01:32 +00002400void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2401 enum sched_type schedule, kmp_int32 lb,
2402 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2403 KMP_DEBUG_ASSERT(__kmp_init_serial);
2404 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002405}
2406/*!
2407See @ref __kmpc_dispatch_init_4
2408*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002409void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2410 enum sched_type schedule, kmp_uint32 lb,
2411 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2412 KMP_DEBUG_ASSERT(__kmp_init_serial);
2413 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002414}
2415
2416/*!
2417See @ref __kmpc_dispatch_init_4
2418*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002419void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2420 enum sched_type schedule, kmp_int64 lb,
2421 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2422 KMP_DEBUG_ASSERT(__kmp_init_serial);
2423 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002424}
2425
2426/*!
2427See @ref __kmpc_dispatch_init_4
2428*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002429void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2430 enum sched_type schedule, kmp_uint64 lb,
2431 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2432 KMP_DEBUG_ASSERT(__kmp_init_serial);
2433 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002434}
2435
2436/*!
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002437See @ref __kmpc_dispatch_init_4
2438
 2439These differ from the __kmpc_dispatch_init set of functions in that they are
 2440called for the composite distribute parallel for construct. Thus, before
 2441dispatching the regular iterations, we need to calculate the per-team iteration space.
2442
2443These functions are all identical apart from the types of the arguments.
2444*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002445void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2446 enum sched_type schedule, kmp_int32 *p_last,
2447 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2448 kmp_int32 chunk) {
2449 KMP_DEBUG_ASSERT(__kmp_init_serial);
2450 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2451 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002452}
2453
Jonathan Peyton30419822017-05-12 18:01:32 +00002454void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2455 enum sched_type schedule, kmp_int32 *p_last,
2456 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2457 kmp_int32 chunk) {
2458 KMP_DEBUG_ASSERT(__kmp_init_serial);
2459 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2460 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002461}
2462
Jonathan Peyton30419822017-05-12 18:01:32 +00002463void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2464 enum sched_type schedule, kmp_int32 *p_last,
2465 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2466 kmp_int64 chunk) {
2467 KMP_DEBUG_ASSERT(__kmp_init_serial);
2468 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2469 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002470}
2471
Jonathan Peyton30419822017-05-12 18:01:32 +00002472void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2473 enum sched_type schedule, kmp_int32 *p_last,
2474 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2475 kmp_int64 chunk) {
2476 KMP_DEBUG_ASSERT(__kmp_init_serial);
2477 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2478 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002479}
2480
2481/*!
Jim Cownie5e8470a2013-09-27 10:38:44 +00002482@param loc Source code location
2483@param gtid Global thread id
Jonathan Peyton30419822017-05-12 18:01:32 +00002484@param p_last Pointer to a flag set to one if this is the last chunk or zero
2485otherwise
Jim Cownie5e8470a2013-09-27 10:38:44 +00002486@param p_lb Pointer to the lower bound for the next chunk of work
2487@param p_ub Pointer to the upper bound for the next chunk of work
2488@param p_st Pointer to the stride for the next chunk of work
2489@return one if there is work to be done, zero otherwise
2490
2491Get the next dynamically allocated chunk of work for this thread.
 2492If there is no more work, then the lb, ub and stride need not be modified.
2493*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002494int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2495 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2496 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002497}
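/* Informational sketch (not part of the runtime) of how compiler-generated
   code is expected to drive this API for a dynamically scheduled loop over
   i = 0..N-1; here "loc", "N" and "body" stand for the compiler-emitted
   location descriptor, the trip limit and the loop body:

     kmp_int32 lb = 0, ub = N - 1, st = 1, last = 0;
     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, lb, ub, st, 4);
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st)
         body(i);
     }
*/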
2498
2499/*!
2500See @ref __kmpc_dispatch_next_4
2501*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002502int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2503 kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2504 kmp_int32 *p_st) {
2505 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002506}
2507
2508/*!
2509See @ref __kmpc_dispatch_next_4
2510*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002511int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2512 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2513 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002514}
2515
2516/*!
2517See @ref __kmpc_dispatch_next_4
2518*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002519int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2520 kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2521 kmp_int64 *p_st) {
2522 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002523}
2524
2525/*!
2526@param loc Source code location
2527@param gtid Global thread id
2528
2529Mark the end of a dynamic loop.
2530*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002531void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2532 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002533}
2534
2535/*!
2536See @ref __kmpc_dispatch_fini_4
2537*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002538void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2539 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002540}
2541
2542/*!
2543See @ref __kmpc_dispatch_fini_4
2544*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002545void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2546 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002547}
2548
2549/*!
2550See @ref __kmpc_dispatch_fini_4
2551*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002552void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2553 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002554}
2555/*! @} */
2556
Jonathan Peyton30419822017-05-12 18:01:32 +00002557//-----------------------------------------------------------------------------
2558// Non-template routines from kmp_dispatch.cpp used in other sources
Jim Cownie5e8470a2013-09-27 10:38:44 +00002559
Jonathan Peyton30419822017-05-12 18:01:32 +00002560kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2561 return value == checker;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002562}
2563
Jonathan Peyton30419822017-05-12 18:01:32 +00002564kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2565 return value != checker;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002566}
2567
Jonathan Peyton30419822017-05-12 18:01:32 +00002568kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2569 return value < checker;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002570}
2571
Jonathan Peyton30419822017-05-12 18:01:32 +00002572kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2573 return value >= checker;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002574}
2575
Jonathan Peyton30419822017-05-12 18:01:32 +00002576kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2577 return value <= checker;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002578}
Jim Cownie5e8470a2013-09-27 10:38:44 +00002579
2580kmp_uint32
Jonathan Peyton30419822017-05-12 18:01:32 +00002581__kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2582 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2583 void *obj // Higher-level synchronization object, or NULL.
2584 ) {
2585 // note: we may not belong to a team at this point
2586 register volatile kmp_uint32 *spin = spinner;
2587 register kmp_uint32 check = checker;
2588 register kmp_uint32 spins;
2589 register kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2590 register kmp_uint32 r;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002591
Jonathan Peyton30419822017-05-12 18:01:32 +00002592 KMP_FSYNC_SPIN_INIT(obj, (void *)spin);
2593 KMP_INIT_YIELD(spins);
2594 // main wait spin loop
2595 while (!f(r = TCR_4(*spin), check)) {
2596 KMP_FSYNC_SPIN_PREPARE(obj);
2597 /* GEH - remove this since it was accidentally introduced when kmp_wait was
2598 split. It causes problems with infinite recursion because of exit lock */
2599 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2600 __kmp_abort_thread(); */
Jim Cownie5e8470a2013-09-27 10:38:44 +00002601
Jonathan Peyton30419822017-05-12 18:01:32 +00002602 /* if we have waited a bit, or are oversubscribed, yield */
2603 /* pause is in the following code */
2604 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2605 KMP_YIELD_SPIN(spins);
2606 }
2607 KMP_FSYNC_SPIN_ACQUIRED(obj);
2608 return r;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002609}
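/* Informational example: spin until a shared flag becomes 1, using one of the
   predicate helpers defined above (obj may be NULL when there is no
   higher-level synchronization object):

     volatile kmp_uint32 flag = 0; // set to 1 by another thread
     __kmp_wait_yield_4(&flag, 1, __kmp_eq_4, NULL);
*/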
2610
Jonathan Peyton30419822017-05-12 18:01:32 +00002611void __kmp_wait_yield_4_ptr(
2612 void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2613 void *obj // Higher-level synchronization object, or NULL.
2614 ) {
2615 // note: we may not belong to a team at this point
2616 register void *spin = spinner;
2617 register kmp_uint32 check = checker;
2618 register kmp_uint32 spins;
2619 register kmp_uint32 (*f)(void *, kmp_uint32) = pred;
Paul Osmialowskif7cc6af2016-05-31 20:20:32 +00002620
Jonathan Peyton30419822017-05-12 18:01:32 +00002621 KMP_FSYNC_SPIN_INIT(obj, spin);
2622 KMP_INIT_YIELD(spins);
2623 // main wait spin loop
2624 while (!f(spin, check)) {
2625 KMP_FSYNC_SPIN_PREPARE(obj);
2626 /* if we have waited a bit, or are oversubscribed, yield */
2627 /* pause is in the following code */
2628 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2629 KMP_YIELD_SPIN(spins);
2630 }
2631 KMP_FSYNC_SPIN_ACQUIRED(obj);
Paul Osmialowskif7cc6af2016-05-31 20:20:32 +00002632}
2633
Jim Cownie5e8470a2013-09-27 10:38:44 +00002634} // extern "C"
2635
2636#ifdef KMP_GOMP_COMPAT
2637
Jonathan Peyton30419822017-05-12 18:01:32 +00002638void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2639 enum sched_type schedule, kmp_int32 lb,
2640 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2641 int push_ws) {
2642 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2643 push_ws);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002644}
2645
Jonathan Peyton30419822017-05-12 18:01:32 +00002646void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2647 enum sched_type schedule, kmp_uint32 lb,
2648 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2649 int push_ws) {
2650 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2651 push_ws);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002652}
2653
Jonathan Peyton30419822017-05-12 18:01:32 +00002654void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2655 enum sched_type schedule, kmp_int64 lb,
2656 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2657 int push_ws) {
2658 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2659 push_ws);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002660}
2661
Jonathan Peyton30419822017-05-12 18:01:32 +00002662void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2663 enum sched_type schedule, kmp_uint64 lb,
2664 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2665 int push_ws) {
2666 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2667 push_ws);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002668}
2669
Jonathan Peyton30419822017-05-12 18:01:32 +00002670void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2671 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002672}
2673
Jonathan Peyton30419822017-05-12 18:01:32 +00002674void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2675 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002676}
2677
Jonathan Peyton30419822017-05-12 18:01:32 +00002678void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2679 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002680}
2681
Jonathan Peyton30419822017-05-12 18:01:32 +00002682void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2683 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002684}
2685
2686#endif /* KMP_GOMP_COMPAT */
2687
2688/* ------------------------------------------------------------------------ */