/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 */

// Need to raise Win version from XP to Vista here for support of
// InterlockedExchange64
#if defined(_WIN32_WINNT) && defined(_M_IX86)
#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0502
#endif

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  T static_steal_counter; // for static_steal only; maybe better to put after ub

  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
  // a) parm3 is properly aligned and
  // b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same cache line (not measured though).

  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#endif /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info structure and dispatch_private_info_t type
template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise size of structure is not correct in our
  // compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_uint32 ordered; /* ordered clause specified */
  kmp_uint32 ordered_bumped;
  // To retain the structure size after making ordered_iteration scalar
  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 nomerge; /* don't merge iters if serialized */
  kmp_uint32 type_size;
  enum cons_type pushed_ws;
};

// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename UT> struct dispatch_shared_infoXX_template {
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile UT num_done;
  volatile UT ordered_iteration;
  // to retain the structure size making ordered_iteration scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename UT> struct dispatch_shared_info_template {
  // we need union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
#if OMP_45_ENABLED
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(p, d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(p, d);
  return r;
}

// test_then_inc_acq template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(p);
  return r;
}

// test_then_inc template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(p);
  return r;
}

// compare_and_swap template (general template should NOT be used)
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}

/* Spin wait loop that first does pause, then yield.
   Waits until function returns non-zero when called with *spinner and check.
   Does NOT put threads to sleep.
#if USE_ITT_BUILD
   Arguments:
   obj -- is the higher-level synchronization object to report to ittnotify.
          It is used to report locks consistently. For example, if the lock is
          acquired immediately, its address is reported to ittnotify via
          KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired
          immediately and the lock routine calls KMP_WAIT_YIELD(), the latter
          should report the same address, not the address of the low-level
          spinner.
#endif // USE_ITT_BUILD
*/
template <typename UT>
// ToDo: make inline function (move to header file for icl)
static UT // unsigned 4- or 8-byte type
    __kmp_wait_yield(
        volatile UT *spinner, UT checker,
        kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(
            void *obj) // Higher-level synchronization object, or NULL.
        ) {
  // note: we may not belong to a team at this point
  volatile UT *spin = spinner;
  UT check = checker;
  kmp_uint32 spins;
  kmp_uint32 (*f)(UT, UT) = pred;
  UT r;

  KMP_FSYNC_SPIN_INIT(obj, CCAST(UT *, spin));
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split. It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
        __kmp_abort_thread(); */

    // If we are oversubscribed, or have waited a bit (and
    // KMP_LIBRARY=throughput), then yield. Pause is in the following code.
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}

template <typename UT> static kmp_uint32 __kmp_eq(UT value, UT checker) {
  return value == checker;
}

template <typename UT> static kmp_uint32 __kmp_neq(UT value, UT checker) {
  return value != checker;
}

template <typename UT> static kmp_uint32 __kmp_lt(UT value, UT checker) {
  return value < checker;
}

template <typename UT> static kmp_uint32 __kmp_ge(UT value, UT checker) {
  return value >= checker;
}

template <typename UT> static kmp_uint32 __kmp_le(UT value, UT checker) {
  return value <= checker;
}

/* ------------------------------------------------------------------------ */

static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref,
                                     ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}

template <typename UT>
static void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      const char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif

    __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                         __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      const char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}

static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref,
                                     ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}

template <typename UT>
static void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(CCAST(UT *, &sh->u.s.ordered_iteration));
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates. */

    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}

// Computes and returns x to the power of y, where y must be a non-negative
// integer.
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}
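// Example: this is binary exponentiation, O(log y) multiplications; for
// y = 13 (binary 1101) the loop accumulates s = x^1 * x^4 * x^8 = x^13.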

/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks with
   index greater than or equal to idx). __forceinline seems to be broken so that
   if we __forceinline this function, the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails) */
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
     ICL 8.1, long double arithmetic may not really have long double precision,
     even with /Qlong_double. Currently, we workaround that in the caller code,
     by manipulating the FPCW for Windows* OS on IA-32 architecture. The lack
     of precision is not expected to be a correctness issue, though. */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r)
    return r;
  return r + 1;
}
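// Example: remaining(idx) == ceil(tc * base^idx); with tc = 100 and
// base = 0.75 it yields 100, 75, 57, 43, ... so guided chunk 1 covers
// 75 - 57 = 18 iterations.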

// Parameters of the guided-iterative algorithm:
//    p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//    p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be
// flatter.
// With n = 1 the first chunk is the same as for a static schedule, e.g. trip / nproc.
static int guided_int_param = 2;
static double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
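// Example: with the default n = 2, nproc = 8 and chunk = 7, a thread switches
// to plain dynamic scheduling once fewer than p2 = 2 * 8 * (7 + 1) = 128
// iterations remain, and each guided chunk is about remaining * p3 =
// remaining / 16.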

// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<UT> volatile *sh;

  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
  {
    const char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

#if USE_ITT_BUILD
  kmp_uint64 cur_chunk = chunk;
  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
                                    __kmp_forkjoin_frames_mode == 3 &&
                                    KMP_MASTER_GTID(gtid) &&
#if OMP_40_ENABLED
                                    th->th.th_teams_microtask == NULL &&
#endif
                                    team->t.t_active_level == 1;
#endif
  if (!active) {
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
  } else {
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    /* What happens when number of threads changes, need to resize buffer? */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        &th->th.th_dispatch
             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    sh = reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  }

#if (KMP_STATIC_STEAL_ENABLED)
  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    // AC: we now have only one implementation of stealing, so use it
    schedule = kmp_sch_static_steal;
  else
#endif
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->nomerge = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
  } else {
    pr->nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->ordered = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
  } else {
    pr->ordered = FALSE;
  }

  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else {
    if (schedule == kmp_sch_runtime) {
      // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
      // not specified)
      schedule = team->t.t_sched.r_sched_type;
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      } else if (schedule == kmp_sch_static) {
        schedule = __kmp_static;
      }
      // Use the chunk size specified by OMP_SCHEDULE (or default if not
      // specified)
      chunk = team->t.t_sched.chunk;
#if USE_ITT_BUILD
      cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    } else {
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      }
      if (chunk <= 0) {
        chunk = KMP_DEFAULT_CHUNK;
      }
    }

    if (schedule == kmp_sch_auto) {
      // mapping and differentiation: in the __kmp_do_serial_initialize()
      schedule = __kmp_auto;
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init: kmp_sch_auto: T#%%d new: "
                                "schedule:%%d chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }

    /* guided analytical not safe for too many threads */
    if (schedule == kmp_sch_guided_analytical_chunked &&
        th->th.th_team_nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
    if (schedule == kmp_sch_runtime_simd) {
      // compiler provides simd_width in the chunk parameter
      schedule = team->t.t_sched.r_sched_type;
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
          schedule == __kmp_static) {
        schedule = kmp_sch_static_balanced_chunked;
      } else {
        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
          schedule = kmp_sch_guided_simd;
        }
        chunk = team->t.t_sched.chunk * chunk;
      }
#if USE_ITT_BUILD
      cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
                                " chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");

  pr->u.p.count = 0;

  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
  // compute trip count
  if (st == 1) { // most common case
    if (ub >= lb) {
      tc = ub - lb + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  } else if (st < 0) {
    if (lb >= ub) {
      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(lb - ub) / (-st) + 1;
    } else { // lb < ub
      tc = 0; // zero-trip
    }
  } else { // st > 0
    if (ub >= lb) {
      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(ub - lb) / st + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  }
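  // Example: lb=0, ub=9, st=1 gives tc = 10; lb=10, ub=1, st=-3 gives
  // tc = (10 - 1) / 3 + 1 = 4 (iterations 10, 7, 4, 1).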

  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing when statistics are disabled.
  if (schedule == __kmp_static) {
    KMP_COUNT_BLOCK(OMP_FOR_static);
    KMP_COUNT_VALUE(FOR_static_iterations, tc);
  } else {
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
  }

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif /* KMP_OS_WINDOWS */

  /* NOTE: only the active parallel region(s) has active ordered sections */

  if (active) {
    if (pr->ordered == 0) {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
    } else {
      pr->ordered_bumped = 0;

      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;

      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
    }
  }

  if (__kmp_env_consistency_check) {
    enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
    if (push_ws) {
      __kmp_push_workshare(gtid, ws, loc);
      pr->pushed_ws = ws;
    } else {
      __kmp_check_workshare(gtid, ws, loc);
      pr->pushed_ws = ct_none;
    }
  }

  switch (schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T nproc = th->th.th_team_nproc;
    T ntc, init;

    KD_TRACE(100,
             ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_FOR_static_steal);
      T id = __kmp_tid_from_gtid(gtid);
      T small_chunk, extras;

      small_chunk = ntc / nproc;
      extras = ntc % nproc;

      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
      pr->u.p.parm2 = lb;
      // pr->pfields.parm3 = 0; // it's not used in static_steal
      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
      pr->u.p.st = st;
      if (traits_t<T>::type_size > 4) {
        // AC: TODO: check if 16-byte CAS available and use it to
        // improve performance (probably wait for explicit request
        // before spending time on this).
        // For now use dynamically allocated per-thread lock,
        // free memory in __kmp_dispatch_next when status==0.
        KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
        th->th.th_dispatch->th_steal_lock =
            (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
      }
      break;
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
                     "kmp_sch_static_balanced\n",
                     gtid));
      schedule = kmp_sch_static_balanced;
      /* too few iterations: fall-through to kmp_sch_static_balanced */
    } // if
    /* FALL-THROUGH to static balanced */
  } // case
#endif
  case kmp_sch_static_balanced: {
    T nproc = th->th.th_team_nproc;
    T init, limit;

    KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
                   gtid));

    if (nproc > 1) {
      T id = __kmp_tid_from_gtid(gtid);

      if (tc < nproc) {
        if (id < tc) {
          init = id;
          limit = id;
          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
        } else {
          pr->u.p.count = 1; /* means no more chunks to execute */
          pr->u.p.parm1 = FALSE;
          break;
        }
      } else {
        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
872 }
873 } else {
874 if (tc > 0) {
875 init = 0;
876 limit = tc - 1;
877 pr->u.p.parm1 = TRUE;
878 } else { // zero trip count
879 pr->u.p.count = 1; /* means no more chunks to execute */
880 pr->u.p.parm1 = FALSE;
881 break;
882 }
883 }
884#if USE_ITT_BUILD
885 // Calculate chunk for metadata report
886 if (itt_need_metadata_reporting)
887 cur_chunk = limit - init + 1;
888#endif
889 if (st == 1) {
890 pr->u.p.lb = lb + init;
891 pr->u.p.ub = lb + limit;
892 } else {
893 // calculated upper bound, "ub" is user-defined upper bound
894 T ub_tmp = lb + limit * st;
895 pr->u.p.lb = lb + init * st;
896 // adjust upper bound to "ub" if needed, so that MS lastprivate will match
897 // it exactly
898 if (st > 0) {
899 pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
900 } else {
901 pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
902 }
903 }
904 if (pr->ordered) {
905 pr->u.p.ordered_lower = init;
906 pr->u.p.ordered_upper = limit;
907 }
908 break;
909 } // case
Andrey Churbanovd454c732017-06-05 17:17:33 +0000910 case kmp_sch_static_balanced_chunked: {
911 // similar to balanced, but chunk adjusted to multiple of simd width
912 T nth = th->th.th_team_nproc;
913 KD_TRACE(100, ("__kmp_dispatch_init: T#%d runtime(simd:static)"
914 " -> falling-through to static_greedy\n",
915 gtid));
916 schedule = kmp_sch_static_greedy;
917 if (nth > 1)
918 pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
919 else
920 pr->u.p.parm1 = tc;
921 break;
922 } // case
923 case kmp_sch_guided_iterative_chunked:
924 case kmp_sch_guided_simd: {
Jonathan Peyton30419822017-05-12 18:01:32 +0000925 T nproc = th->th.th_team_nproc;
926 KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked"
927 " case\n",
928 gtid));
929
930 if (nproc > 1) {
931 if ((2L * chunk + 1) * nproc >= tc) {
932 /* chunk size too large, switch to dynamic */
933 schedule = kmp_sch_dynamic_chunked;
934 } else {
935 // when remaining iters become less than parm2 - switch to dynamic
936 pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
937 *(double *)&pr->u.p.parm3 =
938 guided_flt_param / nproc; // may occupy parm3 and parm4
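        // Example: nproc=4, chunk=7 gives parm2 = 2*4*8 = 64 and
        // parm3 = 0.5/4 = 0.125, i.e. each thread takes about 1/8 of the
        // remaining iterations per chunk until fewer than 64 remain.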
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",
                     gtid));
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_guided_analytical_chunked: {
    T nproc = th->th.th_team_nproc;
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked"
                   " case\n",
                   gtid));
    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        DBL x;

#if KMP_OS_WINDOWS && KMP_ARCH_X86
        /* Linux* OS already has 64-bit computation by default for long double,
           and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
           Windows* OS on IA-32 architecture, we need to set precision to 64-bit
           instead of the default 53-bit. Even though long double doesn't work
           on Windows* OS on Intel(R) 64, the resulting lack of precision is not
           expected to impact the correctness of the algorithm, but this has not
           been mathematically proven. */
        // save original FPCW and set precision to 64-bit, as
        // Windows* OS on IA-32 architecture defaults to 53-bit
        unsigned int oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC); // 0,0x30000
#endif
        /* value used for comparison in solver for cross-over point */
        long double target = ((long double)chunk * 2 + 1) * nproc / tc;
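        // Example: nproc=2, chunk=4, tc=300 gives target = 9*2/300 = 0.06;
        // with x = 1 - 0.5/2 = 0.75 the solver below finds the smallest i
        // such that 0.75^i <= 0.06, i.e. cross = 10.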

        /* crossover point--chunk indexes equal to or greater than
           this point switch to dynamic-style scheduling */
        UT cross;

        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        x = (long double)1.0 - (long double)0.5 / nproc;

#ifdef KMP_DEBUG
        { // test natural alignment
          struct _test_a {
            char a;
            union {
              char b;
              DBL d;
            };
          } t;
          ptrdiff_t natural_alignment =
              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
          //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
          // long)natural_alignment );
          KMP_DEBUG_ASSERT(
              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
        }
#endif // KMP_DEBUG

        /* save the term in thread private dispatch structure */
        *(DBL *)&pr->u.p.parm3 = x;

        /* solve for the crossover point to the nearest integer i for which C_i
           <= chunk */
        {
          UT left, right, mid;
          long double p;

          /* estimate initial upper and lower bound */

          /* doesn't matter what value right is as long as it is positive, but
             it affects performance of the solver */
          right = 229;
          p = __kmp_pow<UT>(x, right);
          if (p > target) {
            do {
              p *= p;
              right <<= 1;
            } while (p > target && right < (1 << 27));
            /* lower bound is previous (failed) estimate of upper bound */
            left = right >> 1;
          } else {
            left = 0;
          }

          /* bisection root-finding method */
          while (left + 1 < right) {
            mid = (left + right) / 2;
            if (__kmp_pow<UT>(x, mid) > target) {
              left = mid;
            } else {
              right = mid;
            }
          } // while
          cross = right;
        }
        /* assert sanity of computed crossover point */
        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);

        /* save the crossover point in thread private dispatch structure */
        pr->u.p.parm2 = cross;

// C75803
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
                                 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_OS_WINDOWS && KMP_ARCH_X86
        // restore FPCW
        _control87(oldFpcw, _MCW_PC);
#endif
      } // if
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_static_greedy:
    KD_TRACE(100,
             ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", gtid));
    pr->u.p.parm1 = (th->th.th_team_nproc > 1)
                        ? (tc + th->th.th_team_nproc - 1) / th->th.th_team_nproc
                        : tc;
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
    if (pr->u.p.parm1 <= 0) {
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    }
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */

    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid));

    parm1 = chunk;

    /* F : size of the first cycle */
    parm2 = (tc / (2 * th->th.th_team_nproc));

    if (parm2 < 1) {
      parm2 = 1;
    }

    /* L : size of the last cycle. Make sure the last cycle is not larger
       than the first cycle. */
    if (parm1 < 1) {
      parm1 = 1;
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }

    /* N : number of cycles */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;

    if (parm3 < 2) {
      parm3 = 2;
    }

    /* sigma : decreasing incr of the trapezoid */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;
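    // Example: tc=150, nproc=3, chunk=1 gives F = parm2 = 25, L = parm1 = 1,
    // N = parm3 = 12 and sigma = parm4 = (25 - 1) / 11 = 2, so the chunk
    // sizes run 25, 23, 21, ... until all 150 iterations are assigned.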

    // pointless check, because parm4 >= 0 always
    // if ( parm4 < 0 ) {
    //    parm4 = 0;
    //}

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
  } // case
  break;

  default: {
    __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
                KMP_HNT(GetNewerLibrary), // Hint
                __kmp_msg_null // Variadic argument list terminator
                );
  } break;
  } // switch
  pr->schedule = schedule;
  if (active) {
    /* The name of this buffer should be my_buffer_index when it's free to use
     * it */

    KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                                 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and
    // my_buffer_index are *always* 32-bit integers.
    KMP_MB(); /* is this necessary? */
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current =
        CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh);
#if USE_ITT_BUILD
    if (pr->ordered) {
      __kmp_itt_ordered_init(gtid);
    }
    // Report loop metadata
    if (itt_need_metadata_reporting) {
      // Only report metadata by master of active team at level 1
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
      case kmp_sch_guided_simd:
        schedtype = 2;
        break;
      default:
        // Should we put this case under "static"?
        // case kmp_sch_static_steal:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
    }
#endif /* USE_ITT_BUILD */
  }

#ifdef KMP_DEBUG
  {
    const char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
                  pr->u.p.st, pr->u.p.tc, pr->u.p.count, pr->u.p.ordered_lower,
                  pr->u.p.ordered_upper, pr->u.p.parm1, pr->u.p.parm2,
                  pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 were the same, a bad case (such as using 0 and 1 rather than a
  // program life-time increment) would still be possible, so a dedicated
  // variable is required; 'static_steal_counter' is used.
  if (schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // This is a flag showing that other threads may steal from this thread
    // since then.
    volatile T *p = &pr->u.p.static_steal_counter;
    *p = *p + 1;
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_TRACE
  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
    ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
        team_info->parallel_id, task_info->task_id, team_info->microtask);
  }
#endif
}

/* For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations. If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed. */
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    } // if
  } // if
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}

#ifdef KMP_GOMP_COMPAT

template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    // int cid;
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;

    if (pr->ordered_bumped == inc) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
//!!!!! TODO check if the inc should be unsigned, or signed???
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
    // }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}

#endif /* KMP_GOMP_COMPAT */

/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
   work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
   is not called. */
#if OMPT_SUPPORT && OMPT_TRACE
#define OMPT_LOOP_END                                                         \
  if (status == 0) {                                                          \
    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_end)) {  \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);             \
      ompt_task_info_t *task_info = __ompt_get_taskinfo(0);                   \
      ompt_callbacks.ompt_callback(ompt_event_loop_end)(                      \
          team_info->parallel_id, task_info->task_id);                        \
    }                                                                         \
  }
#else
#define OMPT_LOOP_END // no-op
#endif

Jonathan Peyton30419822017-05-12 18:01:32 +00001409template <typename T>
1410static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1411 T *p_lb, T *p_ub,
1412 typename traits_t<T>::signed_t *p_st) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00001413
Jonathan Peyton30419822017-05-12 18:01:32 +00001414 typedef typename traits_t<T>::unsigned_t UT;
1415 typedef typename traits_t<T>::signed_t ST;
1416 typedef typename traits_t<T>::floating_t DBL;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001417
Jonathan Peyton30419822017-05-12 18:01:32 +00001418 // This is potentially slightly misleading: schedule(runtime) will appear here
1419 // even if the actual runtime schedule is static. (Which points out a
1420 // disadvantage of schedule(runtime): even when static scheduling is used it
1421 // costs more than a compile-time choice to use static scheduling would.)
1422 KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
Jonathan Peyton45be4502015-08-11 21:36:41 +00001423
Jonathan Peyton30419822017-05-12 18:01:32 +00001424 int status;
1425 dispatch_private_info_template<T> *pr;
1426 kmp_info_t *th = __kmp_threads[gtid];
1427 kmp_team_t *team = th->th.th_team;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001428
Jonathan Peyton30419822017-05-12 18:01:32 +00001429 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1430#ifdef KMP_DEBUG
1431 {
1432 const char *buff;
1433 // create format specifiers before the debug output
1434 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d called p_lb:%%%s "
1435 "p_ub:%%%s p_st:%%%s p_last: %%p\n",
1436 traits_t<T>::spec, traits_t<T>::spec,
1437 traits_t<ST>::spec);
1438 KD_TRACE(1000, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last));
1439 __kmp_str_free(&buff);
1440 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001441#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +00001442
Jonathan Peyton30419822017-05-12 18:01:32 +00001443 if (team->t.t_serialized) {
1444 /* NOTE: serialize this dispatch because we are not at the active level */
1445 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1446 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1447 KMP_DEBUG_ASSERT(pr);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001448
Jonathan Peyton30419822017-05-12 18:01:32 +00001449 if ((status = (pr->u.p.tc != 0)) == 0) {
1450 *p_lb = 0;
1451 *p_ub = 0;
1452 // if ( p_last != NULL )
1453 // *p_last = 0;
1454 if (p_st != NULL)
1455 *p_st = 0;
1456 if (__kmp_env_consistency_check) {
1457 if (pr->pushed_ws != ct_none) {
1458 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001459 }
Jonathan Peyton30419822017-05-12 18:01:32 +00001460 }
1461 } else if (pr->nomerge) {
1462 kmp_int32 last;
1463 T start;
1464 UT limit, trip, init;
1465 ST incr;
1466 T chunk = pr->u.p.parm1;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001467
Jonathan Peyton30419822017-05-12 18:01:32 +00001468 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1469 gtid));
1470
1471 init = chunk * pr->u.p.count++;
1472 trip = pr->u.p.tc - 1;
1473
1474 if ((status = (init <= trip)) == 0) {
1475 *p_lb = 0;
1476 *p_ub = 0;
1477 // if ( p_last != NULL )
1478 // *p_last = 0;
1479 if (p_st != NULL)
1480 *p_st = 0;
1481 if (__kmp_env_consistency_check) {
1482 if (pr->pushed_ws != ct_none) {
1483 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1484 }
1485 }
1486 } else {
1487 start = pr->u.p.lb;
1488 limit = chunk + init - 1;
1489 incr = pr->u.p.st;
1490
1491 if ((last = (limit >= trip)) != 0) {
1492 limit = trip;
1493#if KMP_OS_WINDOWS
1494 pr->u.p.last_upper = pr->u.p.ub;
1495#endif /* KMP_OS_WINDOWS */
1496 }
1497 if (p_last != NULL)
1498 *p_last = last;
1499 if (p_st != NULL)
1500 *p_st = incr;
1501 if (incr == 1) {
1502 *p_lb = start + init;
1503 *p_ub = start + limit;
1504 } else {
1505 *p_lb = start + init * incr;
1506 *p_ub = start + limit * incr;
1507 }
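// [Annotation] init and limit are zero-based logical iteration indices;
// user-visible bounds are recovered as lb + index * stride. For example,
// for (i = 100; i > 0; i -= 2) gives start = 100, incr = -2, so a logical
// chunk [init..limit] maps to i in [100 - 2*init .. 100 - 2*limit]. The
// incr == 1 branch only avoids the multiplication in the common
// unit-stride case.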
1508
1509 if (pr->ordered) {
1510 pr->u.p.ordered_lower = init;
1511 pr->u.p.ordered_upper = limit;
1512#ifdef KMP_DEBUG
1513 {
1514 const char *buff;
1515 // create format specifiers before the debug output
1516 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1517 "ordered_lower:%%%s ordered_upper:%%%s\n",
1518 traits_t<UT>::spec, traits_t<UT>::spec);
1519 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1520 pr->u.p.ordered_upper));
1521 __kmp_str_free(&buff);
1522 }
1523#endif
1524 } // if
1525 } // if
1526 } else {
1527 pr->u.p.tc = 0;
1528 *p_lb = pr->u.p.lb;
1529 *p_ub = pr->u.p.ub;
1530#if KMP_OS_WINDOWS
1531 pr->u.p.last_upper = *p_ub;
1532#endif /* KMP_OS_WINDOWS */
1533 if (p_last != NULL)
1534 *p_last = TRUE;
1535 if (p_st != NULL)
1536 *p_st = pr->u.p.st;
1537 } // if
1538#ifdef KMP_DEBUG
Jim Cownie5e8470a2013-09-27 10:38:44 +00001539 {
Jonathan Peyton30419822017-05-12 18:01:32 +00001540 const char *buff;
1541 // create format specifiers before the debug output
1542 buff = __kmp_str_format(
1543 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1544 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1545 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1546 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1547 __kmp_str_free(&buff);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001548 }
Jonathan Peyton30419822017-05-12 18:01:32 +00001549#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001550#if INCLUDE_SSC_MARKS
1551 SSC_MARK_DISPATCH_NEXT();
1552#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001553 OMPT_LOOP_END;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001554 return status;
Jonathan Peyton30419822017-05-12 18:01:32 +00001555 } else {
1556 kmp_int32 last = 0;
1557 dispatch_shared_info_template<UT> *sh;
1558 T start;
1559 ST incr;
1560 UT limit, trip, init;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001561
Jonathan Peyton30419822017-05-12 18:01:32 +00001562 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1563 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001564
Jonathan Peyton30419822017-05-12 18:01:32 +00001565 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1566 th->th.th_dispatch->th_dispatch_pr_current);
1567 KMP_DEBUG_ASSERT(pr);
1568 sh = reinterpret_cast<dispatch_shared_info_template<UT> *>(
1569 th->th.th_dispatch->th_dispatch_sh_current);
1570 KMP_DEBUG_ASSERT(sh);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001571
Jonathan Peyton30419822017-05-12 18:01:32 +00001572 if (pr->u.p.tc == 0) {
1573 // zero trip count
1574 status = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001575 } else {
Jonathan Peyton30419822017-05-12 18:01:32 +00001576 switch (pr->schedule) {
1577#if (KMP_STATIC_STEAL_ENABLED)
1578 case kmp_sch_static_steal: {
1579 T chunk = pr->u.p.parm1;
1580 int nproc = th->th.th_team_nproc;
Jonathan Peyton45be4502015-08-11 21:36:41 +00001581
Jonathan Peyton30419822017-05-12 18:01:32 +00001582 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n",
1583 gtid));
1584
1585 trip = pr->u.p.tc - 1;
1586
1587 if (traits_t<T>::type_size > 4) {
1588 // use lock for 8-byte and CAS for 4-byte induction
1589 // variable. TODO (optional): check and use 16-byte CAS
1590 kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1591 KMP_DEBUG_ASSERT(lck != NULL);
1592 if (pr->u.p.count < (UT)pr->u.p.ub) {
1593 __kmp_acquire_lock(lck, gtid);
1594 // try to get own chunk of iterations
1595 init = (pr->u.p.count)++;
1596 status = (init < (UT)pr->u.p.ub);
1597 __kmp_release_lock(lck, gtid);
1598 } else {
1599 status = 0; // no own chunks
1600 }
1601 if (!status) { // try to steal
1602 kmp_info_t **other_threads = team->t.t_threads;
1603 int while_limit = nproc; // nproc attempts to find a victim
1604 int while_index = 0;
1605 // TODO: the algorithm for searching for a victim
1606 // should be cleaned up and measured
1607 while ((!status) && (while_limit != ++while_index)) {
1608 T remaining;
1609 T victimIdx = pr->u.p.parm4;
1610 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1611 dispatch_private_info_template<T> *victim =
1612 reinterpret_cast<dispatch_private_info_template<T> *>(
1613 other_threads[victimIdx]
1614 ->th.th_dispatch->th_dispatch_pr_current);
1615 while ((victim == NULL || victim == pr ||
1616 (*(volatile T *)&victim->u.p.static_steal_counter !=
1617 *(volatile T *)&pr->u.p.static_steal_counter)) &&
1618 oldVictimIdx != victimIdx) {
1619 victimIdx = (victimIdx + 1) % nproc;
1620 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1621 other_threads[victimIdx]
1622 ->th.th_dispatch->th_dispatch_pr_current);
Jonathan Peytonbd3a7632017-09-27 20:36:27 +00001623 }
Jonathan Peyton30419822017-05-12 18:01:32 +00001624 if (!victim ||
1625 (*(volatile T *)&victim->u.p.static_steal_counter !=
1626 *(volatile T *)&pr->u.p.static_steal_counter)) {
1627 continue; // try once more (nproc attempts in total)
1628 // no victim is ready yet to participate in stealing
1629 // because all victims are still in kmp_init_dispatch
1630 }
1631 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1632 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1633 continue; // not enough chunks to steal, goto next victim
1634 }
1635
1636 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1637 KMP_ASSERT(lck != NULL);
1638 __kmp_acquire_lock(lck, gtid);
1639 limit = victim->u.p.ub; // keep initial ub
1640 if (victim->u.p.count >= limit ||
1641 (remaining = limit - victim->u.p.count) < 2) {
1642 __kmp_release_lock(lck, gtid);
1643 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1644 continue; // not enough chunks to steal
1645 }
1646 // stealing succeeded, reduce victim's ub by 1/4 of undone chunks
1647 // or by 1
1648 if (remaining > 3) {
1649 KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2);
1650 init = (victim->u.p.ub -=
1651 (remaining >> 2)); // steal 1/4 of remaining
1652 } else {
1653 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
1654 init =
1655 (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining
1656 }
1657 __kmp_release_lock(lck, gtid);
1658
1659 KMP_DEBUG_ASSERT(init + 1 <= limit);
1660 pr->u.p.parm4 = victimIdx; // remember victim to steal from
1661 status = 1;
1662 while_index = 0;
1663 // now update own count and ub with stolen range but init chunk
1664 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1665 pr->u.p.count = init + 1;
1666 pr->u.p.ub = limit;
1667 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1668 } // while (search for victim)
1669 } // if (try to find victim and steal)
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001670 } else {
Jonathan Peyton30419822017-05-12 18:01:32 +00001671 // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1672 typedef union {
1673 struct {
1674 UT count;
1675 T ub;
1676 } p;
1677 kmp_int64 b;
1678 } union_i4;
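// [Annotation] Packing (count, ub) into one 64-bit word lets a single CAS
// observe and update both fields atomically, so an owner bumping count and
// a thief lowering ub can never interleave partially; that is why this
// 4-byte-induction path needs no lock. Standalone sketch of the pattern
// (assumes C++11 and a little-endian target, illustration only):
//
//   std::atomic<uint64_t> word; // low 32 bits: count, high 32 bits: ub
//   uint64_t old = word.load(std::memory_order_relaxed);
//   while (!word.compare_exchange_weak(old, old + 1))
//     ; // retry; old is refreshed on failure, +1 bumps the count half
//       // (ignoring the count-overflow corner case for brevity)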
1679 // All operations on 'count' or 'ub' must be combined atomically
1680 // together.
1681 {
1682 union_i4 vold, vnew;
1683 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1684 vnew = vold;
1685 vnew.p.count++;
1686 while (!KMP_COMPARE_AND_STORE_ACQ64(
1687 (volatile kmp_int64 *)&pr->u.p.count,
1688 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1689 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1690 KMP_CPU_PAUSE();
1691 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1692 vnew = vold;
1693 vnew.p.count++;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001694 }
Jonathan Peyton30419822017-05-12 18:01:32 +00001695 vnew = vold;
1696 init = vnew.p.count;
1697 status = (init < (UT)vnew.p.ub);
1698 }
1699
1700 if (!status) {
1701 kmp_info_t **other_threads = team->t.t_threads;
1702 int while_limit = nproc; // nproc attempts to find a victim
1703 int while_index = 0;
1704
1705 // TODO: the algorithm for searching for a victim
1706 // should be cleaned up and measured
1707 while ((!status) && (while_limit != ++while_index)) {
1708 union_i4 vold, vnew;
1709 kmp_int32 remaining;
1710 T victimIdx = pr->u.p.parm4;
1711 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1712 dispatch_private_info_template<T> *victim =
1713 reinterpret_cast<dispatch_private_info_template<T> *>(
1714 other_threads[victimIdx]
1715 ->th.th_dispatch->th_dispatch_pr_current);
1716 while ((victim == NULL || victim == pr ||
1717 (*(volatile T *)&victim->u.p.static_steal_counter !=
1718 *(volatile T *)&pr->u.p.static_steal_counter)) &&
1719 oldVictimIdx != victimIdx) {
1720 victimIdx = (victimIdx + 1) % nproc;
1721 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1722 other_threads[victimIdx]
1723 ->th.th_dispatch->th_dispatch_pr_current);
Jonathan Peytonbd3a7632017-09-27 20:36:27 +00001724 }
Jonathan Peyton30419822017-05-12 18:01:32 +00001725 if (!victim ||
1726 (*(volatile T *)&victim->u.p.static_steal_counter !=
1727 *(volatile T *)&pr->u.p.static_steal_counter)) {
1728 continue; // try once more (nproc attempts in total)
1729 // no victim is ready yet to participate in stealing
1730 // because all victims are still in kmp_init_dispatch
1731 }
1732 pr->u.p.parm4 = victimIdx; // new victim found
1733 while (1) { // CAS loop if victim has enough chunks to steal
1734 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1735 vnew = vold;
1736
1737 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1738 if (vnew.p.count >= (UT)vnew.p.ub ||
1739 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1740 pr->u.p.parm4 =
1741 (victimIdx + 1) % nproc; // shift start victim id
1742 break; // not enough chunks to steal, goto next victim
1743 }
1744 if (remaining > 3) {
1745 vnew.p.ub -= (remaining >> 2); // try to steal 1/4 remaining
1746 } else {
1747 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1748 }
1749 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1750 // TODO: Should this be acquire or release?
1751 if (KMP_COMPARE_AND_STORE_ACQ64(
1752 (volatile kmp_int64 *)&victim->u.p.count,
1753 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1754 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1755 // stealing succeeded
1756 KMP_COUNT_VALUE(FOR_static_steal_stolen,
1757 vold.p.ub - vnew.p.ub);
1758 status = 1;
1759 while_index = 0;
1760 // now update own count and ub
1761 init = vnew.p.ub;
1762 vold.p.count = init + 1;
1763#if KMP_ARCH_X86
1764 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count),
1765 vold.b);
1766#else
1767 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1768#endif
1769 break;
1770 } // if (check CAS result)
1771 KMP_CPU_PAUSE(); // CAS failed, repeat attempt
1772 } // while (try to steal from particular victim)
1773 } // while (search for victim)
1774 } // if (try to find victim and steal)
1775 } // if (4-byte induction variable)
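// [Annotation] Both steal paths use the same heuristic: take a quarter of
// the victim's unclaimed chunks (at least one). E.g. a victim with 40
// chunks left has its ub cut from 40 to 30 and the thief starts at
// init = 30, taking 10 chunks; with only 2 or 3 left, exactly one is
// taken. Proportional steals keep contention low while still spreading
// leftover work quickly.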
1776 if (!status) {
1777 *p_lb = 0;
1778 *p_ub = 0;
1779 if (p_st != NULL)
1780 *p_st = 0;
1781 } else {
1782 start = pr->u.p.parm2;
1783 init *= chunk;
1784 limit = chunk + init - 1;
1785 incr = pr->u.p.st;
1786 KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);
1787
1788 KMP_DEBUG_ASSERT(init <= trip);
1789 if ((last = (limit >= trip)) != 0)
1790 limit = trip;
1791 if (p_st != NULL)
1792 *p_st = incr;
1793
1794 if (incr == 1) {
1795 *p_lb = start + init;
1796 *p_ub = start + limit;
1797 } else {
1798 *p_lb = start + init * incr;
1799 *p_ub = start + limit * incr;
1800 }
1801
1802 if (pr->ordered) {
1803 pr->u.p.ordered_lower = init;
1804 pr->u.p.ordered_upper = limit;
1805#ifdef KMP_DEBUG
1806 {
1807 const char *buff;
1808 // create format specifiers before the debug output
1809 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1810 "ordered_lower:%%%s ordered_upper:%%%s\n",
1811 traits_t<UT>::spec, traits_t<UT>::spec);
1812 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1813 pr->u.p.ordered_upper));
1814 __kmp_str_free(&buff);
1815 }
1816#endif
1817 } // if
1818 } // if
1819 break;
1820 } // case
1821#endif // ( KMP_STATIC_STEAL_ENABLED )
1822 case kmp_sch_static_balanced: {
1823 KD_TRACE(
1824 100,
1825 ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid));
1826 if ((status = !pr->u.p.count) !=
1827 0) { /* check if thread has any iteration to do */
1828 pr->u.p.count = 1;
1829 *p_lb = pr->u.p.lb;
1830 *p_ub = pr->u.p.ub;
1831 last = pr->u.p.parm1;
1832 if (p_st != NULL)
1833 *p_st = pr->u.p.st;
1834 } else { /* no iterations to do */
1835 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001836 }
Jonathan Peyton30419822017-05-12 18:01:32 +00001837 if (pr->ordered) {
1838#ifdef KMP_DEBUG
1839 {
1840 const char *buff;
1841 // create format specifiers before the debug output
1842 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1843 "ordered_lower:%%%s ordered_upper:%%%s\n",
1844 traits_t<UT>::spec, traits_t<UT>::spec);
1845 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1846 pr->u.p.ordered_upper));
1847 __kmp_str_free(&buff);
1848 }
1849#endif
1850 } // if
1851 } // case
1852 break;
1853 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1854 merged here */
1855 case kmp_sch_static_chunked: {
1856 T parm1;
1857
1858 KD_TRACE(100, ("__kmp_dispatch_next: T#%d "
1859 "kmp_sch_static_[affinity|chunked] case\n",
1860 gtid));
1861 parm1 = pr->u.p.parm1;
1862
1863 trip = pr->u.p.tc - 1;
1864 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
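// [Annotation] Static chunking is blocked-cyclic: thread t takes chunk
// indices t, t + nproc, t + 2*nproc, ... (count advances by nproc below).
// E.g. with nproc = 4, chunk = 2 and tc = 20, thread 1 receives the
// iteration ranges [2,3], [10,11] and [18,19] on successive calls.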
1865
1866 if ((status = (init <= trip)) != 0) {
1867 start = pr->u.p.lb;
1868 incr = pr->u.p.st;
1869 limit = parm1 + init - 1;
1870
1871 if ((last = (limit >= trip)) != 0)
1872 limit = trip;
1873
1874 if (p_st != NULL)
1875 *p_st = incr;
1876
1877 pr->u.p.count += th->th.th_team_nproc;
1878
1879 if (incr == 1) {
1880 *p_lb = start + init;
1881 *p_ub = start + limit;
1882 } else {
1883 *p_lb = start + init * incr;
1884 *p_ub = start + limit * incr;
1885 }
1886
1887 if (pr->ordered) {
1888 pr->u.p.ordered_lower = init;
1889 pr->u.p.ordered_upper = limit;
1890#ifdef KMP_DEBUG
1891 {
1892 const char *buff;
1893 // create format specifiers before the debug output
1894 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1895 "ordered_lower:%%%s ordered_upper:%%%s\n",
1896 traits_t<UT>::spec, traits_t<UT>::spec);
1897 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1898 pr->u.p.ordered_upper));
1899 __kmp_str_free(&buff);
1900 }
1901#endif
1902 } // if
1903 } // if
1904 } // case
1905 break;
1906
1907 case kmp_sch_dynamic_chunked: {
1908 T chunk = pr->u.p.parm1;
1909
1910 KD_TRACE(
1911 100,
1912 ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid));
1913
1914 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1915 trip = pr->u.p.tc - 1;
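// [Annotation] sh->u.s.iteration is a shared chunk counter: the
// fetch-and-increment hands each caller a unique chunk index k owning
// logical iterations [k*chunk, k*chunk + chunk - 1], clipped to trip.
// E.g. tc = 10, chunk = 4: k = 0 -> [0,3], k = 1 -> [4,7], k = 2 -> [8,9],
// and k = 3 finds init > trip, so status becomes 0 (no work left).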
1916
1917 if ((status = (init <= trip)) == 0) {
1918 *p_lb = 0;
1919 *p_ub = 0;
1920 if (p_st != NULL)
1921 *p_st = 0;
1922 } else {
1923 start = pr->u.p.lb;
1924 limit = chunk + init - 1;
1925 incr = pr->u.p.st;
1926
1927 if ((last = (limit >= trip)) != 0)
1928 limit = trip;
1929
1930 if (p_st != NULL)
1931 *p_st = incr;
1932
1933 if (incr == 1) {
1934 *p_lb = start + init;
1935 *p_ub = start + limit;
1936 } else {
1937 *p_lb = start + init * incr;
1938 *p_ub = start + limit * incr;
1939 }
1940
1941 if (pr->ordered) {
1942 pr->u.p.ordered_lower = init;
1943 pr->u.p.ordered_upper = limit;
1944#ifdef KMP_DEBUG
1945 {
1946 const char *buff;
1947 // create format specifiers before the debug output
1948 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1949 "ordered_lower:%%%s ordered_upper:%%%s\n",
1950 traits_t<UT>::spec, traits_t<UT>::spec);
1951 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1952 pr->u.p.ordered_upper));
1953 __kmp_str_free(&buff);
1954 }
1955#endif
1956 } // if
1957 } // if
1958 } // case
1959 break;
1960
1961 case kmp_sch_guided_iterative_chunked: {
1962 T chunkspec = pr->u.p.parm1;
1963 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
1964 "iterative case\n",
1965 gtid));
1966 trip = pr->u.p.tc;
1967 // Start atomic part of calculations
1968 while (1) {
1969 ST remaining; // signed, because can be < 0
1970 init = sh->u.s.iteration; // shared value
1971 remaining = trip - init;
1972 if (remaining <= 0) { // AC: need to compare with 0 first
1973 // nothing to do, don't try atomic op
1974 status = 0;
1975 break;
1976 }
1977 if ((T)remaining <
1978 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1979 // use dynamic-style schedule
1980 // atomically increment iterations, get old value
Andrey Churbanovc47afcd2017-07-03 11:24:08 +00001981 init = test_then_add<ST>(
Andrey Churbanov5ba90c72017-07-17 09:03:14 +00001982 RCAST(volatile ST *, &sh->u.s.iteration), (ST)chunkspec);
Jonathan Peyton30419822017-05-12 18:01:32 +00001983 remaining = trip - init;
1984 if (remaining <= 0) {
1985 status = 0; // all iterations claimed by other threads
1986 } else { // got some iterations to work on
1987 status = 1;
1988 if ((T)remaining > chunkspec) {
1989 limit = init + chunkspec - 1;
1990 } else {
1991 last = 1; // the last chunk
1992 limit = init + remaining - 1;
1993 } // if
1994 } // if
1995 break;
1996 } // if
1997 limit = init + (UT)(remaining *
1998 *(double *)&pr->u.p.parm3); // divide by K*nproc
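// [Annotation] parm3 caches the guided decay factor as a double (per the
// comment above, roughly 1/(K*nproc)), so a successful CAS claims about
// remaining/(K*nproc) iterations; chunks therefore shrink geometrically
// as the loop drains. E.g. with nproc = 4, K = 2 and 800 iterations
// remaining, a thread claims about 100 of them.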
Andrey Churbanov5ba90c72017-07-17 09:03:14 +00001999 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
Andrey Churbanovc47afcd2017-07-03 11:24:08 +00002000 (ST)init, (ST)limit)) {
Jonathan Peyton30419822017-05-12 18:01:32 +00002001 // CAS was successful, chunk obtained
2002 status = 1;
2003 --limit;
2004 break;
2005 } // if
2006 } // while
2007 if (status != 0) {
2008 start = pr->u.p.lb;
2009 incr = pr->u.p.st;
2010 if (p_st != NULL)
2011 *p_st = incr;
2012 *p_lb = start + init * incr;
2013 *p_ub = start + limit * incr;
2014 if (pr->ordered) {
2015 pr->u.p.ordered_lower = init;
2016 pr->u.p.ordered_upper = limit;
2017#ifdef KMP_DEBUG
2018 {
2019 const char *buff;
2020 // create format specifiers before the debug output
2021 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2022 "ordered_lower:%%%s ordered_upper:%%%s\n",
2023 traits_t<UT>::spec, traits_t<UT>::spec);
2024 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2025 pr->u.p.ordered_upper));
2026 __kmp_str_free(&buff);
2027 }
2028#endif
2029 } // if
2030 } else {
2031 *p_lb = 0;
2032 *p_ub = 0;
2033 if (p_st != NULL)
2034 *p_st = 0;
2035 } // if
2036 } // case
2037 break;
2038
Andrey Churbanovd454c732017-06-05 17:17:33 +00002039 case kmp_sch_guided_simd: {
2040 // same as iterative but curr-chunk adjusted to be multiple of given
2041 // chunk
2042 T chunk = pr->u.p.parm1;
2043 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_simd case\n",
2044 gtid));
2045 trip = pr->u.p.tc;
2046 // Start atomic part of calculations
2047 while (1) {
2048 ST remaining; // signed, because can be < 0
2049 init = sh->u.s.iteration; // shared value
2050 remaining = trip - init;
2051 if (remaining <= 0) { // AC: need to compare with 0 first
2052 status = 0; // nothing to do, don't try atomic op
2053 break;
2054 }
2055 KMP_DEBUG_ASSERT(init % chunk == 0);
2056 // compare with K*nproc*(chunk+1), K=2 by default
2057 if ((T)remaining < pr->u.p.parm2) {
2058 // use dynamic-style schedule
2059 // atomically increment iterations, get old value
Andrey Churbanovc47afcd2017-07-03 11:24:08 +00002060 init = test_then_add<ST>(
Andrey Churbanov5ba90c72017-07-17 09:03:14 +00002061 RCAST(volatile ST *, &sh->u.s.iteration), (ST)chunk);
Andrey Churbanovd454c732017-06-05 17:17:33 +00002062 remaining = trip - init;
2063 if (remaining <= 0) {
2064 status = 0; // all iterations claimed by other threads
2065 } else {
2066 // got some iterations to work on
2067 status = 1;
2068 if ((T)remaining > chunk) {
2069 limit = init + chunk - 1;
2070 } else {
2071 last = 1; // the last chunk
2072 limit = init + remaining - 1;
2073 } // if
2074 } // if
2075 break;
2076 } // if
2077 // divide by K*nproc
2078 UT span = remaining * (*(double *)&pr->u.p.parm3);
2079 UT rem = span % chunk;
2080 if (rem) // adjust so that span%chunk == 0
2081 span += chunk - rem;
2082 limit = init + span;
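// [Annotation] Rounding span up to a multiple of chunk keeps every claimed
// range SIMD-aligned: e.g. chunk = 8 and remaining*factor = 21 give
// span = 24, so the next thread's init stays a multiple of 8 (which the
// KMP_DEBUG_ASSERT(init % chunk == 0) above relies on).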
Andrey Churbanov5ba90c72017-07-17 09:03:14 +00002083 if (compare_and_swap<ST>(RCAST(volatile ST *, &sh->u.s.iteration),
Andrey Churbanovc47afcd2017-07-03 11:24:08 +00002084 (ST)init, (ST)limit)) {
Andrey Churbanovd454c732017-06-05 17:17:33 +00002085 // CAS was successful, chunk obtained
2086 status = 1;
2087 --limit;
2088 break;
2089 } // if
2090 } // while
2091 if (status != 0) {
2092 start = pr->u.p.lb;
2093 incr = pr->u.p.st;
2094 if (p_st != NULL)
2095 *p_st = incr;
2096 *p_lb = start + init * incr;
2097 *p_ub = start + limit * incr;
2098 if (pr->ordered) {
2099 pr->u.p.ordered_lower = init;
2100 pr->u.p.ordered_upper = limit;
2101#ifdef KMP_DEBUG
2102 {
2103 const char *buff;
2104 // create format specifiers before the debug output
2105 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2106 "ordered_lower:%%%s ordered_upper:%%%s\n",
2107 traits_t<UT>::spec, traits_t<UT>::spec);
2108 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2109 pr->u.p.ordered_upper));
2110 __kmp_str_free(&buff);
2111 }
2112#endif
2113 } // if
2114 } else {
2115 *p_lb = 0;
2116 *p_ub = 0;
2117 if (p_st != NULL)
2118 *p_st = 0;
2119 } // if
2120 } // case
2121 break;
2122
Jonathan Peyton30419822017-05-12 18:01:32 +00002123 case kmp_sch_guided_analytical_chunked: {
2124 T chunkspec = pr->u.p.parm1;
2125 UT chunkIdx;
2126#if KMP_OS_WINDOWS && KMP_ARCH_X86
2127 /* for storing original FPCW value for Windows* OS on
2128 IA-32 architecture 8-byte version */
2129 unsigned int oldFpcw;
2130 unsigned int fpcwSet = 0;
2131#endif
2132 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
2133 "analytical case\n",
2134 gtid));
2135
2136 trip = pr->u.p.tc;
2137
2138 KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
2139 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc <
2140 trip);
2141
2142 while (1) { /* this while loop is a safeguard against unexpected zero
2143 chunk sizes */
2144 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
2145 if (chunkIdx >= (UT)pr->u.p.parm2) {
2146 --trip;
2147 /* use dynamic-style scheduling */
2148 init = chunkIdx * chunkspec + pr->u.p.count;
2149 /* need to verify init > 0 in case of overflow in the above
2150 * calculation */
2151 if ((status = (init > 0 && init <= trip)) != 0) {
2152 limit = init + chunkspec - 1;
2153
2154 if ((last = (limit >= trip)) != 0)
2155 limit = trip;
2156 }
2157 break;
2158 } else {
2159/* use exponential-style scheduling */
2160/* The following check is to workaround the lack of long double precision on
2161 Windows* OS.
2162 This check works around the possible effect that init != 0 for chunkIdx == 0.
2163 */
2164#if KMP_OS_WINDOWS && KMP_ARCH_X86
2165 /* If we haven't already done so, save original FPCW and set
2166 precision to 64-bit, as Windows* OS on IA-32 architecture
2167 defaults to 53-bit */
2168 if (!fpcwSet) {
2169 oldFpcw = _control87(0, 0);
2170 _control87(_PC_64, _MCW_PC);
2171 fpcwSet = 0x30000;
2172 }
2173#endif
2174 if (chunkIdx) {
2175 init = __kmp_dispatch_guided_remaining<T>(
2176 trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
2177 KMP_DEBUG_ASSERT(init);
2178 init = trip - init;
2179 } else
2180 init = 0;
2181 limit = trip - __kmp_dispatch_guided_remaining<T>(
2182 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
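// [Annotation] __kmp_dispatch_guided_remaining(trip, base, k) evaluates
// how many iterations are still undone after k chunks under the analytical
// guided model, so chunk k spans [trip - remaining(k),
// trip - remaining(k+1) - 1]. If init == limit the chunk is empty and the
// surrounding while loop simply retries with the next chunkIdx.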
2183 KMP_ASSERT(init <= limit);
2184 if (init < limit) {
2185 KMP_DEBUG_ASSERT(limit <= trip);
2186 --limit;
2187 status = 1;
2188 break;
2189 } // if
2190 } // if
2191 } // while (1)
2192#if KMP_OS_WINDOWS && KMP_ARCH_X86
2193 /* restore FPCW if necessary
2194 AC: check fpcwSet flag first because oldFpcw can be uninitialized
2195 here */
2196 if (fpcwSet && (oldFpcw & fpcwSet))
2197 _control87(oldFpcw, _MCW_PC);
2198#endif
2199 if (status != 0) {
2200 start = pr->u.p.lb;
2201 incr = pr->u.p.st;
2202 if (p_st != NULL)
2203 *p_st = incr;
2204 *p_lb = start + init * incr;
2205 *p_ub = start + limit * incr;
2206 if (pr->ordered) {
2207 pr->u.p.ordered_lower = init;
2208 pr->u.p.ordered_upper = limit;
2209#ifdef KMP_DEBUG
2210 {
2211 const char *buff;
2212 // create format specifiers before the debug output
2213 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2214 "ordered_lower:%%%s ordered_upper:%%%s\n",
2215 traits_t<UT>::spec, traits_t<UT>::spec);
2216 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2217 pr->u.p.ordered_upper));
2218 __kmp_str_free(&buff);
2219 }
2220#endif
2221 }
2222 } else {
2223 *p_lb = 0;
2224 *p_ub = 0;
2225 if (p_st != NULL)
2226 *p_st = 0;
2227 }
2228 } // case
2229 break;
2230
2231 case kmp_sch_trapezoidal: {
2232 UT index;
2233 T parm2 = pr->u.p.parm2;
2234 T parm3 = pr->u.p.parm3;
2235 T parm4 = pr->u.p.parm4;
2236 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2237 gtid));
2238
2239 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
2240
2241 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
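// [Annotation] Trapezoidal chunks shrink linearly: chunk i has size
// parm2 - i*parm4 and parm3 caps the number of chunks, so init is the
// closed-form prefix sum
//   sum_{i=0}^{index-1} (parm2 - i*parm4) = index*(2*parm2 - (index-1)*parm4)/2.
// E.g. parm2 = 10, parm4 = 2 yields chunk sizes 10, 8, 6, ... starting at
// iterations 0, 10, 18, ...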
2242 trip = pr->u.p.tc - 1;
2243
2244 if ((status = ((T)index < parm3 && init <= trip)) == 0) {
2245 *p_lb = 0;
2246 *p_ub = 0;
2247 if (p_st != NULL)
2248 *p_st = 0;
2249 } else {
2250 start = pr->u.p.lb;
2251 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
2252 incr = pr->u.p.st;
2253
2254 if ((last = (limit >= trip)) != 0)
2255 limit = trip;
2256
2257 if (p_st != NULL)
2258 *p_st = incr;
2259
2260 if (incr == 1) {
2261 *p_lb = start + init;
2262 *p_ub = start + limit;
2263 } else {
2264 *p_lb = start + init * incr;
2265 *p_ub = start + limit * incr;
2266 }
2267
2268 if (pr->ordered) {
2269 pr->u.p.ordered_lower = init;
2270 pr->u.p.ordered_upper = limit;
2271#ifdef KMP_DEBUG
2272 {
2273 const char *buff;
2274 // create format specifiers before the debug output
2275 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2276 "ordered_lower:%%%s ordered_upper:%%%s\n",
2277 traits_t<UT>::spec, traits_t<UT>::spec);
2278 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2279 pr->u.p.ordered_upper));
2280 __kmp_str_free(&buff);
2281 }
2282#endif
2283 } // if
2284 } // if
2285 } // case
2286 break;
2287 default: {
2288 status = 0; // to avoid complaints on uninitialized variable use
Jonathan Peyton6a393f72017-09-05 15:43:58 +00002289 __kmp_fatal(KMP_MSG(UnknownSchedTypeDetected), // Primary message
2290 KMP_HNT(GetNewerLibrary), // Hint
2291 __kmp_msg_null // Variadic argument list terminator
2292 );
Jonathan Peyton30419822017-05-12 18:01:32 +00002293 } break;
2294 } // switch
2295 } // if tc == 0;
2296
2297 if (status == 0) {
2298 UT num_done;
2299
2300 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2301#ifdef KMP_DEBUG
2302 {
2303 const char *buff;
2304 // create format specifiers before the debug output
2305 buff = __kmp_str_format(
2306 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2307 traits_t<UT>::spec);
2308 KD_TRACE(100, (buff, gtid, sh->u.s.num_done));
2309 __kmp_str_free(&buff);
2310 }
2311#endif
2312
2313 if ((ST)num_done == th->th.th_team_nproc - 1) {
2314#if (KMP_STATIC_STEAL_ENABLED)
2315 if (pr->schedule == kmp_sch_static_steal &&
2316 traits_t<T>::type_size > 4) {
2317 int i;
2318 kmp_info_t **other_threads = team->t.t_threads;
2319 // loop complete, safe to destroy locks used for stealing
2320 for (i = 0; i < th->th.th_team_nproc; ++i) {
2321 kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2322 KMP_ASSERT(lck != NULL);
2323 __kmp_destroy_lock(lck);
2324 __kmp_free(lck);
2325 other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2326 }
2327 }
2328#endif
2329 /* NOTE: release this buffer to be reused */
2330
2331 KMP_MB(); /* Flush all pending memory write invalidates. */
2332
2333 sh->u.s.num_done = 0;
2334 sh->u.s.iteration = 0;
2335
2336 /* TODO replace with general release procedure? */
2337 if (pr->ordered) {
2338 sh->u.s.ordered_iteration = 0;
2339 }
2340
2341 KMP_MB(); /* Flush all pending memory write invalidates. */
2342
2343 sh->buffer_index += __kmp_dispatch_num_buffers;
2344 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2345 gtid, sh->buffer_index));
2346
2347 KMP_MB(); /* Flush all pending memory write invalidates. */
2348
2349 } // if
2350 if (__kmp_env_consistency_check) {
2351 if (pr->pushed_ws != ct_none) {
2352 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2353 }
2354 }
2355
2356 th->th.th_dispatch->th_deo_fcn = NULL;
2357 th->th.th_dispatch->th_dxo_fcn = NULL;
2358 th->th.th_dispatch->th_dispatch_sh_current = NULL;
2359 th->th.th_dispatch->th_dispatch_pr_current = NULL;
2360 } // if (status == 0)
2361#if KMP_OS_WINDOWS
2362 else if (last) {
2363 pr->u.p.last_upper = pr->u.p.ub;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002364 }
Jonathan Peyton30419822017-05-12 18:01:32 +00002365#endif /* KMP_OS_WINDOWS */
2366 if (p_last != NULL && status != 0)
2367 *p_last = last;
2368 } // if
2369
2370#ifdef KMP_DEBUG
2371 {
2372 const char *buff;
2373 // create format specifiers before the debug output
2374 buff = __kmp_str_format(
2375 "__kmp_dispatch_next: T#%%d normal case: "
2376 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2377 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2378 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status));
2379 __kmp_str_free(&buff);
2380 }
2381#endif
2382#if INCLUDE_SSC_MARKS
2383 SSC_MARK_DISPATCH_NEXT();
2384#endif
2385 OMPT_LOOP_END;
2386 return status;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002387}
2388
Jonathan Peyton30419822017-05-12 18:01:32 +00002389template <typename T>
2390static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2391 kmp_int32 *plastiter, T *plower, T *pupper,
2392 typename traits_t<T>::signed_t incr) {
2393 typedef typename traits_t<T>::unsigned_t UT;
2394 typedef typename traits_t<T>::signed_t ST;
Ed Maste414544c2017-07-07 21:06:05 +00002395 kmp_uint32 team_id;
2396 kmp_uint32 nteams;
2397 UT trip_count;
2398 kmp_team_t *team;
Jonathan Peyton30419822017-05-12 18:01:32 +00002399 kmp_info_t *th;
2400
2401 KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2402 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2403#ifdef KMP_DEBUG
2404 {
2405 const char *buff;
2406 // create format specifiers before the debug output
2407 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2408 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2409 traits_t<T>::spec, traits_t<T>::spec,
2410 traits_t<ST>::spec, traits_t<T>::spec);
2411 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2412 __kmp_str_free(&buff);
2413 }
2414#endif
2415
2416 if (__kmp_env_consistency_check) {
2417 if (incr == 0) {
2418 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2419 loc);
2420 }
2421 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2422 // The loop is illegal.
2423 // Some zero-trip loops maintained by compiler, e.g.:
2424 // for(i=10;i<0;++i) // lower >= upper - run-time check
2425 // for(i=0;i>10;--i) // lower <= upper - run-time check
2426 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2427 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2428 // Compiler does not check the following illegal loops:
2429 // for(i=0;i<10;i+=incr) // where incr<0
2430 // for(i=10;i>0;i-=incr) // where incr<0
2431 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2432 }
2433 }
2434 th = __kmp_threads[gtid];
2435 team = th->th.th_team;
2436#if OMP_40_ENABLED
2437 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2438 nteams = th->th.th_teams_size.nteams;
2439#endif
2440 team_id = team->t.t_master_tid;
2441 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2442
2443 // compute global trip count
2444 if (incr == 1) {
2445 trip_count = *pupper - *plower + 1;
2446 } else if (incr == -1) {
2447 trip_count = *plower - *pupper + 1;
2448 } else if (incr > 0) {
2449 // upper-lower can exceed the limit of signed type
2450 trip_count = (UT)(*pupper - *plower) / incr + 1;
2451 } else {
2452 trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2453 }
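// [Annotation, worked example] For a teams distribute over i in [0,99]
// with incr = 1 and nteams = 4, trip_count = 100 > nteams, so the balanced
// branch below computes chunk = 25, extras = 0 and team t covers
// [25t, 25t + 24]. With trip_count = 102: chunk = 25, extras = 2, teams 0
// and 1 get 26 iterations each, teams 2 and 3 get 25.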
2454
2455 if (trip_count <= nteams) {
2456 KMP_DEBUG_ASSERT(
2457 __kmp_static == kmp_sch_static_greedy ||
2458 __kmp_static ==
2459 kmp_sch_static_balanced); // Unknown static scheduling type.
2460 // only some teams get single iteration, others get nothing
2461 if (team_id < trip_count) {
2462 *pupper = *plower = *plower + team_id * incr;
2463 } else {
2464 *plower = *pupper + incr; // zero-trip loop
2465 }
2466 if (plastiter != NULL)
2467 *plastiter = (team_id == trip_count - 1);
2468 } else {
2469 if (__kmp_static == kmp_sch_static_balanced) {
Ed Maste414544c2017-07-07 21:06:05 +00002470 UT chunk = trip_count / nteams;
2471 UT extras = trip_count % nteams;
Jonathan Peyton30419822017-05-12 18:01:32 +00002472 *plower +=
2473 incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2474 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2475 if (plastiter != NULL)
2476 *plastiter = (team_id == nteams - 1);
2477 } else {
Ed Maste414544c2017-07-07 21:06:05 +00002478 T chunk_inc_count =
Jonathan Peyton30419822017-05-12 18:01:32 +00002479 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
Ed Maste414544c2017-07-07 21:06:05 +00002480 T upper = *pupper;
Jonathan Peyton30419822017-05-12 18:01:32 +00002481 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2482 // Unknown static scheduling type.
2483 *plower += team_id * chunk_inc_count;
2484 *pupper = *plower + chunk_inc_count - incr;
2485 // Check/correct bounds if needed
2486 if (incr > 0) {
2487 if (*pupper < *plower)
2488 *pupper = traits_t<T>::max_value;
2489 if (plastiter != NULL)
2490 *plastiter = *plower <= upper && *pupper > upper - incr;
2491 if (*pupper > upper)
2492 *pupper = upper; // tracker C73258
2493 } else {
2494 if (*pupper > *plower)
2495 *pupper = traits_t<T>::min_value;
2496 if (plastiter != NULL)
2497 *plastiter = *plower >= upper && *pupper < upper - incr;
2498 if (*pupper < upper)
2499 *pupper = upper; // tracker C73258
2500 }
2501 }
2502 }
2503}
2504
2505//-----------------------------------------------------------------------------
Jim Cownie5e8470a2013-09-27 10:38:44 +00002506// Dispatch routines
2507// Transfer call to template< type T >
2508// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2509// T lb, T ub, ST st, ST chunk )
2510extern "C" {
2511
2512/*!
2513@ingroup WORK_SHARING
2514@{
2515@param loc Source location
2516@param gtid Global thread id
2517@param schedule Schedule type
2518@param lb Lower bound
2519@param ub Upper bound
2520@param st Step (or increment if you prefer)
2521@param chunk The chunk size to block with
2522
Jonathan Peyton30419822017-05-12 18:01:32 +00002523This function prepares the runtime to start a dynamically scheduled for loop,
2524saving the loop arguments.
Jim Cownie5e8470a2013-09-27 10:38:44 +00002525These functions are all identical apart from the types of the arguments.
2526*/
2527
Jonathan Peyton30419822017-05-12 18:01:32 +00002528void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2529 enum sched_type schedule, kmp_int32 lb,
2530 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2531 KMP_DEBUG_ASSERT(__kmp_init_serial);
2532 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002533}
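// [Annotation, hypothetical compiler lowering] A compiler would translate
//   #pragma omp for schedule(dynamic, 4)
//   for (int i = lo; i < hi; ++i) body(i);
// into roughly the following (names abbreviated, bounds are inclusive):
//
//   kmp_int32 last, lb, ub, st;
//   __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked,
//                          lo, hi - 1, 1, 4);
//   while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st))
//     for (kmp_int32 i = lb; i <= ub; i += st) body(i);
//
// Each __kmpc_dispatch_next_4 call yields one chunk's inclusive bounds and
// returns zero once no work remains.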
2534/*!
2535See @ref __kmpc_dispatch_init_4
2536*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002537void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2538 enum sched_type schedule, kmp_uint32 lb,
2539 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2540 KMP_DEBUG_ASSERT(__kmp_init_serial);
2541 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002542}
2543
2544/*!
2545See @ref __kmpc_dispatch_init_4
2546*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002547void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2548 enum sched_type schedule, kmp_int64 lb,
2549 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2550 KMP_DEBUG_ASSERT(__kmp_init_serial);
2551 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002552}
2553
2554/*!
2555See @ref __kmpc_dispatch_init_4
2556*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002557void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2558 enum sched_type schedule, kmp_uint64 lb,
2559 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2560 KMP_DEBUG_ASSERT(__kmp_init_serial);
2561 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002562}
2563
2564/*!
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002565See @ref __kmpc_dispatch_init_4
2566
2567Difference from __kmpc_dispatch_init set of functions is these functions
2568are called for composite distribute parallel for construct. Thus before
2569regular iterations dispatching we need to calc per-team iteration space.
2570
2571These functions are all identical apart from the types of the arguments.
2572*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002573void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2574 enum sched_type schedule, kmp_int32 *p_last,
2575 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2576 kmp_int32 chunk) {
2577 KMP_DEBUG_ASSERT(__kmp_init_serial);
2578 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2579 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002580}
2581
Jonathan Peyton30419822017-05-12 18:01:32 +00002582void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2583 enum sched_type schedule, kmp_int32 *p_last,
2584 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2585 kmp_int32 chunk) {
2586 KMP_DEBUG_ASSERT(__kmp_init_serial);
2587 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2588 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002589}
2590
Jonathan Peyton30419822017-05-12 18:01:32 +00002591void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2592 enum sched_type schedule, kmp_int32 *p_last,
2593 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2594 kmp_int64 chunk) {
2595 KMP_DEBUG_ASSERT(__kmp_init_serial);
2596 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2597 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002598}
2599
Jonathan Peyton30419822017-05-12 18:01:32 +00002600void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2601 enum sched_type schedule, kmp_int32 *p_last,
2602 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2603 kmp_int64 chunk) {
2604 KMP_DEBUG_ASSERT(__kmp_init_serial);
2605 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2606 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002607}
2608
2609/*!
Jim Cownie5e8470a2013-09-27 10:38:44 +00002610@param loc Source code location
2611@param gtid Global thread id
Jonathan Peyton30419822017-05-12 18:01:32 +00002612@param p_last Pointer to a flag set to one if this is the last chunk or zero
2613otherwise
Jim Cownie5e8470a2013-09-27 10:38:44 +00002614@param p_lb Pointer to the lower bound for the next chunk of work
2615@param p_ub Pointer to the upper bound for the next chunk of work
2616@param p_st Pointer to the stride for the next chunk of work
2617@return one if there is work to be done, zero otherwise
2618
2619Get the next dynamically allocated chunk of work for this thread.
2620If there is no more work, then the lb,ub and stride need not be modified.
2621*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002622int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2623 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2624 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002625}
2626
2627/*!
2628See @ref __kmpc_dispatch_next_4
2629*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002630int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2631 kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2632 kmp_int32 *p_st) {
2633 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002634}
2635
2636/*!
2637See @ref __kmpc_dispatch_next_4
2638*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002639int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2640 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2641 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002642}
2643
2644/*!
2645See @ref __kmpc_dispatch_next_4
2646*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002647int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2648 kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2649 kmp_int64 *p_st) {
2650 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002651}
2652
2653/*!
2654@param loc Source code location
2655@param gtid Global thread id
2656
2657Mark the end of a dynamic loop.
2658*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002659void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2660 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002661}
2662
2663/*!
2664See @ref __kmpc_dispatch_fini_4
2665*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002666void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2667 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002668}
2669
2670/*!
2671See @ref __kmpc_dispatch_fini_4
2672*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002673void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2674 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002675}
2676
2677/*!
2678See @ref __kmpc_dispatch_fini_4
2679*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002680void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2681 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002682}
2683/*! @} */
2684
Jonathan Peyton30419822017-05-12 18:01:32 +00002685//-----------------------------------------------------------------------------
2686// Non-template routines from kmp_dispatch.cpp used in other sources
Jim Cownie5e8470a2013-09-27 10:38:44 +00002687
Jonathan Peyton30419822017-05-12 18:01:32 +00002688kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2689 return value == checker;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002690}
2691
Jonathan Peyton30419822017-05-12 18:01:32 +00002692kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2693 return value != checker;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002694}
2695
Jonathan Peyton30419822017-05-12 18:01:32 +00002696kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2697 return value < checker;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002698}
2699
Jonathan Peyton30419822017-05-12 18:01:32 +00002700kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2701 return value >= checker;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002702}
2703
Jonathan Peyton30419822017-05-12 18:01:32 +00002704kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2705 return value <= checker;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002706}
Jim Cownie5e8470a2013-09-27 10:38:44 +00002707
2708kmp_uint32
Jonathan Peyton30419822017-05-12 18:01:32 +00002709__kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2710 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2711 void *obj // Higher-level synchronization object, or NULL.
2712 ) {
2713 // note: we may not belong to a team at this point
Ed Maste414544c2017-07-07 21:06:05 +00002714 volatile kmp_uint32 *spin = spinner;
2715 kmp_uint32 check = checker;
2716 kmp_uint32 spins;
2717 kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2718 kmp_uint32 r;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002719
Andrey Churbanovc47afcd2017-07-03 11:24:08 +00002720 KMP_FSYNC_SPIN_INIT(obj, CCAST(kmp_uint32 *, spin));
Jonathan Peyton30419822017-05-12 18:01:32 +00002721 KMP_INIT_YIELD(spins);
2722 // main wait spin loop
2723 while (!f(r = TCR_4(*spin), check)) {
2724 KMP_FSYNC_SPIN_PREPARE(obj);
2725 /* GEH - remove this since it was accidentally introduced when kmp_wait was
2726 split. It causes problems with infinite recursion because of exit lock */
2727 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2728 __kmp_abort_thread(); */
Jim Cownie5e8470a2013-09-27 10:38:44 +00002729
Jonathan Peyton30419822017-05-12 18:01:32 +00002730 /* if we have waited a bit, or are oversubscribed, yield */
2731 /* pause is in the following code */
2732 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2733 KMP_YIELD_SPIN(spins);
2734 }
2735 KMP_FSYNC_SPIN_ACQUIRED(obj);
2736 return r;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002737}
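// [Annotation, hypothetical usage] A caller spins until a shared word
// satisfies the supplied predicate, e.g.
//
//   volatile kmp_uint32 flag = 0;  // set to 1 by some other thread
//   __kmp_wait_yield_4(&flag, 1, __kmp_eq_4, NULL); // returns when flag==1
//
// yielding the processor when oversubscribed instead of burning a core;
// the templated __kmp_wait_yield<UT> used by the dispatcher has the same
// shape for other integer widths.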
2738
Jonathan Peyton30419822017-05-12 18:01:32 +00002739void __kmp_wait_yield_4_ptr(
2740 void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2741 void *obj // Higher-level synchronization object, or NULL.
2742 ) {
2743 // note: we may not belong to a team at this point
Ed Maste414544c2017-07-07 21:06:05 +00002744 void *spin = spinner;
2745 kmp_uint32 check = checker;
2746 kmp_uint32 spins;
2747 kmp_uint32 (*f)(void *, kmp_uint32) = pred;
Paul Osmialowskif7cc6af2016-05-31 20:20:32 +00002748
Jonathan Peyton30419822017-05-12 18:01:32 +00002749 KMP_FSYNC_SPIN_INIT(obj, spin);
2750 KMP_INIT_YIELD(spins);
2751 // main wait spin loop
2752 while (!f(spin, check)) {
2753 KMP_FSYNC_SPIN_PREPARE(obj);
2754 /* if we have waited a bit, or are oversubscribed, yield */
2755 /* pause is in the following code */
2756 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2757 KMP_YIELD_SPIN(spins);
2758 }
2759 KMP_FSYNC_SPIN_ACQUIRED(obj);
Paul Osmialowskif7cc6af2016-05-31 20:20:32 +00002760}
2761
Jim Cownie5e8470a2013-09-27 10:38:44 +00002762} // extern "C"
2763
2764#ifdef KMP_GOMP_COMPAT
2765
Jonathan Peyton30419822017-05-12 18:01:32 +00002766void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2767 enum sched_type schedule, kmp_int32 lb,
2768 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2769 int push_ws) {
2770 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2771 push_ws);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002772}
2773
Jonathan Peyton30419822017-05-12 18:01:32 +00002774void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2775 enum sched_type schedule, kmp_uint32 lb,
2776 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2777 int push_ws) {
2778 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2779 push_ws);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002780}
2781
Jonathan Peyton30419822017-05-12 18:01:32 +00002782void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2783 enum sched_type schedule, kmp_int64 lb,
2784 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2785 int push_ws) {
2786 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2787 push_ws);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002788}
2789
Jonathan Peyton30419822017-05-12 18:01:32 +00002790void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2791 enum sched_type schedule, kmp_uint64 lb,
2792 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2793 int push_ws) {
2794 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2795 push_ws);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002796}
2797
Jonathan Peyton30419822017-05-12 18:01:32 +00002798void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2799 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002800}
2801
Jonathan Peyton30419822017-05-12 18:01:32 +00002802void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2803 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002804}
2805
Jonathan Peyton30419822017-05-12 18:01:32 +00002806void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2807 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002808}
2809
Jonathan Peyton30419822017-05-12 18:01:32 +00002810void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2811 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002812}
2813
2814#endif /* KMP_GOMP_COMPAT */
2815
2816/* ------------------------------------------------------------------------ */