/*
 * kmp_dispatch.cpp: dynamic scheduling - iteration initialization and dispatch.
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


/* Dynamic scheduling initialization and dispatch.
 *
 * NOTE: __kmp_nth is a constant inside of any dispatch loop, however
 *       it may change values between parallel regions.  __kmp_max_nth
 *       is the largest value __kmp_nth may take, 1 is the smallest.
 */

// Need to raise Win version from XP to Vista here for support of
// InterlockedExchange64
#if defined(_WIN32_WINNT) && defined(_M_IX86)
#undef _WIN32_WINNT
#define _WIN32_WINNT 0x0502
#endif

#include "kmp.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_itt.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#if KMP_OS_WINDOWS && KMP_ARCH_X86
#include <float.h>
#endif

#if OMPT_SUPPORT
#include "ompt-internal.h"
#include "ompt-specific.h"
#endif

/* ------------------------------------------------------------------------ */

#if KMP_STATIC_STEAL_ENABLED

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  UT count; // unsigned
  T ub;
  /* Adding KMP_ALIGN_CACHE here doesn't help / can hurt performance */
  T lb;
  ST st; // signed
  UT tc; // unsigned
  T static_steal_counter; // for static_steal only; maybe better to put after ub

  /* parm[1-4] are used in different ways by different scheduling algorithms */

  // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
  //    a) parm3 is properly aligned and
  //    b) all parm1-4 are in the same cache line.
  // Because parm1-4 are used together, performance seems to be better
  // if they are in the same cache line (not measured though).

  struct KMP_ALIGN(32) { // compiler does not accept sizeof(T)*4
    T parm1;
    T parm2;
    T parm3;
    T parm4;
  };

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#else /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info{32,64} structures and
// dispatch_private_info{32,64}_t types
template <typename T> struct dispatch_private_infoXX_template {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  T lb;
  T ub;
  ST st; // signed
  UT tc; // unsigned

  T parm1;
  T parm2;
  T parm3;
  T parm4;

  UT count; // unsigned

  UT ordered_lower; // unsigned
  UT ordered_upper; // unsigned
#if KMP_OS_WINDOWS
  T last_upper;
#endif /* KMP_OS_WINDOWS */
};

#endif /* KMP_STATIC_STEAL_ENABLED */

// replaces dispatch_private_info structure and dispatch_private_info_t type
template <typename T> struct KMP_ALIGN_CACHE dispatch_private_info_template {
  // duplicate alignment here, otherwise size of structure is not correct in our
  // compiler
  union KMP_ALIGN_CACHE private_info_tmpl {
    dispatch_private_infoXX_template<T> p;
    dispatch_private_info64_t p64;
  } u;
  enum sched_type schedule; /* scheduling algorithm */
  kmp_uint32 ordered; /* ordered clause specified */
  kmp_uint32 ordered_bumped;
  // To retain the structure size after making ordered_iteration scalar
  kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3];
  dispatch_private_info *next; /* stack of buffers for nest of serial regions */
  kmp_uint32 nomerge; /* don't merge iters if serialized */
  kmp_uint32 type_size;
  enum cons_type pushed_ws;
};

// replaces dispatch_shared_info{32,64} structures and
// dispatch_shared_info{32,64}_t types
template <typename UT> struct dispatch_shared_infoXX_template {
  /* chunk index under dynamic, number of idle threads under static-steal;
     iteration index otherwise */
  volatile UT iteration;
  volatile UT num_done;
  volatile UT ordered_iteration;
  // to retain the structure size making ordered_iteration scalar
  UT ordered_dummy[KMP_MAX_ORDERED - 3];
};

// replaces dispatch_shared_info structure and dispatch_shared_info_t type
template <typename UT> struct dispatch_shared_info_template {
  // we need union here to keep the structure size
  union shared_info_tmpl {
    dispatch_shared_infoXX_template<UT> s;
    dispatch_shared_info64_t s64;
  } u;
  volatile kmp_uint32 buffer_index;
#if OMP_45_ENABLED
  volatile kmp_int32 doacross_buf_idx; // teamwise index
  kmp_uint32 *doacross_flags; // array of iteration flags (0/1)
  kmp_int32 doacross_num_done; // count finished threads
#endif
#if KMP_USE_HWLOC
  // When linking with libhwloc, the ORDERED EPCC test slows down on big
  // machines (> 48 cores). Performance analysis showed that a cache thrash
  // was occurring and this padding helps alleviate the problem.
  char padding[64];
#endif
};

/* ------------------------------------------------------------------------ */

#undef USE_TEST_LOCKS

// test_then_add template (general template should NOT be used)
template <typename T> static __forceinline T test_then_add(volatile T *p, T d);

template <>
__forceinline kmp_int32 test_then_add<kmp_int32>(volatile kmp_int32 *p,
                                                 kmp_int32 d) {
  kmp_int32 r;
  r = KMP_TEST_THEN_ADD32(p, d);
  return r;
}

template <>
__forceinline kmp_int64 test_then_add<kmp_int64>(volatile kmp_int64 *p,
                                                 kmp_int64 d) {
  kmp_int64 r;
  r = KMP_TEST_THEN_ADD64(p, d);
  return r;
}

// test_then_inc_acq template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc_acq(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc_acq<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC_ACQ32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc_acq<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC_ACQ64(p);
  return r;
}

// test_then_inc template (general template should NOT be used)
template <typename T> static __forceinline T test_then_inc(volatile T *p);

template <>
__forceinline kmp_int32 test_then_inc<kmp_int32>(volatile kmp_int32 *p) {
  kmp_int32 r;
  r = KMP_TEST_THEN_INC32(p);
  return r;
}

template <>
__forceinline kmp_int64 test_then_inc<kmp_int64>(volatile kmp_int64 *p) {
  kmp_int64 r;
  r = KMP_TEST_THEN_INC64(p);
  return r;
}

// compare_and_swap template (general template should NOT be used)
template <typename T>
static __forceinline kmp_int32 compare_and_swap(volatile T *p, T c, T s);

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int32>(volatile kmp_int32 *p,
                                                    kmp_int32 c, kmp_int32 s) {
  return KMP_COMPARE_AND_STORE_REL32(p, c, s);
}

template <>
__forceinline kmp_int32 compare_and_swap<kmp_int64>(volatile kmp_int64 *p,
                                                    kmp_int64 c, kmp_int64 s) {
  return KMP_COMPARE_AND_STORE_REL64(p, c, s);
}

/* Spin wait loop that first does pause, then yield.
   Waits until function returns non-zero when called with *spinner and check.
   Does NOT put threads to sleep.
#if USE_ITT_BUILD
   Arguments:
       obj -- is a higher-level synchronization object to report to ittnotify.
       It is used to report locks consistently. For example, if the lock is
       acquired immediately, its address is reported to ittnotify via
       KMP_FSYNC_ACQUIRED(). However, if the lock cannot be acquired immediately
       and the lock routine calls KMP_WAIT_YIELD(), the latter should report the
       same address, not the address of the low-level spinner.
#endif // USE_ITT_BUILD
*/
template <typename UT>
// ToDo: make inline function (move to header file for icl)
static UT // unsigned 4- or 8-byte type
    __kmp_wait_yield(
        volatile UT *spinner, UT checker,
        kmp_uint32 (*pred)(UT, UT) USE_ITT_BUILD_ARG(
            void *obj) // Higher-level synchronization object, or NULL.
        ) {
  // note: we may not belong to a team at this point
  register volatile UT *spin = spinner;
  register UT check = checker;
  register kmp_uint32 spins;
  register kmp_uint32 (*f)(UT, UT) = pred;
  register UT r;

  KMP_FSYNC_SPIN_INIT(obj, (void *)spin);
  KMP_INIT_YIELD(spins);
  // main wait spin loop
  while (!f(r = *spin, check)) {
    KMP_FSYNC_SPIN_PREPARE(obj);
    /* GEH - remove this since it was accidentally introduced when kmp_wait was
       split. It causes problems with infinite recursion because of exit lock */
    /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
       __kmp_abort_thread(); */

    // If we are oversubscribed, or have waited a bit (and
    // KMP_LIBRARY=throughput), then yield. Pause is in the following code.
    KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
    KMP_YIELD_SPIN(spins);
  }
  KMP_FSYNC_SPIN_ACQUIRED(obj);
  return r;
}
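// Example usage (a minimal sketch; this exact call appears in
// __kmp_dispatch_init() below): block until the team's shared buffer index
// catches up with this thread's buffer index, yielding while oversubscribed:
//   __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
//                                __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));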

template <typename UT> static kmp_uint32 __kmp_eq(UT value, UT checker) {
  return value == checker;
}

template <typename UT> static kmp_uint32 __kmp_neq(UT value, UT checker) {
  return value != checker;
}

template <typename UT> static kmp_uint32 __kmp_lt(UT value, UT checker) {
  return value < checker;
}

template <typename UT> static kmp_uint32 __kmp_ge(UT value, UT checker) {
  return value >= checker;
}

template <typename UT> static kmp_uint32 __kmp_le(UT value, UT checker) {
  return value <= checker;
}

/* ------------------------------------------------------------------------ */

static void __kmp_dispatch_deo_error(int *gtid_ref, int *cid_ref,
                                     ident_t *loc_ref) {
  kmp_info_t *th;

  KMP_DEBUG_ASSERT(gtid_ref);

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_root->r.r_active &&
        (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none)) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }
}

template <typename UT>
static void __kmp_dispatch_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_pdo, loc_ref, NULL);
#endif
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    UT lower;

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }
    lower = pr->u.p.ordered_lower;

#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB();
#ifdef KMP_DEBUG
    {
      const char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d before wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif

    __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                         __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
    KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
    {
      const char *buff;
      // create format specifiers before the debug output
      buff = __kmp_str_format("__kmp_dispatch_deo: T#%%d after wait: "
                              "ordered_iter:%%%s lower:%%%s\n",
                              traits_t<UT>::spec, traits_t<UT>::spec);
      KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
      __kmp_str_free(&buff);
    }
#endif
  }
  KD_TRACE(100, ("__kmp_dispatch_deo: T#%d returned\n", gtid));
}

static void __kmp_dispatch_dxo_error(int *gtid_ref, int *cid_ref,
                                     ident_t *loc_ref) {
  kmp_info_t *th;

  if (__kmp_env_consistency_check) {
    th = __kmp_threads[*gtid_ref];
    if (th->th.th_dispatch->th_dispatch_pr_current->pushed_ws != ct_none) {
      __kmp_pop_sync(*gtid_ref, ct_ordered_in_pdo, loc_ref);
    }
  }
}

template <typename UT>
static void __kmp_dispatch_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  typedef typename traits_t<UT>::signed_t ST;
  dispatch_private_info_template<UT> *pr;

  int gtid = *gtid_ref;
  // int cid = *cid_ref;
  kmp_info_t *th = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(th->th.th_dispatch);

  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d called\n", gtid));
  if (__kmp_env_consistency_check) {
    pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
        th->th.th_dispatch->th_dispatch_pr_current);
    if (pr->pushed_ws != ct_none) {
      __kmp_pop_sync(gtid, ct_ordered_in_pdo, loc_ref);
    }
  }

  if (!th->th.th_team->t.t_serialized) {
    dispatch_shared_info_template<UT> *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_sh_current);

    if (!__kmp_env_consistency_check) {
      pr = reinterpret_cast<dispatch_private_info_template<UT> *>(
          th->th.th_dispatch->th_dispatch_pr_current);
    }

    KMP_FSYNC_RELEASING(&sh->u.s.ordered_iteration);
#if !defined(KMP_GOMP_COMPAT)
    if (__kmp_env_consistency_check) {
      if (pr->ordered_bumped != 0) {
        struct cons_header *p = __kmp_threads[gtid]->th.th_cons;
        /* How to test it? - OM */
        __kmp_error_construct2(kmp_i18n_msg_CnsMultipleNesting,
                               ct_ordered_in_pdo, loc_ref,
                               &p->stack_data[p->w_top]);
      }
    }
#endif /* !defined(KMP_GOMP_COMPAT) */

    KMP_MB(); /* Flush all pending memory write invalidates. */

    pr->ordered_bumped += 1;

    KD_TRACE(1000,
             ("__kmp_dispatch_dxo: T#%d bumping ordered ordered_bumped=%d\n",
              gtid, pr->ordered_bumped));

    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* TODO use general release procedure? */
    test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
  KD_TRACE(100, ("__kmp_dispatch_dxo: T#%d returned\n", gtid));
}

// Computes and returns x to the power of y, where y must be a non-negative
// integer.
template <typename UT>
static __forceinline long double __kmp_pow(long double x, UT y) {
  long double s = 1.0L;

  KMP_DEBUG_ASSERT(x > 0.0 && x < 1.0);
  // KMP_DEBUG_ASSERT(y >= 0); // y is unsigned
  while (y) {
    if (y & 1)
      s *= x;
    x *= x;
    y >>= 1;
  }
  return s;
}
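// Illustrative example (not part of the original source): __kmp_pow(0.5, 5)
// evaluates 0.5^5 = 0.03125 by repeated squaring, i.e. roughly log2(y)
// squarings instead of y multiplications.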

/* Computes and returns the number of unassigned iterations after idx chunks
   have been assigned (the total number of unassigned iterations in chunks with
   index greater than or equal to idx). __forceinline seems to be broken so that
   if we __forceinline this function, the behavior is wrong
   (one of the unit tests, sch_guided_analytical_basic.cpp, fails) */
template <typename T>
static __inline typename traits_t<T>::unsigned_t
__kmp_dispatch_guided_remaining(T tc, typename traits_t<T>::floating_t base,
                                typename traits_t<T>::unsigned_t idx) {
  /* Note: On Windows* OS on IA-32 architecture and Intel(R) 64, at least for
     ICL 8.1, long double arithmetic may not really have long double precision,
     even with /Qlong_double. Currently, we work around that in the caller code,
     by manipulating the FPCW for Windows* OS on IA-32 architecture. The lack
     of precision is not expected to be a correctness issue, though. */
  typedef typename traits_t<T>::unsigned_t UT;

  long double x = tc * __kmp_pow<UT>(base, idx);
  UT r = (UT)x;
  if (x == r)
    return r;
  return r + 1;
}
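// Illustrative example (not part of the original source): with tc = 1000,
// base = 0.75 and idx = 2, x = 1000 * 0.75^2 = 562.5, so 563 iterations are
// still unassigned; the result is rounded up unless x is an exact integer.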

// Parameters of the guided-iterative algorithm:
//   p2 = n * nproc * ( chunk + 1 )  // point of switching to dynamic
//   p3 = 1 / ( n * nproc )          // remaining iterations multiplier
// By default n = 2. For example, with n = 3 the chunk distribution will be
// flatter.
// With n = 1 the first chunk is the same as for static schedule, e.g. trip / nproc.
static int guided_int_param = 2;
static double guided_flt_param = 0.5; // = 1.0 / guided_int_param;
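// Illustrative example with the default n = 2: for nproc = 8 and chunk = 7,
// p2 = 2 * 8 * (7 + 1) = 128, so a thread switches to plain dynamic chunking
// once fewer than 128 iterations remain, and p3 = 0.5 / 8 = 0.0625, i.e. each
// guided chunk takes roughly 6.25% of the remaining iterations.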

// UT - unsigned flavor of T, ST - signed flavor of T,
// DBL - double if sizeof(T)==4, or long double if sizeof(T)==8
template <typename T>
static void
__kmp_dispatch_init(ident_t *loc, int gtid, enum sched_type schedule, T lb,
                    T ub, typename traits_t<T>::signed_t st,
                    typename traits_t<T>::signed_t chunk, int push_ws) {
  typedef typename traits_t<T>::unsigned_t UT;
  typedef typename traits_t<T>::signed_t ST;
  typedef typename traits_t<T>::floating_t DBL;

  int active;
  T tc;
  kmp_info_t *th;
  kmp_team_t *team;
  kmp_uint32 my_buffer_index;
  dispatch_private_info_template<T> *pr;
  dispatch_shared_info_template<UT> volatile *sh;

  KMP_BUILD_ASSERT(sizeof(dispatch_private_info_template<T>) ==
                   sizeof(dispatch_private_info));
  KMP_BUILD_ASSERT(sizeof(dispatch_shared_info_template<UT>) ==
                   sizeof(dispatch_shared_info));

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

#if INCLUDE_SSC_MARKS
  SSC_MARK_DISPATCH_INIT();
#endif
#ifdef KMP_DEBUG
  {
    const char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format("__kmp_dispatch_init: T#%%d called: schedule:%%d "
                            "chunk:%%%s lb:%%%s ub:%%%s st:%%%s\n",
                            traits_t<ST>::spec, traits_t<T>::spec,
                            traits_t<T>::spec, traits_t<ST>::spec);
    KD_TRACE(10, (buff, gtid, schedule, chunk, lb, ub, st));
    __kmp_str_free(&buff);
  }
#endif
  /* setup data */
  th = __kmp_threads[gtid];
  team = th->th.th_team;
  active = !team->t.t_serialized;
  th->th.th_ident = loc;

#if USE_ITT_BUILD
  kmp_uint64 cur_chunk = chunk;
  int itt_need_metadata_reporting = __itt_metadata_add_ptr &&
                                    __kmp_forkjoin_frames_mode == 3 &&
                                    KMP_MASTER_GTID(gtid) &&
#if OMP_40_ENABLED
                                    th->th.th_teams_microtask == NULL &&
#endif
                                    team->t.t_active_level == 1;
#endif
  if (!active) {
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        th->th.th_dispatch->th_disp_buffer); /* top of the stack */
  } else {
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    my_buffer_index = th->th.th_dispatch->th_disp_index++;

    /* What happens when number of threads changes, need to resize buffer? */
    pr = reinterpret_cast<dispatch_private_info_template<T> *>(
        &th->th.th_dispatch
             ->th_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
    sh = reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
        &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]);
  }

#if (KMP_STATIC_STEAL_ENABLED)
  if (SCHEDULE_HAS_NONMONOTONIC(schedule))
    // AC: we now have only one implementation of stealing, so use it
    schedule = kmp_sch_static_steal;
  else
#endif
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

  /* Pick up the nomerge/ordered bits from the scheduling type */
  if ((schedule >= kmp_nm_lower) && (schedule < kmp_nm_upper)) {
    pr->nomerge = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_nm_lower - kmp_sch_lower));
  } else {
    pr->nomerge = FALSE;
  }
  pr->type_size = traits_t<T>::type_size; // remember the size of variables
  if (kmp_ord_lower & schedule) {
    pr->ordered = TRUE;
    schedule =
        (enum sched_type)(((int)schedule) - (kmp_ord_lower - kmp_sch_lower));
  } else {
    pr->ordered = FALSE;
  }

  if (schedule == kmp_sch_static) {
    schedule = __kmp_static;
  } else {
    if (schedule == kmp_sch_runtime) {
      // Use the scheduling specified by OMP_SCHEDULE (or __kmp_sch_default if
      // not specified)
      schedule = team->t.t_sched.r_sched_type;
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      } else if (schedule == kmp_sch_static) {
        schedule = __kmp_static;
      }
      // Use the chunk size specified by OMP_SCHEDULE (or default if not
      // specified)
      chunk = team->t.t_sched.chunk;
#if USE_ITT_BUILD
      cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_init: T#%%d new: schedule:%%d chunk:%%%s\n",
            traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    } else {
      if (schedule == kmp_sch_guided_chunked) {
        schedule = __kmp_guided;
      }
      if (chunk <= 0) {
        chunk = KMP_DEFAULT_CHUNK;
      }
    }

    if (schedule == kmp_sch_auto) {
      // mapping and differentiation: in the __kmp_do_serial_initialize()
      schedule = __kmp_auto;
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init: kmp_sch_auto: T#%%d new: "
                                "schedule:%%d chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }

    /* guided analytical not safe for too many threads */
    if (schedule == kmp_sch_guided_analytical_chunked &&
        th->th.th_team_nproc > 1 << 20) {
      schedule = kmp_sch_guided_iterative_chunked;
      KMP_WARNING(DispatchManyThreads);
    }
    if (schedule == kmp_sch_runtime_simd) {
      // compiler provides simd_width in the chunk parameter
      schedule = team->t.t_sched.r_sched_type;
      // Detail the schedule if needed (global controls are differentiated
      // appropriately)
      if (schedule == kmp_sch_static || schedule == kmp_sch_auto ||
          schedule == __kmp_static) {
        schedule = kmp_sch_static_balanced_chunked;
      } else {
        if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) {
          schedule = kmp_sch_guided_simd;
        }
        chunk = team->t.t_sched.chunk * chunk;
      }
#if USE_ITT_BUILD
      cur_chunk = chunk;
#endif
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d"
                                " chunk:%%%s\n",
                                traits_t<ST>::spec);
        KD_TRACE(10, (buff, gtid, schedule, chunk));
        __kmp_str_free(&buff);
      }
#endif
    }
    pr->u.p.parm1 = chunk;
  }
  KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper),
              "unknown scheduling type");

  pr->u.p.count = 0;

  if (__kmp_env_consistency_check) {
    if (st == 0) {
      __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited,
                            (pr->ordered ? ct_pdo_ordered : ct_pdo), loc);
    }
  }
  // compute trip count
  if (st == 1) { // most common case
    if (ub >= lb) {
      tc = ub - lb + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  } else if (st < 0) {
    if (lb >= ub) {
      // AC: cast to unsigned is needed for loops like (i=2B; i>-2B; i-=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(lb - ub) / (-st) + 1;
    } else { // lb < ub
      tc = 0; // zero-trip
    }
  } else { // st > 0
    if (ub >= lb) {
      // AC: cast to unsigned is needed for loops like (i=-2B; i<2B; i+=1B),
      // where the division needs to be unsigned regardless of the result type
      tc = (UT)(ub - lb) / st + 1;
    } else { // ub < lb
      tc = 0; // zero-trip
    }
  }
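  // Illustrative example (not from the original source): for lb = 0, ub = 10,
  // st = 3 the trip count is (10 - 0) / 3 + 1 = 4 (iterations 0, 3, 6, 9);
  // for lb = 10, ub = 1, st = -2 it is (10 - 1) / 2 + 1 = 5 (10, 8, 6, 4, 2).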

  // Any half-decent optimizer will remove this test when the blocks are empty
  // since the macros expand to nothing when statistics are disabled.
  if (schedule == __kmp_static) {
    KMP_COUNT_BLOCK(OMP_FOR_static);
    KMP_COUNT_VALUE(FOR_static_iterations, tc);
  } else {
    KMP_COUNT_BLOCK(OMP_FOR_dynamic);
    KMP_COUNT_VALUE(FOR_dynamic_iterations, tc);
  }

  pr->u.p.lb = lb;
  pr->u.p.ub = ub;
  pr->u.p.st = st;
  pr->u.p.tc = tc;

#if KMP_OS_WINDOWS
  pr->u.p.last_upper = ub + st;
#endif /* KMP_OS_WINDOWS */

  /* NOTE: only the active parallel region(s) has active ordered sections */

  if (active) {
    if (pr->ordered == 0) {
      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo_error;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo_error;
    } else {
      pr->ordered_bumped = 0;

      pr->u.p.ordered_lower = 1;
      pr->u.p.ordered_upper = 0;

      th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo<UT>;
      th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo<UT>;
    }
  }

  if (__kmp_env_consistency_check) {
    enum cons_type ws = pr->ordered ? ct_pdo_ordered : ct_pdo;
    if (push_ws) {
      __kmp_push_workshare(gtid, ws, loc);
      pr->pushed_ws = ws;
    } else {
      __kmp_check_workshare(gtid, ws, loc);
      pr->pushed_ws = ct_none;
    }
  }

  switch (schedule) {
#if (KMP_STATIC_STEAL_ENABLED)
  case kmp_sch_static_steal: {
    T nproc = th->th.th_team_nproc;
    T ntc, init;

    KD_TRACE(100,
             ("__kmp_dispatch_init: T#%d kmp_sch_static_steal case\n", gtid));

    ntc = (tc % chunk ? 1 : 0) + tc / chunk;
    if (nproc > 1 && ntc >= nproc) {
      KMP_COUNT_BLOCK(OMP_FOR_static_steal);
      T id = __kmp_tid_from_gtid(gtid);
      T small_chunk, extras;

      small_chunk = ntc / nproc;
      extras = ntc % nproc;

      init = id * small_chunk + (id < extras ? id : extras);
      pr->u.p.count = init;
      pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0);
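      // Worked example (illustrative, not from the original source): with
      // tc = 100 and chunk = 8 there are ntc = 13 chunks; for nproc = 4,
      // small_chunk = 3 and extras = 1, so thread 0 initially owns chunk
      // indices [0,4) and threads 1-3 own [4,7), [7,10) and [10,13).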

      pr->u.p.parm2 = lb;
      // pr->pfields.parm3 = 0; // it's not used in static_steal
      pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid
      pr->u.p.st = st;
      if (traits_t<T>::type_size > 4) {
        // AC: TODO: check if 16-byte CAS available and use it to
        // improve performance (probably wait for explicit request
        // before spending time on this).
        // For now use dynamically allocated per-thread lock,
        // free memory in __kmp_dispatch_next when status==0.
        KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
        th->th.th_dispatch->th_steal_lock =
            (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
        __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
      }
      break;
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
                     "kmp_sch_static_balanced\n",
                     gtid));
      schedule = kmp_sch_static_balanced;
      /* too few iterations: fall-through to kmp_sch_static_balanced */
    } // if
    /* FALL-THROUGH to static balanced */
  } // case
#endif
  case kmp_sch_static_balanced: {
    T nproc = th->th.th_team_nproc;
    T init, limit;

    KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_balanced case\n",
                   gtid));

    if (nproc > 1) {
      T id = __kmp_tid_from_gtid(gtid);

      if (tc < nproc) {
        if (id < tc) {
          init = id;
          limit = id;
          pr->u.p.parm1 = (id == tc - 1); /* parm1 stores *plastiter */
        } else {
          pr->u.p.count = 1; /* means no more chunks to execute */
          pr->u.p.parm1 = FALSE;
          break;
        }
      } else {
        T small_chunk = tc / nproc;
        T extras = tc % nproc;
        init = id * small_chunk + (id < extras ? id : extras);
        limit = init + small_chunk - (id < extras ? 0 : 1);
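        // Worked example (illustrative, not from the original source): for
        // tc = 10 and nproc = 4, small_chunk = 2 and extras = 2, so threads
        // 0..3 get the ranges [0,2], [3,5], [6,7] and [8,9] respectively.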
        pr->u.p.parm1 = (id == nproc - 1);
      }
    } else {
      if (tc > 0) {
        init = 0;
        limit = tc - 1;
        pr->u.p.parm1 = TRUE;
      } else { // zero trip count
        pr->u.p.count = 1; /* means no more chunks to execute */
        pr->u.p.parm1 = FALSE;
        break;
      }
    }
#if USE_ITT_BUILD
    // Calculate chunk for metadata report
    if (itt_need_metadata_reporting)
      cur_chunk = limit - init + 1;
#endif
    if (st == 1) {
      pr->u.p.lb = lb + init;
      pr->u.p.ub = lb + limit;
    } else {
      // calculated upper bound, "ub" is user-defined upper bound
      T ub_tmp = lb + limit * st;
      pr->u.p.lb = lb + init * st;
      // adjust upper bound to "ub" if needed, so that MS lastprivate will match
      // it exactly
      if (st > 0) {
        pr->u.p.ub = (ub_tmp + st > ub ? ub : ub_tmp);
      } else {
        pr->u.p.ub = (ub_tmp + st < ub ? ub : ub_tmp);
      }
    }
    if (pr->ordered) {
      pr->u.p.ordered_lower = init;
      pr->u.p.ordered_upper = limit;
    }
    break;
  } // case
  case kmp_sch_static_balanced_chunked: {
    // similar to balanced, but chunk adjusted to multiple of simd width
    T nth = th->th.th_team_nproc;
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d runtime(simd:static)"
                   " -> falling-through to static_greedy\n",
                   gtid));
    schedule = kmp_sch_static_greedy;
    if (nth > 1)
      pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1);
    else
      pr->u.p.parm1 = tc;
    break;
  } // case
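  // Worked example for the chunk rounding above (illustrative): with tc = 1000,
  // nth = 8 and a simd width chunk = 8, (1000 + 7) / 8 = 125 is rounded up to
  // parm1 = 128, a multiple of the simd width (the mask trick relies on chunk
  // being a power of two, which simd widths are).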
  case kmp_sch_guided_iterative_chunked:
  case kmp_sch_guided_simd: {
    T nproc = th->th.th_team_nproc;
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked"
                   " case\n",
                   gtid));

    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        // when remaining iters become less than parm2 - switch to dynamic
        pr->u.p.parm2 = guided_int_param * nproc * (chunk + 1);
        *(double *)&pr->u.p.parm3 =
            guided_flt_param / nproc; // may occupy parm3 and parm4
      }
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n",
                     gtid));
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_guided_analytical_chunked: {
    T nproc = th->th.th_team_nproc;
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_analytical_chunked"
                   " case\n",
                   gtid));
    if (nproc > 1) {
      if ((2L * chunk + 1) * nproc >= tc) {
        /* chunk size too large, switch to dynamic */
        schedule = kmp_sch_dynamic_chunked;
      } else {
        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        DBL x;

#if KMP_OS_WINDOWS && KMP_ARCH_X86
        /* Linux* OS already has 64-bit computation by default for long double,
           and on Windows* OS on Intel(R) 64, /Qlong_double doesn't work. On
           Windows* OS on IA-32 architecture, we need to set precision to 64-bit
           instead of the default 53-bit. Even though long double doesn't work
           on Windows* OS on Intel(R) 64, the resulting lack of precision is not
           expected to impact the correctness of the algorithm, but this has not
           been mathematically proven. */
        // save original FPCW and set precision to 64-bit, as
        // Windows* OS on IA-32 architecture defaults to 53-bit
        unsigned int oldFpcw = _control87(0, 0);
        _control87(_PC_64, _MCW_PC); // 0,0x30000
#endif
        /* value used for comparison in solver for cross-over point */
        long double target = ((long double)chunk * 2 + 1) * nproc / tc;

        /* crossover point--chunk indexes equal to or greater than
           this point switch to dynamic-style scheduling */
        UT cross;

        /* commonly used term: (2 nproc - 1)/(2 nproc) */
        x = (long double)1.0 - (long double)0.5 / nproc;
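        // E.g. (illustrative) for nproc = 4, x = 1 - 0.5 / 4 = 0.875, which is
        // exactly (2 * 4 - 1) / (2 * 4) from the comment above.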

#ifdef KMP_DEBUG
        { // test natural alignment
          struct _test_a {
            char a;
            union {
              char b;
              DBL d;
            };
          } t;
          ptrdiff_t natural_alignment =
              (ptrdiff_t)&t.b - (ptrdiff_t)&t - (ptrdiff_t)1;
          //__kmp_warn( " %llx %llx %lld", (long long)&t.d, (long long)&t, (long
          // long)natural_alignment );
          KMP_DEBUG_ASSERT(
              (((ptrdiff_t)&pr->u.p.parm3) & (natural_alignment)) == 0);
        }
#endif // KMP_DEBUG

        /* save the term in thread private dispatch structure */
        *(DBL *)&pr->u.p.parm3 = x;

        /* solve for the crossover point to the nearest integer i for which C_i
           <= chunk */
        {
          UT left, right, mid;
          long double p;

          /* estimate initial upper and lower bound */

          /* doesn't matter what value right is as long as it is positive, but
             it affects performance of the solver */
          right = 229;
          p = __kmp_pow<UT>(x, right);
          if (p > target) {
            do {
              p *= p;
              right <<= 1;
            } while (p > target && right < (1 << 27));
            /* lower bound is previous (failed) estimate of upper bound */
            left = right >> 1;
          } else {
            left = 0;
          }

          /* bisection root-finding method */
          while (left + 1 < right) {
            mid = (left + right) / 2;
            if (__kmp_pow<UT>(x, mid) > target) {
              left = mid;
            } else {
              right = mid;
            }
          } // while
          cross = right;
        }
        /* assert sanity of computed crossover point */
        KMP_ASSERT(cross && __kmp_pow<UT>(x, cross - 1) > target &&
                   __kmp_pow<UT>(x, cross) <= target);

        /* save the crossover point in thread private dispatch structure */
        pr->u.p.parm2 = cross;

// C75803
#if ((KMP_OS_LINUX || KMP_OS_WINDOWS) && KMP_ARCH_X86) && (!defined(KMP_I8))
#define GUIDED_ANALYTICAL_WORKAROUND (*(DBL *)&pr->u.p.parm3)
#else
#define GUIDED_ANALYTICAL_WORKAROUND (x)
#endif
        /* dynamic-style scheduling offset */
        pr->u.p.count = tc - __kmp_dispatch_guided_remaining(
                                 tc, GUIDED_ANALYTICAL_WORKAROUND, cross) -
                        cross * chunk;
#if KMP_OS_WINDOWS && KMP_ARCH_X86
        // restore FPCW
        _control87(oldFpcw, _MCW_PC);
#endif
      } // if
    } else {
      KD_TRACE(100, ("__kmp_dispatch_init: T#%d falling-through to "
                     "kmp_sch_static_greedy\n",
                     gtid));
      schedule = kmp_sch_static_greedy;
      /* team->t.t_nproc == 1: fall-through to kmp_sch_static_greedy */
      pr->u.p.parm1 = tc;
    } // if
  } // case
  break;
  case kmp_sch_static_greedy:
    KD_TRACE(100,
             ("__kmp_dispatch_init: T#%d kmp_sch_static_greedy case\n", gtid));
    pr->u.p.parm1 = (th->th.th_team_nproc > 1)
                        ? (tc + th->th.th_team_nproc - 1) / th->th.th_team_nproc
                        : tc;
    break;
  case kmp_sch_static_chunked:
  case kmp_sch_dynamic_chunked:
    if (pr->u.p.parm1 <= 0) {
      pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
    }
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d "
                   "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                   gtid));
    break;
  case kmp_sch_trapezoidal: {
    /* TSS: trapezoid self-scheduling, minimum chunk_size = parm1 */

    T parm1, parm2, parm3, parm4;
    KD_TRACE(100,
             ("__kmp_dispatch_init: T#%d kmp_sch_trapezoidal case\n", gtid));

    parm1 = chunk;

    /* F : size of the first cycle */
    parm2 = (tc / (2 * th->th.th_team_nproc));

    if (parm2 < 1) {
      parm2 = 1;
    }

    /* L : size of the last cycle. Make sure the last cycle is not larger
       than the first cycle. */
    if (parm1 < 1) {
      parm1 = 1;
    } else if (parm1 > parm2) {
      parm1 = parm2;
    }

    /* N : number of cycles */
    parm3 = (parm2 + parm1);
    parm3 = (2 * tc + parm3 - 1) / parm3;

    if (parm3 < 2) {
      parm3 = 2;
    }

    /* sigma : decreasing incr of the trapezoid */
    parm4 = (parm3 - 1);
    parm4 = (parm2 - parm1) / parm4;

    // pointless check, because parm4 >= 0 always
    // if ( parm4 < 0 ) {
    //   parm4 = 0;
    //}

    pr->u.p.parm1 = parm1;
    pr->u.p.parm2 = parm2;
    pr->u.p.parm3 = parm3;
    pr->u.p.parm4 = parm4;
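    // Worked example (illustrative, not from the original source): tc = 1000,
    // nproc = 4, chunk = 1 gives parm2 = 1000 / 8 = 125 (first chunk size),
    // parm3 = ceil(2000 / 126) = 16 (number of chunks) and
    // parm4 = (125 - 1) / 15 = 8, so chunk sizes shrink as 125, 117, 109, ...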
  } // case
  break;

  default: {
    __kmp_msg(kmp_ms_fatal, // Severity
              KMP_MSG(UnknownSchedTypeDetected), // Primary message
              KMP_HNT(GetNewerLibrary), // Hint
              __kmp_msg_null // Variadic argument list terminator
              );
  } break;
  } // switch
  pr->schedule = schedule;
  if (active) {
    /* The name of this buffer should be my_buffer_index when it's free to use
     * it */

    KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));
    __kmp_wait_yield<kmp_uint32>(&sh->buffer_index, my_buffer_index,
                                 __kmp_eq<kmp_uint32> USE_ITT_BUILD_ARG(NULL));
    // Note: KMP_WAIT_YIELD() cannot be used there: buffer index and
    // my_buffer_index are *always* 32-bit integers.
    KMP_MB(); /* is this necessary? */
    KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d "
                   "sh->buffer_index:%d\n",
                   gtid, my_buffer_index, sh->buffer_index));

    th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr;
    th->th.th_dispatch->th_dispatch_sh_current = (dispatch_shared_info_t *)sh;
#if USE_ITT_BUILD
    if (pr->ordered) {
      __kmp_itt_ordered_init(gtid);
    }; // if
    // Report loop metadata
    if (itt_need_metadata_reporting) {
      // Only report metadata by master of active team at level 1
      kmp_uint64 schedtype = 0;
      switch (schedule) {
      case kmp_sch_static_chunked:
      case kmp_sch_static_balanced: // Chunk is calculated in the switch above
        break;
      case kmp_sch_static_greedy:
        cur_chunk = pr->u.p.parm1;
        break;
      case kmp_sch_dynamic_chunked:
        schedtype = 1;
        break;
      case kmp_sch_guided_iterative_chunked:
      case kmp_sch_guided_analytical_chunked:
      case kmp_sch_guided_simd:
        schedtype = 2;
        break;
      default:
        // Should we put this case under "static"?
        // case kmp_sch_static_steal:
        schedtype = 3;
        break;
      }
      __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk);
    }
#endif /* USE_ITT_BUILD */
  }; // if

#ifdef KMP_DEBUG
  {
    const char *buff;
    // create format specifiers before the debug output
    buff = __kmp_str_format(
        "__kmp_dispatch_init: T#%%d returning: schedule:%%d ordered:%%%s "
        "lb:%%%s ub:%%%s"
        " st:%%%s tc:%%%s count:%%%s\n\tordered_lower:%%%s ordered_upper:%%%s"
        " parm1:%%%s parm2:%%%s parm3:%%%s parm4:%%%s\n",
        traits_t<UT>::spec, traits_t<T>::spec, traits_t<T>::spec,
        traits_t<ST>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
        traits_t<UT>::spec, traits_t<UT>::spec, traits_t<T>::spec,
        traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec);
    KD_TRACE(10, (buff, gtid, pr->schedule, pr->ordered, pr->u.p.lb, pr->u.p.ub,
                  pr->u.p.st, pr->u.p.tc, pr->u.p.count, pr->u.p.ordered_lower,
                  pr->u.p.ordered_upper, pr->u.p.parm1, pr->u.p.parm2,
                  pr->u.p.parm3, pr->u.p.parm4));
    __kmp_str_free(&buff);
  }
#endif
#if (KMP_STATIC_STEAL_ENABLED)
  // It cannot be guaranteed that after execution of a loop with some other
  // schedule kind all the parm3 variables will contain the same value. Even if
  // all parm3 were the same, there would still be a bad case, e.g. reusing 0
  // and 1 rather than a program life-time increment. So a dedicated variable
  // is required; the 'static_steal_counter' is used.
  if (schedule == kmp_sch_static_steal) {
    // Other threads will inspect this variable when searching for a victim.
    // This is a flag showing that other threads may steal from this thread
    // since then.
    volatile T *p = &pr->u.p.static_steal_counter;
    *p = *p + 1;
  }
#endif // ( KMP_STATIC_STEAL_ENABLED )

#if OMPT_SUPPORT && OMPT_TRACE
  if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_begin)) {
    ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);
    ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
    ompt_callbacks.ompt_callback(ompt_event_loop_begin)(
        team_info->parallel_id, task_info->task_id, team_info->microtask);
  }
#endif
}

/* For ordered loops, either __kmp_dispatch_finish() should be called after
 * every iteration, or __kmp_dispatch_finish_chunk() should be called after
 * every chunk of iterations. If the ordered section(s) were not executed
 * for this iteration (or every iteration in this chunk), we need to set the
 * ordered iteration counters so that the next thread can proceed. */
template <typename UT>
static void __kmp_dispatch_finish(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {

    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    if (pr->ordered_bumped) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      UT lower = pr->u.p.ordered_lower;

#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d before wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));
      KMP_MB(); /* is this necessary? */
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format("__kmp_dispatch_finish: T#%%d after wait: "
                                "ordered_iteration:%%%s lower:%%%s\n",
                                traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower));
        __kmp_str_free(&buff);
      }
#endif

      test_then_inc<ST>((volatile ST *)&sh->u.s.ordered_iteration);
    } // if
  } // if
  KD_TRACE(100, ("__kmp_dispatch_finish: T#%d returned\n", gtid));
}

#ifdef KMP_GOMP_COMPAT

template <typename UT>
static void __kmp_dispatch_finish_chunk(int gtid, ident_t *loc) {
  typedef typename traits_t<UT>::signed_t ST;
  kmp_info_t *th = __kmp_threads[gtid];

  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d called\n", gtid));
  if (!th->th.th_team->t.t_serialized) {
    // int cid;
    dispatch_private_info_template<UT> *pr =
        reinterpret_cast<dispatch_private_info_template<UT> *>(
            th->th.th_dispatch->th_dispatch_pr_current);
    dispatch_shared_info_template<UT> volatile *sh =
        reinterpret_cast<dispatch_shared_info_template<UT> volatile *>(
            th->th.th_dispatch->th_dispatch_sh_current);
    KMP_DEBUG_ASSERT(pr);
    KMP_DEBUG_ASSERT(sh);
    KMP_DEBUG_ASSERT(th->th.th_dispatch ==
                     &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);

    // for (cid = 0; cid < KMP_MAX_ORDERED; ++cid) {
    UT lower = pr->u.p.ordered_lower;
    UT upper = pr->u.p.ordered_upper;
    UT inc = upper - lower + 1;

    if (pr->ordered_bumped == inc) {
      KD_TRACE(
          1000,
          ("__kmp_dispatch_finish: T#%d resetting ordered_bumped to zero\n",
           gtid));
      pr->ordered_bumped = 0;
    } else {
      inc -= pr->ordered_bumped;

#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d before wait: "
            "ordered_iteration:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec);
        KD_TRACE(1000, (buff, gtid, sh->u.s.ordered_iteration, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      __kmp_wait_yield<UT>(&sh->u.s.ordered_iteration, lower,
                           __kmp_ge<UT> USE_ITT_BUILD_ARG(NULL));

      KMP_MB(); /* is this necessary? */
      KD_TRACE(1000, ("__kmp_dispatch_finish_chunk: T#%d resetting "
                      "ordered_bumped to zero\n",
                      gtid));
      pr->ordered_bumped = 0;
//!!!!! TODO check if the inc should be unsigned, or signed???
#ifdef KMP_DEBUG
      {
        const char *buff;
        // create format specifiers before the debug output
        buff = __kmp_str_format(
            "__kmp_dispatch_finish_chunk: T#%%d after wait: "
            "ordered_iteration:%%%s inc:%%%s lower:%%%s upper:%%%s\n",
            traits_t<UT>::spec, traits_t<UT>::spec, traits_t<UT>::spec,
            traits_t<UT>::spec);
        KD_TRACE(1000,
                 (buff, gtid, sh->u.s.ordered_iteration, inc, lower, upper));
        __kmp_str_free(&buff);
      }
#endif

      test_then_add<ST>((volatile ST *)&sh->u.s.ordered_iteration, inc);
    }
    // }
  }
  KD_TRACE(100, ("__kmp_dispatch_finish_chunk: T#%d returned\n", gtid));
}

#endif /* KMP_GOMP_COMPAT */

/* Define a macro for exiting __kmp_dispatch_next(). If status is 0 (no more
   work), then tell OMPT the loop is over. In some cases kmp_dispatch_fini()
   is not called. */
#if OMPT_SUPPORT && OMPT_TRACE
#define OMPT_LOOP_END                                                          \
  if (status == 0) {                                                           \
    if (ompt_enabled && ompt_callbacks.ompt_callback(ompt_event_loop_end)) {   \
      ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL);              \
      ompt_task_info_t *task_info = __ompt_get_taskinfo(0);                    \
      ompt_callbacks.ompt_callback(ompt_event_loop_end)(                       \
          team_info->parallel_id, task_info->task_id);                         \
    }                                                                          \
  }
#else
#define OMPT_LOOP_END // no-op
#endif

Jonathan Peyton30419822017-05-12 18:01:32 +00001409template <typename T>
1410static int __kmp_dispatch_next(ident_t *loc, int gtid, kmp_int32 *p_last,
1411 T *p_lb, T *p_ub,
1412 typename traits_t<T>::signed_t *p_st) {
Jim Cownie5e8470a2013-09-27 10:38:44 +00001413
Jonathan Peyton30419822017-05-12 18:01:32 +00001414 typedef typename traits_t<T>::unsigned_t UT;
1415 typedef typename traits_t<T>::signed_t ST;
1416 typedef typename traits_t<T>::floating_t DBL;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001417
Jonathan Peyton30419822017-05-12 18:01:32 +00001418 // This is potentially slightly misleading, schedule(runtime) will appear here
1419 // even if the actual runtme schedule is static. (Which points out a
1420 // disadavantage of schedule(runtime): even when static scheduling is used it
1421 // costs more than a compile time choice to use static scheduling would.)
1422 KMP_TIME_PARTITIONED_BLOCK(FOR_dynamic_scheduling);
Jonathan Peyton45be4502015-08-11 21:36:41 +00001423
Jonathan Peyton30419822017-05-12 18:01:32 +00001424 int status;
1425 dispatch_private_info_template<T> *pr;
1426 kmp_info_t *th = __kmp_threads[gtid];
1427 kmp_team_t *team = th->th.th_team;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001428
Jonathan Peyton30419822017-05-12 18:01:32 +00001429 KMP_DEBUG_ASSERT(p_lb && p_ub && p_st); // AC: these cannot be NULL
1430#ifdef KMP_DEBUG
1431 {
1432 const char *buff;
1433 // create format specifiers before the debug output
1434 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d called p_lb:%%%s "
1435 "p_ub:%%%s p_st:%%%s p_last: %%p\n",
1436 traits_t<T>::spec, traits_t<T>::spec,
1437 traits_t<ST>::spec);
1438 KD_TRACE(1000, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last));
1439 __kmp_str_free(&buff);
1440 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001441#endif
Jim Cownie5e8470a2013-09-27 10:38:44 +00001442
Jonathan Peyton30419822017-05-12 18:01:32 +00001443 if (team->t.t_serialized) {
1444    /* NOTE: serialize this dispatch because we are not at the active level */
1445 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1446 th->th.th_dispatch->th_disp_buffer); /* top of the stack */
1447 KMP_DEBUG_ASSERT(pr);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001448
Jonathan Peyton30419822017-05-12 18:01:32 +00001449 if ((status = (pr->u.p.tc != 0)) == 0) {
1450 *p_lb = 0;
1451 *p_ub = 0;
1452 // if ( p_last != NULL )
1453 // *p_last = 0;
1454 if (p_st != NULL)
1455 *p_st = 0;
1456 if (__kmp_env_consistency_check) {
1457 if (pr->pushed_ws != ct_none) {
1458 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001459 }
Jonathan Peyton30419822017-05-12 18:01:32 +00001460 }
1461 } else if (pr->nomerge) {
1462 kmp_int32 last;
1463 T start;
1464 UT limit, trip, init;
1465 ST incr;
1466 T chunk = pr->u.p.parm1;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001467
Jonathan Peyton30419822017-05-12 18:01:32 +00001468 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n",
1469 gtid));
1470
1471 init = chunk * pr->u.p.count++;
1472 trip = pr->u.p.tc - 1;
1473
1474 if ((status = (init <= trip)) == 0) {
1475 *p_lb = 0;
1476 *p_ub = 0;
1477 // if ( p_last != NULL )
1478 // *p_last = 0;
1479 if (p_st != NULL)
1480 *p_st = 0;
1481 if (__kmp_env_consistency_check) {
1482 if (pr->pushed_ws != ct_none) {
1483 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
1484 }
1485 }
1486 } else {
1487 start = pr->u.p.lb;
1488 limit = chunk + init - 1;
1489 incr = pr->u.p.st;
1490
1491 if ((last = (limit >= trip)) != 0) {
1492 limit = trip;
1493#if KMP_OS_WINDOWS
1494 pr->u.p.last_upper = pr->u.p.ub;
1495#endif /* KMP_OS_WINDOWS */
1496 }
1497 if (p_last != NULL)
1498 *p_last = last;
1499 if (p_st != NULL)
1500 *p_st = incr;
1501 if (incr == 1) {
1502 *p_lb = start + init;
1503 *p_ub = start + limit;
1504 } else {
1505 *p_lb = start + init * incr;
1506 *p_ub = start + limit * incr;
1507 }
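        // Illustrative sketch (comment only, not executed): with normalized
        // lb=0, st=1, chunk=4 and pr->u.p.count previously 2, the code above
        // yields init = 4*2 = 8 and limit = 4+8-1 = 11, so this call hands
        // back iterations [8,11]; with a non-unit stride both bounds are
        // simply scaled by incr as done here.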
1508
1509 if (pr->ordered) {
1510 pr->u.p.ordered_lower = init;
1511 pr->u.p.ordered_upper = limit;
1512#ifdef KMP_DEBUG
1513 {
1514 const char *buff;
1515 // create format specifiers before the debug output
1516 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1517 "ordered_lower:%%%s ordered_upper:%%%s\n",
1518 traits_t<UT>::spec, traits_t<UT>::spec);
1519 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1520 pr->u.p.ordered_upper));
1521 __kmp_str_free(&buff);
1522 }
1523#endif
1524 } // if
1525 } // if
1526 } else {
1527 pr->u.p.tc = 0;
1528 *p_lb = pr->u.p.lb;
1529 *p_ub = pr->u.p.ub;
1530#if KMP_OS_WINDOWS
1531 pr->u.p.last_upper = *p_ub;
1532#endif /* KMP_OS_WINDOWS */
1533 if (p_last != NULL)
1534 *p_last = TRUE;
1535 if (p_st != NULL)
1536 *p_st = pr->u.p.st;
1537 } // if
1538#ifdef KMP_DEBUG
Jim Cownie5e8470a2013-09-27 10:38:44 +00001539 {
Jonathan Peyton30419822017-05-12 18:01:32 +00001540 const char *buff;
1541 // create format specifiers before the debug output
1542 buff = __kmp_str_format(
1543 "__kmp_dispatch_next: T#%%d serialized case: p_lb:%%%s "
1544 "p_ub:%%%s p_st:%%%s p_last:%%p %%d returning:%%d\n",
1545 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
1546 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, *p_st, p_last, *p_last, status));
1547 __kmp_str_free(&buff);
Jim Cownie5e8470a2013-09-27 10:38:44 +00001548 }
Jonathan Peyton30419822017-05-12 18:01:32 +00001549#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001550#if INCLUDE_SSC_MARKS
1551 SSC_MARK_DISPATCH_NEXT();
1552#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001553 OMPT_LOOP_END;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001554 return status;
Jonathan Peyton30419822017-05-12 18:01:32 +00001555 } else {
1556 kmp_int32 last = 0;
1557 dispatch_shared_info_template<UT> *sh;
1558 T start;
1559 ST incr;
1560 UT limit, trip, init;
Jim Cownie5e8470a2013-09-27 10:38:44 +00001561
Jonathan Peyton30419822017-05-12 18:01:32 +00001562 KMP_DEBUG_ASSERT(th->th.th_dispatch ==
1563 &th->th.th_team->t.t_dispatch[th->th.th_info.ds.ds_tid]);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001564
Jonathan Peyton30419822017-05-12 18:01:32 +00001565 pr = reinterpret_cast<dispatch_private_info_template<T> *>(
1566 th->th.th_dispatch->th_dispatch_pr_current);
1567 KMP_DEBUG_ASSERT(pr);
1568 sh = reinterpret_cast<dispatch_shared_info_template<UT> *>(
1569 th->th.th_dispatch->th_dispatch_sh_current);
1570 KMP_DEBUG_ASSERT(sh);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001571
Jonathan Peyton30419822017-05-12 18:01:32 +00001572 if (pr->u.p.tc == 0) {
1573 // zero trip count
1574 status = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001575 } else {
Jonathan Peyton30419822017-05-12 18:01:32 +00001576 switch (pr->schedule) {
1577#if (KMP_STATIC_STEAL_ENABLED)
1578 case kmp_sch_static_steal: {
1579 T chunk = pr->u.p.parm1;
1580 int nproc = th->th.th_team_nproc;
Jonathan Peyton45be4502015-08-11 21:36:41 +00001581
Jonathan Peyton30419822017-05-12 18:01:32 +00001582 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_static_steal case\n",
1583 gtid));
1584
1585 trip = pr->u.p.tc - 1;
1586
1587 if (traits_t<T>::type_size > 4) {
1588 // use lock for 8-byte and CAS for 4-byte induction
1589 // variable. TODO (optional): check and use 16-byte CAS
1590 kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
1591 KMP_DEBUG_ASSERT(lck != NULL);
1592 if (pr->u.p.count < (UT)pr->u.p.ub) {
1593 __kmp_acquire_lock(lck, gtid);
1594 // try to get own chunk of iterations
1595 init = (pr->u.p.count)++;
1596 status = (init < (UT)pr->u.p.ub);
1597 __kmp_release_lock(lck, gtid);
1598 } else {
1599 status = 0; // no own chunks
1600 }
1601 if (!status) { // try to steal
1602 kmp_info_t **other_threads = team->t.t_threads;
1603 int while_limit = nproc; // nproc attempts to find a victim
1604 int while_index = 0;
1605 // TODO: algorithm of searching for a victim
1606 // should be cleaned up and measured
1607 while ((!status) && (while_limit != ++while_index)) {
1608 T remaining;
1609 T victimIdx = pr->u.p.parm4;
1610 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1611 dispatch_private_info_template<T> *victim =
1612 reinterpret_cast<dispatch_private_info_template<T> *>(
1613 other_threads[victimIdx]
1614 ->th.th_dispatch->th_dispatch_pr_current);
1615 while ((victim == NULL || victim == pr ||
1616 (*(volatile T *)&victim->u.p.static_steal_counter !=
1617 *(volatile T *)&pr->u.p.static_steal_counter)) &&
1618 oldVictimIdx != victimIdx) {
1619 victimIdx = (victimIdx + 1) % nproc;
1620 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1621 other_threads[victimIdx]
1622 ->th.th_dispatch->th_dispatch_pr_current);
1623 };
1624 if (!victim ||
1625 (*(volatile T *)&victim->u.p.static_steal_counter !=
1626 *(volatile T *)&pr->u.p.static_steal_counter)) {
1627 continue; // try once more (nproc attempts in total)
1628 // no victim is ready yet to participate in stealing
1629 // because all victims are still in kmp_init_dispatch
1630 }
1631 if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
1632 pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
1633 continue; // not enough chunks to steal, goto next victim
1634 }
1635
1636 lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
1637 KMP_ASSERT(lck != NULL);
1638 __kmp_acquire_lock(lck, gtid);
1639 limit = victim->u.p.ub; // keep initial ub
1640 if (victim->u.p.count >= limit ||
1641 (remaining = limit - victim->u.p.count) < 2) {
1642 __kmp_release_lock(lck, gtid);
1643 pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim
1644 continue; // not enough chunks to steal
1645 }
1646              // stealing succeeded, reduce victim's ub by 1/4 of undone chunks
1647 // or by 1
1648 if (remaining > 3) {
1649 KMP_COUNT_VALUE(FOR_static_steal_stolen, remaining >> 2);
1650 init = (victim->u.p.ub -=
1651 (remaining >> 2)); // steal 1/4 of remaining
1652 } else {
1653 KMP_COUNT_VALUE(FOR_static_steal_stolen, 1);
1654 init =
1655 (victim->u.p.ub -= 1); // steal 1 chunk of 2 or 3 remaining
1656 }
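            // Illustrative sketch (comment only, not executed): if the victim
            // currently has count=4 and ub=20, then remaining = 16, so the
            // thief lowers victim->ub by 16>>2 = 4 to 16 and keeps chunk
            // indices 16..19 for itself (init=16 is used right away; count=17
            // and ub=20 are stored below), while the victim retains 4..15.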
1657 __kmp_release_lock(lck, gtid);
1658
1659 KMP_DEBUG_ASSERT(init + 1 <= limit);
1660 pr->u.p.parm4 = victimIdx; // remember victim to steal from
1661 status = 1;
1662 while_index = 0;
1663              // now update own count and ub with the stolen range, excluding the init chunk just taken
1664 __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
1665 pr->u.p.count = init + 1;
1666 pr->u.p.ub = limit;
1667 __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
1668 } // while (search for victim)
1669 } // if (try to find victim and steal)
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001670 } else {
Jonathan Peyton30419822017-05-12 18:01:32 +00001671 // 4-byte induction variable, use 8-byte CAS for pair (count, ub)
1672 typedef union {
1673 struct {
1674 UT count;
1675 T ub;
1676 } p;
1677 kmp_int64 b;
1678 } union_i4;
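          // A minimal sketch of why the pair is packed (comment only): the
          // owner advances p.count while a thief lowers p.ub, and both fields
          // are read and written through the single 64-bit image 'b'. The
          // KMP_COMPARE_AND_STORE_ACQ64 calls below therefore succeed only if
          // neither count nor ub changed since 'vold' was read, which keeps
          // the two fields mutually consistent without taking a lock.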
1679 // All operations on 'count' or 'ub' must be combined atomically
1680 // together.
1681 {
1682 union_i4 vold, vnew;
1683 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1684 vnew = vold;
1685 vnew.p.count++;
1686 while (!KMP_COMPARE_AND_STORE_ACQ64(
1687 (volatile kmp_int64 *)&pr->u.p.count,
1688 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1689 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1690 KMP_CPU_PAUSE();
1691 vold.b = *(volatile kmp_int64 *)(&pr->u.p.count);
1692 vnew = vold;
1693 vnew.p.count++;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001694 }
Jonathan Peyton30419822017-05-12 18:01:32 +00001695 vnew = vold;
1696 init = vnew.p.count;
1697 status = (init < (UT)vnew.p.ub);
1698 }
1699
1700 if (!status) {
1701 kmp_info_t **other_threads = team->t.t_threads;
1702 int while_limit = nproc; // nproc attempts to find a victim
1703 int while_index = 0;
1704
1705 // TODO: algorithm of searching for a victim
1706 // should be cleaned up and measured
1707 while ((!status) && (while_limit != ++while_index)) {
1708 union_i4 vold, vnew;
1709 kmp_int32 remaining;
1710 T victimIdx = pr->u.p.parm4;
1711 T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
1712 dispatch_private_info_template<T> *victim =
1713 reinterpret_cast<dispatch_private_info_template<T> *>(
1714 other_threads[victimIdx]
1715 ->th.th_dispatch->th_dispatch_pr_current);
1716 while ((victim == NULL || victim == pr ||
1717 (*(volatile T *)&victim->u.p.static_steal_counter !=
1718 *(volatile T *)&pr->u.p.static_steal_counter)) &&
1719 oldVictimIdx != victimIdx) {
1720 victimIdx = (victimIdx + 1) % nproc;
1721 victim = reinterpret_cast<dispatch_private_info_template<T> *>(
1722 other_threads[victimIdx]
1723 ->th.th_dispatch->th_dispatch_pr_current);
1724 };
1725 if (!victim ||
1726 (*(volatile T *)&victim->u.p.static_steal_counter !=
1727 *(volatile T *)&pr->u.p.static_steal_counter)) {
1728 continue; // try once more (nproc attempts in total)
1729 // no victim is ready yet to participate in stealing
1730 // because all victims are still in kmp_init_dispatch
1731 }
1732 pr->u.p.parm4 = victimIdx; // new victim found
1733 while (1) { // CAS loop if victim has enough chunks to steal
1734 vold.b = *(volatile kmp_int64 *)(&victim->u.p.count);
1735 vnew = vold;
1736
1737 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1738 if (vnew.p.count >= (UT)vnew.p.ub ||
1739 (remaining = vnew.p.ub - vnew.p.count) < 2) {
1740 pr->u.p.parm4 =
1741 (victimIdx + 1) % nproc; // shift start victim id
1742 break; // not enough chunks to steal, goto next victim
1743 }
1744 if (remaining > 3) {
1745 vnew.p.ub -= (remaining >> 2); // try to steal 1/4 remaining
1746 } else {
1747 vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining
1748 }
1749 KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip);
1750 // TODO: Should this be acquire or release?
1751 if (KMP_COMPARE_AND_STORE_ACQ64(
1752 (volatile kmp_int64 *)&victim->u.p.count,
1753 *VOLATILE_CAST(kmp_int64 *) & vold.b,
1754 *VOLATILE_CAST(kmp_int64 *) & vnew.b)) {
1755 // stealing succeeded
1756 KMP_COUNT_VALUE(FOR_static_steal_stolen,
1757 vold.p.ub - vnew.p.ub);
1758 status = 1;
1759 while_index = 0;
1760 // now update own count and ub
1761 init = vnew.p.ub;
1762 vold.p.count = init + 1;
1763#if KMP_ARCH_X86
1764 KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count),
1765 vold.b);
1766#else
1767 *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b;
1768#endif
1769 break;
1770 } // if (check CAS result)
1771 KMP_CPU_PAUSE(); // CAS failed, repeat attempt
1772 } // while (try to steal from particular victim)
1773 } // while (search for victim)
1774 } // if (try to find victim and steal)
1775 } // if (4-byte induction variable)
1776 if (!status) {
1777 *p_lb = 0;
1778 *p_ub = 0;
1779 if (p_st != NULL)
1780 *p_st = 0;
1781 } else {
1782 start = pr->u.p.parm2;
1783 init *= chunk;
1784 limit = chunk + init - 1;
1785 incr = pr->u.p.st;
1786 KMP_COUNT_VALUE(FOR_static_steal_chunks, 1);
1787
1788 KMP_DEBUG_ASSERT(init <= trip);
1789 if ((last = (limit >= trip)) != 0)
1790 limit = trip;
1791 if (p_st != NULL)
1792 *p_st = incr;
1793
1794 if (incr == 1) {
1795 *p_lb = start + init;
1796 *p_ub = start + limit;
1797 } else {
1798 *p_lb = start + init * incr;
1799 *p_ub = start + limit * incr;
1800 }
1801
1802 if (pr->ordered) {
1803 pr->u.p.ordered_lower = init;
1804 pr->u.p.ordered_upper = limit;
1805#ifdef KMP_DEBUG
1806 {
1807 const char *buff;
1808 // create format specifiers before the debug output
1809 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1810 "ordered_lower:%%%s ordered_upper:%%%s\n",
1811 traits_t<UT>::spec, traits_t<UT>::spec);
1812 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1813 pr->u.p.ordered_upper));
1814 __kmp_str_free(&buff);
1815 }
1816#endif
1817 } // if
1818 } // if
1819 break;
1820 } // case
1821#endif // ( KMP_STATIC_STEAL_ENABLED )
1822 case kmp_sch_static_balanced: {
1823 KD_TRACE(
1824 100,
1825 ("__kmp_dispatch_next: T#%d kmp_sch_static_balanced case\n", gtid));
1826 if ((status = !pr->u.p.count) !=
1827 0) { /* check if thread has any iteration to do */
1828 pr->u.p.count = 1;
1829 *p_lb = pr->u.p.lb;
1830 *p_ub = pr->u.p.ub;
1831 last = pr->u.p.parm1;
1832 if (p_st != NULL)
1833 *p_st = pr->u.p.st;
1834 } else { /* no iterations to do */
1835 pr->u.p.lb = pr->u.p.ub + pr->u.p.st;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001836 }
Jonathan Peyton30419822017-05-12 18:01:32 +00001837 if (pr->ordered) {
1838#ifdef KMP_DEBUG
1839 {
1840 const char *buff;
1841 // create format specifiers before the debug output
1842 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1843 "ordered_lower:%%%s ordered_upper:%%%s\n",
1844 traits_t<UT>::spec, traits_t<UT>::spec);
1845 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1846 pr->u.p.ordered_upper));
1847 __kmp_str_free(&buff);
1848 }
1849#endif
1850 } // if
1851 } // case
1852 break;
1853 case kmp_sch_static_greedy: /* original code for kmp_sch_static_greedy was
1854 merged here */
1855 case kmp_sch_static_chunked: {
1856 T parm1;
1857
1858 KD_TRACE(100, ("__kmp_dispatch_next: T#%d "
1859 "kmp_sch_static_[affinity|chunked] case\n",
1860 gtid));
1861 parm1 = pr->u.p.parm1;
1862
1863 trip = pr->u.p.tc - 1;
1864 init = parm1 * (pr->u.p.count + __kmp_tid_from_gtid(gtid));
1865
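      // Illustrative sketch (comment only, not executed): this is the classic
      // interleaved static-chunked pattern. With chunk parm1=10, nproc=4 and
      // tid=1, the first call computes init = 10*(0+1) = 10 (iterations
      // 10..19); count is advanced by nproc below, so the next call computes
      // init = 10*(4+1) = 50 (iterations 50..59), i.e. every nproc-th chunk
      // belongs to this thread.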
1866 if ((status = (init <= trip)) != 0) {
1867 start = pr->u.p.lb;
1868 incr = pr->u.p.st;
1869 limit = parm1 + init - 1;
1870
1871 if ((last = (limit >= trip)) != 0)
1872 limit = trip;
1873
1874 if (p_st != NULL)
1875 *p_st = incr;
1876
1877 pr->u.p.count += th->th.th_team_nproc;
1878
1879 if (incr == 1) {
1880 *p_lb = start + init;
1881 *p_ub = start + limit;
1882 } else {
1883 *p_lb = start + init * incr;
1884 *p_ub = start + limit * incr;
1885 }
1886
1887 if (pr->ordered) {
1888 pr->u.p.ordered_lower = init;
1889 pr->u.p.ordered_upper = limit;
1890#ifdef KMP_DEBUG
1891 {
1892 const char *buff;
1893 // create format specifiers before the debug output
1894 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1895 "ordered_lower:%%%s ordered_upper:%%%s\n",
1896 traits_t<UT>::spec, traits_t<UT>::spec);
1897 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1898 pr->u.p.ordered_upper));
1899 __kmp_str_free(&buff);
1900 }
1901#endif
1902 } // if
1903 } // if
1904 } // case
1905 break;
1906
1907 case kmp_sch_dynamic_chunked: {
1908 T chunk = pr->u.p.parm1;
1909
1910 KD_TRACE(
1911 100,
1912 ("__kmp_dispatch_next: T#%d kmp_sch_dynamic_chunked case\n", gtid));
1913
1914 init = chunk * test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
1915 trip = pr->u.p.tc - 1;
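      // Illustrative sketch (comment only, not executed): unlike the
      // serialized path earlier in this function, the chunk index here comes
      // from the team-shared atomic counter sh->u.s.iteration, so e.g. the
      // third successful fetch by any thread of the team gets index 2 and
      // hence iterations [2*chunk, 3*chunk-1] of the normalized iteration
      // space (clipped to trip for the final chunk).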
1916
1917 if ((status = (init <= trip)) == 0) {
1918 *p_lb = 0;
1919 *p_ub = 0;
1920 if (p_st != NULL)
1921 *p_st = 0;
1922 } else {
1923 start = pr->u.p.lb;
1924 limit = chunk + init - 1;
1925 incr = pr->u.p.st;
1926
1927 if ((last = (limit >= trip)) != 0)
1928 limit = trip;
1929
1930 if (p_st != NULL)
1931 *p_st = incr;
1932
1933 if (incr == 1) {
1934 *p_lb = start + init;
1935 *p_ub = start + limit;
1936 } else {
1937 *p_lb = start + init * incr;
1938 *p_ub = start + limit * incr;
1939 }
1940
1941 if (pr->ordered) {
1942 pr->u.p.ordered_lower = init;
1943 pr->u.p.ordered_upper = limit;
1944#ifdef KMP_DEBUG
1945 {
1946 const char *buff;
1947 // create format specifiers before the debug output
1948 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
1949 "ordered_lower:%%%s ordered_upper:%%%s\n",
1950 traits_t<UT>::spec, traits_t<UT>::spec);
1951 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
1952 pr->u.p.ordered_upper));
1953 __kmp_str_free(&buff);
1954 }
1955#endif
1956 } // if
1957 } // if
1958 } // case
1959 break;
1960
1961 case kmp_sch_guided_iterative_chunked: {
1962 T chunkspec = pr->u.p.parm1;
1963 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
1964 "iterative case\n",
1965 gtid));
1966 trip = pr->u.p.tc;
1967 // Start atomic part of calculations
1968 while (1) {
1969 ST remaining; // signed, because can be < 0
1970 init = sh->u.s.iteration; // shared value
1971 remaining = trip - init;
1972 if (remaining <= 0) { // AC: need to compare with 0 first
1973 // nothing to do, don't try atomic op
1974 status = 0;
1975 break;
1976 }
1977 if ((T)remaining <
1978 pr->u.p.parm2) { // compare with K*nproc*(chunk+1), K=2 by default
1979          // use dynamic-style schedule
1980          // atomically increment iterations, get old value
1981 init = test_then_add<ST>((ST *)&sh->u.s.iteration, (ST)chunkspec);
1982 remaining = trip - init;
1983 if (remaining <= 0) {
1984 status = 0; // all iterations got by other threads
1985 } else { // got some iterations to work on
1986 status = 1;
1987 if ((T)remaining > chunkspec) {
1988 limit = init + chunkspec - 1;
1989 } else {
1990 last = 1; // the last chunk
1991 limit = init + remaining - 1;
1992 } // if
1993 } // if
1994 break;
1995 } // if
1996 limit = init + (UT)(remaining *
1997 *(double *)&pr->u.p.parm3); // divide by K*nproc
1998 if (compare_and_swap<ST>((ST *)&sh->u.s.iteration, (ST)init,
1999 (ST)limit)) {
2000 // CAS was successful, chunk obtained
2001 status = 1;
2002 --limit;
2003 break;
2004 } // if
2005 } // while
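      // Illustrative sketch (comment only, not executed), assuming parm3 was
      // set up in __kmp_dispatch_init to hold roughly 1/(K*nproc) as a
      // type-punned double: with trip=1000, nproc=4 and K=2 the first CAS
      // above tries to claim about 1000*0.125 = 125 iterations, the next grab
      // about 1/8 of what is then left, and so on, until remaining drops
      // below parm2 and the loop falls back to plain dynamic chunks of size
      // chunkspec.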
2006 if (status != 0) {
2007 start = pr->u.p.lb;
2008 incr = pr->u.p.st;
2009 if (p_st != NULL)
2010 *p_st = incr;
2011 *p_lb = start + init * incr;
2012 *p_ub = start + limit * incr;
2013 if (pr->ordered) {
2014 pr->u.p.ordered_lower = init;
2015 pr->u.p.ordered_upper = limit;
2016#ifdef KMP_DEBUG
2017 {
2018 const char *buff;
2019 // create format specifiers before the debug output
2020 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2021 "ordered_lower:%%%s ordered_upper:%%%s\n",
2022 traits_t<UT>::spec, traits_t<UT>::spec);
2023 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2024 pr->u.p.ordered_upper));
2025 __kmp_str_free(&buff);
2026 }
2027#endif
2028 } // if
2029 } else {
2030 *p_lb = 0;
2031 *p_ub = 0;
2032 if (p_st != NULL)
2033 *p_st = 0;
2034 } // if
2035 } // case
2036 break;
2037
Andrey Churbanovd454c732017-06-05 17:17:33 +00002038 case kmp_sch_guided_simd: {
2039 // same as iterative but curr-chunk adjusted to be multiple of given
2040 // chunk
2041 T chunk = pr->u.p.parm1;
2042 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_simd case\n",
2043 gtid));
2044 trip = pr->u.p.tc;
2045 // Start atomic part of calculations
2046 while (1) {
2047 ST remaining; // signed, because can be < 0
2048 init = sh->u.s.iteration; // shared value
2049 remaining = trip - init;
2050 if (remaining <= 0) { // AC: need to compare with 0 first
2051 status = 0; // nothing to do, don't try atomic op
2052 break;
2053 }
2054 KMP_DEBUG_ASSERT(init % chunk == 0);
2055 // compare with K*nproc*(chunk+1), K=2 by default
2056 if ((T)remaining < pr->u.p.parm2) {
2057          // use dynamic-style schedule
2058          // atomically increment iterations, get old value
2059 init = test_then_add<ST>((ST *)&sh->u.s.iteration, (ST)chunk);
2060 remaining = trip - init;
2061 if (remaining <= 0) {
2062 status = 0; // all iterations got by other threads
2063 } else {
2064 // got some iterations to work on
2065 status = 1;
2066 if ((T)remaining > chunk) {
2067 limit = init + chunk - 1;
2068 } else {
2069 last = 1; // the last chunk
2070 limit = init + remaining - 1;
2071 } // if
2072 } // if
2073 break;
2074 } // if
2075 // divide by K*nproc
2076 UT span = remaining * (*(double *)&pr->u.p.parm3);
2077 UT rem = span % chunk;
2078 if (rem) // adjust so that span%chunk == 0
2079 span += chunk - rem;
2080 limit = init + span;
2081 if (compare_and_swap<ST>((ST *)&sh->u.s.iteration, (ST)init,
2082 (ST)limit)) {
2083 // CAS was successful, chunk obtained
2084 status = 1;
2085 --limit;
2086 break;
2087 } // if
2088 } // while
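      // Illustrative sketch (comment only, not executed), under the same
      // assumption about parm3 as in the iterative case above: if
      // remaining=1000 and the factor is 0.125, span starts at 125; with
      // chunk=8 the remainder 5 is rounded up by 3 to give span=128, so the
      // claimed range stays a whole multiple of the given chunk.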
2089 if (status != 0) {
2090 start = pr->u.p.lb;
2091 incr = pr->u.p.st;
2092 if (p_st != NULL)
2093 *p_st = incr;
2094 *p_lb = start + init * incr;
2095 *p_ub = start + limit * incr;
2096 if (pr->ordered) {
2097 pr->u.p.ordered_lower = init;
2098 pr->u.p.ordered_upper = limit;
2099#ifdef KMP_DEBUG
2100 {
2101 const char *buff;
2102 // create format specifiers before the debug output
2103 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2104 "ordered_lower:%%%s ordered_upper:%%%s\n",
2105 traits_t<UT>::spec, traits_t<UT>::spec);
2106 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2107 pr->u.p.ordered_upper));
2108 __kmp_str_free(&buff);
2109 }
2110#endif
2111 } // if
2112 } else {
2113 *p_lb = 0;
2114 *p_ub = 0;
2115 if (p_st != NULL)
2116 *p_st = 0;
2117 } // if
2118 } // case
2119 break;
2120
Jonathan Peyton30419822017-05-12 18:01:32 +00002121 case kmp_sch_guided_analytical_chunked: {
2122 T chunkspec = pr->u.p.parm1;
2123 UT chunkIdx;
2124#if KMP_OS_WINDOWS && KMP_ARCH_X86
2125 /* for storing original FPCW value for Windows* OS on
2126 IA-32 architecture 8-byte version */
2127 unsigned int oldFpcw;
2128 unsigned int fpcwSet = 0;
2129#endif
2130 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_chunked "
2131 "analytical case\n",
2132 gtid));
2133
2134 trip = pr->u.p.tc;
2135
2136 KMP_DEBUG_ASSERT(th->th.th_team_nproc > 1);
2137 KMP_DEBUG_ASSERT((2UL * chunkspec + 1) * (UT)th->th.th_team_nproc <
2138 trip);
2139
2140 while (1) { /* this while loop is a safeguard against unexpected zero
2141 chunk sizes */
2142 chunkIdx = test_then_inc_acq<ST>((volatile ST *)&sh->u.s.iteration);
2143 if (chunkIdx >= (UT)pr->u.p.parm2) {
2144 --trip;
2145 /* use dynamic-style scheduling */
2146 init = chunkIdx * chunkspec + pr->u.p.count;
2147 /* need to verify init > 0 in case of overflow in the above
2148 * calculation */
2149 if ((status = (init > 0 && init <= trip)) != 0) {
2150 limit = init + chunkspec - 1;
2151
2152 if ((last = (limit >= trip)) != 0)
2153 limit = trip;
2154 }
2155 break;
2156 } else {
2157/* use exponential-style scheduling */
2158/* The following check works around the lack of long double precision on
2159   Windows* OS, which can have the effect that
2160   init != 0 for chunkIdx == 0.
2161 */
2162#if KMP_OS_WINDOWS && KMP_ARCH_X86
2163 /* If we haven't already done so, save original FPCW and set
2164 precision to 64-bit, as Windows* OS on IA-32 architecture
2165 defaults to 53-bit */
2166 if (!fpcwSet) {
2167 oldFpcw = _control87(0, 0);
2168 _control87(_PC_64, _MCW_PC);
2169 fpcwSet = 0x30000;
2170 }
2171#endif
2172 if (chunkIdx) {
2173 init = __kmp_dispatch_guided_remaining<T>(
2174 trip, *(DBL *)&pr->u.p.parm3, chunkIdx);
2175 KMP_DEBUG_ASSERT(init);
2176 init = trip - init;
2177 } else
2178 init = 0;
2179 limit = trip - __kmp_dispatch_guided_remaining<T>(
2180 trip, *(DBL *)&pr->u.p.parm3, chunkIdx + 1);
2181 KMP_ASSERT(init <= limit);
2182 if (init < limit) {
2183 KMP_DEBUG_ASSERT(limit <= trip);
2184 --limit;
2185 status = 1;
2186 break;
2187 } // if
2188 } // if
2189 } // while (1)
2190#if KMP_OS_WINDOWS && KMP_ARCH_X86
2191 /* restore FPCW if necessary
2192 AC: check fpcwSet flag first because oldFpcw can be uninitialized
2193 here */
2194 if (fpcwSet && (oldFpcw & fpcwSet))
2195 _control87(oldFpcw, _MCW_PC);
2196#endif
2197 if (status != 0) {
2198 start = pr->u.p.lb;
2199 incr = pr->u.p.st;
2200 if (p_st != NULL)
2201 *p_st = incr;
2202 *p_lb = start + init * incr;
2203 *p_ub = start + limit * incr;
2204 if (pr->ordered) {
2205 pr->u.p.ordered_lower = init;
2206 pr->u.p.ordered_upper = limit;
2207#ifdef KMP_DEBUG
2208 {
2209 const char *buff;
2210 // create format specifiers before the debug output
2211 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2212 "ordered_lower:%%%s ordered_upper:%%%s\n",
2213 traits_t<UT>::spec, traits_t<UT>::spec);
2214 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2215 pr->u.p.ordered_upper));
2216 __kmp_str_free(&buff);
2217 }
2218#endif
2219 }
2220 } else {
2221 *p_lb = 0;
2222 *p_ub = 0;
2223 if (p_st != NULL)
2224 *p_st = 0;
2225 }
2226 } // case
2227 break;
2228
2229 case kmp_sch_trapezoidal: {
2230 UT index;
2231 T parm2 = pr->u.p.parm2;
2232 T parm3 = pr->u.p.parm3;
2233 T parm4 = pr->u.p.parm4;
2234 KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_trapezoidal case\n",
2235 gtid));
2236
2237 index = test_then_inc<ST>((volatile ST *)&sh->u.s.iteration);
2238
2239 init = (index * ((2 * parm2) - (index - 1) * parm4)) / 2;
2240 trip = pr->u.p.tc - 1;
2241
2242 if ((status = ((T)index < parm3 && init <= trip)) == 0) {
2243 *p_lb = 0;
2244 *p_ub = 0;
2245 if (p_st != NULL)
2246 *p_st = 0;
2247 } else {
2248 start = pr->u.p.lb;
2249 limit = ((index + 1) * (2 * parm2 - index * parm4)) / 2 - 1;
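        // Illustrative sketch (comment only, not executed): init and limit
        // are prefix sums of a decreasing arithmetic series of chunk sizes
        // parm2, parm2-parm4, parm2-2*parm4, ... (the status check above
        // requires index < parm3, so parm3 bounds the chunk count). For
        // example, with parm2=10 and parm4=2, index=2 gives
        // init = 2*(20-2)/2 = 18 and limit = 3*(20-4)/2 - 1 = 23, i.e. a
        // chunk of 10-2*2 = 6 iterations.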
2250 incr = pr->u.p.st;
2251
2252 if ((last = (limit >= trip)) != 0)
2253 limit = trip;
2254
2255 if (p_st != NULL)
2256 *p_st = incr;
2257
2258 if (incr == 1) {
2259 *p_lb = start + init;
2260 *p_ub = start + limit;
2261 } else {
2262 *p_lb = start + init * incr;
2263 *p_ub = start + limit * incr;
2264 }
2265
2266 if (pr->ordered) {
2267 pr->u.p.ordered_lower = init;
2268 pr->u.p.ordered_upper = limit;
2269#ifdef KMP_DEBUG
2270 {
2271 const char *buff;
2272 // create format specifiers before the debug output
2273 buff = __kmp_str_format("__kmp_dispatch_next: T#%%d "
2274 "ordered_lower:%%%s ordered_upper:%%%s\n",
2275 traits_t<UT>::spec, traits_t<UT>::spec);
2276 KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower,
2277 pr->u.p.ordered_upper));
2278 __kmp_str_free(&buff);
2279 }
2280#endif
2281 } // if
2282 } // if
2283 } // case
2284 break;
2285 default: {
2286 status = 0; // to avoid complaints on uninitialized variable use
2287 __kmp_msg(kmp_ms_fatal, // Severity
2288 KMP_MSG(UnknownSchedTypeDetected), // Primary message
2289 KMP_HNT(GetNewerLibrary), // Hint
2290 __kmp_msg_null // Variadic argument list terminator
2291 );
2292 } break;
2293 } // switch
2294 } // if tc == 0;
2295
2296 if (status == 0) {
2297 UT num_done;
2298
2299 num_done = test_then_inc<ST>((volatile ST *)&sh->u.s.num_done);
2300#ifdef KMP_DEBUG
2301 {
2302 const char *buff;
2303 // create format specifiers before the debug output
2304 buff = __kmp_str_format(
2305 "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n",
2306 traits_t<UT>::spec);
2307 KD_TRACE(100, (buff, gtid, sh->u.s.num_done));
2308 __kmp_str_free(&buff);
2309 }
2310#endif
2311
2312 if ((ST)num_done == th->th.th_team_nproc - 1) {
2313#if (KMP_STATIC_STEAL_ENABLED)
2314 if (pr->schedule == kmp_sch_static_steal &&
2315 traits_t<T>::type_size > 4) {
2316 int i;
2317 kmp_info_t **other_threads = team->t.t_threads;
2318 // loop complete, safe to destroy locks used for stealing
2319 for (i = 0; i < th->th.th_team_nproc; ++i) {
2320 kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
2321 KMP_ASSERT(lck != NULL);
2322 __kmp_destroy_lock(lck);
2323 __kmp_free(lck);
2324 other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
2325 }
2326 }
2327#endif
2328 /* NOTE: release this buffer to be reused */
2329
2330 KMP_MB(); /* Flush all pending memory write invalidates. */
2331
2332 sh->u.s.num_done = 0;
2333 sh->u.s.iteration = 0;
2334
2335 /* TODO replace with general release procedure? */
2336 if (pr->ordered) {
2337 sh->u.s.ordered_iteration = 0;
2338 }
2339
2340 KMP_MB(); /* Flush all pending memory write invalidates. */
2341
2342 sh->buffer_index += __kmp_dispatch_num_buffers;
2343 KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n",
2344 gtid, sh->buffer_index));
2345
2346 KMP_MB(); /* Flush all pending memory write invalidates. */
2347
2348 } // if
2349 if (__kmp_env_consistency_check) {
2350 if (pr->pushed_ws != ct_none) {
2351 pr->pushed_ws = __kmp_pop_workshare(gtid, pr->pushed_ws, loc);
2352 }
2353 }
2354
2355 th->th.th_dispatch->th_deo_fcn = NULL;
2356 th->th.th_dispatch->th_dxo_fcn = NULL;
2357 th->th.th_dispatch->th_dispatch_sh_current = NULL;
2358 th->th.th_dispatch->th_dispatch_pr_current = NULL;
2359 } // if (status == 0)
2360#if KMP_OS_WINDOWS
2361 else if (last) {
2362 pr->u.p.last_upper = pr->u.p.ub;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002363 }
Jonathan Peyton30419822017-05-12 18:01:32 +00002364#endif /* KMP_OS_WINDOWS */
2365 if (p_last != NULL && status != 0)
2366 *p_last = last;
2367 } // if
2368
2369#ifdef KMP_DEBUG
2370 {
2371 const char *buff;
2372 // create format specifiers before the debug output
2373 buff = __kmp_str_format(
2374 "__kmp_dispatch_next: T#%%d normal case: "
2375 "p_lb:%%%s p_ub:%%%s p_st:%%%s p_last:%%p returning:%%d\n",
2376 traits_t<T>::spec, traits_t<T>::spec, traits_t<ST>::spec);
2377 KD_TRACE(10, (buff, gtid, *p_lb, *p_ub, p_st ? *p_st : 0, p_last, status));
2378 __kmp_str_free(&buff);
2379 }
2380#endif
2381#if INCLUDE_SSC_MARKS
2382 SSC_MARK_DISPATCH_NEXT();
2383#endif
2384 OMPT_LOOP_END;
2385 return status;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002386}
2387
Jonathan Peyton30419822017-05-12 18:01:32 +00002388template <typename T>
2389static void __kmp_dist_get_bounds(ident_t *loc, kmp_int32 gtid,
2390 kmp_int32 *plastiter, T *plower, T *pupper,
2391 typename traits_t<T>::signed_t incr) {
2392 typedef typename traits_t<T>::unsigned_t UT;
2393 typedef typename traits_t<T>::signed_t ST;
2394 register kmp_uint32 team_id;
2395 register kmp_uint32 nteams;
2396 register UT trip_count;
2397 register kmp_team_t *team;
2398 kmp_info_t *th;
2399
2400 KMP_DEBUG_ASSERT(plastiter && plower && pupper);
2401 KE_TRACE(10, ("__kmpc_dist_get_bounds called (%d)\n", gtid));
2402#ifdef KMP_DEBUG
2403 {
2404 const char *buff;
2405 // create format specifiers before the debug output
2406 buff = __kmp_str_format("__kmpc_dist_get_bounds: T#%%d liter=%%d "
2407 "iter=(%%%s, %%%s, %%%s) signed?<%s>\n",
2408 traits_t<T>::spec, traits_t<T>::spec,
2409 traits_t<ST>::spec, traits_t<T>::spec);
2410 KD_TRACE(100, (buff, gtid, *plastiter, *plower, *pupper, incr));
2411 __kmp_str_free(&buff);
2412 }
2413#endif
2414
2415 if (__kmp_env_consistency_check) {
2416 if (incr == 0) {
2417 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrZeroProhibited, ct_pdo,
2418 loc);
2419 }
2420 if (incr > 0 ? (*pupper < *plower) : (*plower < *pupper)) {
2421 // The loop is illegal.
2422 // Some zero-trip loops maintained by compiler, e.g.:
2423 // for(i=10;i<0;++i) // lower >= upper - run-time check
2424 // for(i=0;i>10;--i) // lower <= upper - run-time check
2425 // for(i=0;i>10;++i) // incr > 0 - compile-time check
2426 // for(i=10;i<0;--i) // incr < 0 - compile-time check
2427 // Compiler does not check the following illegal loops:
2428 // for(i=0;i<10;i+=incr) // where incr<0
2429 // for(i=10;i>0;i-=incr) // where incr<0
2430 __kmp_error_construct(kmp_i18n_msg_CnsLoopIncrIllegal, ct_pdo, loc);
2431 }
2432 }
2433 th = __kmp_threads[gtid];
2434 team = th->th.th_team;
2435#if OMP_40_ENABLED
2436 KMP_DEBUG_ASSERT(th->th.th_teams_microtask); // we are in the teams construct
2437 nteams = th->th.th_teams_size.nteams;
2438#endif
2439 team_id = team->t.t_master_tid;
2440 KMP_DEBUG_ASSERT(nteams == team->t.t_parent->t.t_nproc);
2441
2442 // compute global trip count
2443 if (incr == 1) {
2444 trip_count = *pupper - *plower + 1;
2445 } else if (incr == -1) {
2446 trip_count = *plower - *pupper + 1;
2447 } else if (incr > 0) {
2448 // upper-lower can exceed the limit of signed type
2449 trip_count = (UT)(*pupper - *plower) / incr + 1;
2450 } else {
2451 trip_count = (UT)(*plower - *pupper) / (-incr) + 1;
2452 }
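  // Illustrative sketch (comment only, not executed): for a distribute loop
  // with lower=0, upper=99, incr=1 across nteams=4 this gives trip_count=100;
  // in the kmp_sch_static_balanced branch below chunk=25 and extras=0, so
  // e.g. team_id=2 ends up with [50,74] and only team_id=3 sets *plastiter.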
2453
2454 if (trip_count <= nteams) {
2455 KMP_DEBUG_ASSERT(
2456 __kmp_static == kmp_sch_static_greedy ||
2457 __kmp_static ==
2458 kmp_sch_static_balanced); // Unknown static scheduling type.
2459 // only some teams get single iteration, others get nothing
2460 if (team_id < trip_count) {
2461 *pupper = *plower = *plower + team_id * incr;
2462 } else {
2463 *plower = *pupper + incr; // zero-trip loop
2464 }
2465 if (plastiter != NULL)
2466 *plastiter = (team_id == trip_count - 1);
2467 } else {
2468 if (__kmp_static == kmp_sch_static_balanced) {
2469 register UT chunk = trip_count / nteams;
2470 register UT extras = trip_count % nteams;
2471 *plower +=
2472 incr * (team_id * chunk + (team_id < extras ? team_id : extras));
2473 *pupper = *plower + chunk * incr - (team_id < extras ? 0 : incr);
2474 if (plastiter != NULL)
2475 *plastiter = (team_id == nteams - 1);
2476 } else {
2477 register T chunk_inc_count =
2478 (trip_count / nteams + ((trip_count % nteams) ? 1 : 0)) * incr;
2479 register T upper = *pupper;
2480 KMP_DEBUG_ASSERT(__kmp_static == kmp_sch_static_greedy);
2481 // Unknown static scheduling type.
2482 *plower += team_id * chunk_inc_count;
2483 *pupper = *plower + chunk_inc_count - incr;
2484 // Check/correct bounds if needed
2485 if (incr > 0) {
2486 if (*pupper < *plower)
2487 *pupper = traits_t<T>::max_value;
2488 if (plastiter != NULL)
2489 *plastiter = *plower <= upper && *pupper > upper - incr;
2490 if (*pupper > upper)
2491 *pupper = upper; // tracker C73258
2492 } else {
2493 if (*pupper > *plower)
2494 *pupper = traits_t<T>::min_value;
2495 if (plastiter != NULL)
2496 *plastiter = *plower >= upper && *pupper < upper - incr;
2497 if (*pupper < upper)
2498 *pupper = upper; // tracker C73258
2499 }
2500 }
2501 }
2502}
2503
2504//-----------------------------------------------------------------------------
Jim Cownie5e8470a2013-09-27 10:38:44 +00002505// Dispatch routines
2506// Transfer call to template< type T >
2507// __kmp_dispatch_init( ident_t *loc, int gtid, enum sched_type schedule,
2508// T lb, T ub, ST st, ST chunk )
2509extern "C" {
2510
2511/*!
2512@ingroup WORK_SHARING
2513@{
2514@param loc Source location
2515@param gtid Global thread id
2516@param schedule Schedule type
2517@param lb Lower bound
2518@param ub Upper bound
2519@param st Step (or increment if you prefer)
2520@param chunk The chunk size to block with
2521
Jonathan Peyton30419822017-05-12 18:01:32 +00002522This function prepares the runtime to start a dynamically scheduled for loop,
2523saving the loop arguments.
Jim Cownie5e8470a2013-09-27 10:38:44 +00002524These functions are all identical apart from the types of the arguments.
2525*/
2526
Jonathan Peyton30419822017-05-12 18:01:32 +00002527void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2528 enum sched_type schedule, kmp_int32 lb,
2529 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk) {
2530 KMP_DEBUG_ASSERT(__kmp_init_serial);
2531 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002532}
2533/*!
2534See @ref __kmpc_dispatch_init_4
2535*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002536void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2537 enum sched_type schedule, kmp_uint32 lb,
2538 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk) {
2539 KMP_DEBUG_ASSERT(__kmp_init_serial);
2540 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002541}
2542
2543/*!
2544See @ref __kmpc_dispatch_init_4
2545*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002546void __kmpc_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2547 enum sched_type schedule, kmp_int64 lb,
2548 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk) {
2549 KMP_DEBUG_ASSERT(__kmp_init_serial);
2550 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002551}
2552
2553/*!
2554See @ref __kmpc_dispatch_init_4
2555*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002556void __kmpc_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2557 enum sched_type schedule, kmp_uint64 lb,
2558 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk) {
2559 KMP_DEBUG_ASSERT(__kmp_init_serial);
2560 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002561}
2562
2563/*!
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002564See @ref __kmpc_dispatch_init_4
2565
2566The difference from the __kmpc_dispatch_init set of functions is that these are
2567called for the composite distribute parallel for construct. Thus, before
2568dispatching the regular iterations we need to compute the per-team iteration space.
2569
2570These functions are all identical apart from the types of the arguments.
2571*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002572void __kmpc_dist_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2573 enum sched_type schedule, kmp_int32 *p_last,
2574 kmp_int32 lb, kmp_int32 ub, kmp_int32 st,
2575 kmp_int32 chunk) {
2576 KMP_DEBUG_ASSERT(__kmp_init_serial);
2577 __kmp_dist_get_bounds<kmp_int32>(loc, gtid, p_last, &lb, &ub, st);
2578 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002579}
2580
Jonathan Peyton30419822017-05-12 18:01:32 +00002581void __kmpc_dist_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2582 enum sched_type schedule, kmp_int32 *p_last,
2583 kmp_uint32 lb, kmp_uint32 ub, kmp_int32 st,
2584 kmp_int32 chunk) {
2585 KMP_DEBUG_ASSERT(__kmp_init_serial);
2586 __kmp_dist_get_bounds<kmp_uint32>(loc, gtid, p_last, &lb, &ub, st);
2587 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002588}
2589
Jonathan Peyton30419822017-05-12 18:01:32 +00002590void __kmpc_dist_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2591 enum sched_type schedule, kmp_int32 *p_last,
2592 kmp_int64 lb, kmp_int64 ub, kmp_int64 st,
2593 kmp_int64 chunk) {
2594 KMP_DEBUG_ASSERT(__kmp_init_serial);
2595 __kmp_dist_get_bounds<kmp_int64>(loc, gtid, p_last, &lb, &ub, st);
2596 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002597}
2598
Jonathan Peyton30419822017-05-12 18:01:32 +00002599void __kmpc_dist_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2600 enum sched_type schedule, kmp_int32 *p_last,
2601 kmp_uint64 lb, kmp_uint64 ub, kmp_int64 st,
2602 kmp_int64 chunk) {
2603 KMP_DEBUG_ASSERT(__kmp_init_serial);
2604 __kmp_dist_get_bounds<kmp_uint64>(loc, gtid, p_last, &lb, &ub, st);
2605 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk, true);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00002606}
2607
2608/*!
Jim Cownie5e8470a2013-09-27 10:38:44 +00002609@param loc Source code location
2610@param gtid Global thread id
Jonathan Peyton30419822017-05-12 18:01:32 +00002611@param p_last Pointer to a flag set to one if this is the last chunk or zero
2612otherwise
Jim Cownie5e8470a2013-09-27 10:38:44 +00002613@param p_lb Pointer to the lower bound for the next chunk of work
2614@param p_ub Pointer to the upper bound for the next chunk of work
2615@param p_st Pointer to the stride for the next chunk of work
2616@return one if there is work to be done, zero otherwise
2617
2618Get the next dynamically allocated chunk of work for this thread.
2619If there is no more work, then the lb,ub and stride need not be modified.
2620*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002621int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2622 kmp_int32 *p_lb, kmp_int32 *p_ub, kmp_int32 *p_st) {
2623 return __kmp_dispatch_next<kmp_int32>(loc, gtid, p_last, p_lb, p_ub, p_st);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002624}
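/* A rough, hypothetical sketch (comment only, not part of this file's API
   surface) of how a compiler might drive the 32-bit entry points above for
   "#pragma omp for schedule(dynamic, 4)" over i = 0..99. The variable names
   are illustrative; loc and gtid are the usual source-location struct and
   global thread id already available in the outlined function:

     kmp_int32 lb = 0, ub = 99, st = 1, last = 0;
     __kmpc_dispatch_init_4(loc, gtid, kmp_sch_dynamic_chunked, lb, ub, st, 4);
     while (__kmpc_dispatch_next_4(loc, gtid, &last, &lb, &ub, &st)) {
       for (kmp_int32 i = lb; i <= ub; i += st) {
         // loop body
       }
     }
*/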
2625
2626/*!
2627See @ref __kmpc_dispatch_next_4
2628*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002629int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2630 kmp_uint32 *p_lb, kmp_uint32 *p_ub,
2631 kmp_int32 *p_st) {
2632 return __kmp_dispatch_next<kmp_uint32>(loc, gtid, p_last, p_lb, p_ub, p_st);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002633}
2634
2635/*!
2636See @ref __kmpc_dispatch_next_4
2637*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002638int __kmpc_dispatch_next_8(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2639 kmp_int64 *p_lb, kmp_int64 *p_ub, kmp_int64 *p_st) {
2640 return __kmp_dispatch_next<kmp_int64>(loc, gtid, p_last, p_lb, p_ub, p_st);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002641}
2642
2643/*!
2644See @ref __kmpc_dispatch_next_4
2645*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002646int __kmpc_dispatch_next_8u(ident_t *loc, kmp_int32 gtid, kmp_int32 *p_last,
2647 kmp_uint64 *p_lb, kmp_uint64 *p_ub,
2648 kmp_int64 *p_st) {
2649 return __kmp_dispatch_next<kmp_uint64>(loc, gtid, p_last, p_lb, p_ub, p_st);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002650}
2651
2652/*!
2653@param loc Source code location
2654@param gtid Global thread id
2655
2656Mark the end of a dynamic loop.
2657*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002658void __kmpc_dispatch_fini_4(ident_t *loc, kmp_int32 gtid) {
2659 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002660}
2661
2662/*!
2663See @ref __kmpc_dispatch_fini_4
2664*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002665void __kmpc_dispatch_fini_8(ident_t *loc, kmp_int32 gtid) {
2666 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002667}
2668
2669/*!
2670See @ref __kmpc_dispatch_fini_4
2671*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002672void __kmpc_dispatch_fini_4u(ident_t *loc, kmp_int32 gtid) {
2673 __kmp_dispatch_finish<kmp_uint32>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002674}
2675
2676/*!
2677See @ref __kmpc_dispatch_fini_4
2678*/
Jonathan Peyton30419822017-05-12 18:01:32 +00002679void __kmpc_dispatch_fini_8u(ident_t *loc, kmp_int32 gtid) {
2680 __kmp_dispatch_finish<kmp_uint64>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002681}
2682/*! @} */
2683
Jonathan Peyton30419822017-05-12 18:01:32 +00002684//-----------------------------------------------------------------------------
2685// Non-template routines from kmp_dispatch.cpp used in other sources
Jim Cownie5e8470a2013-09-27 10:38:44 +00002686
Jonathan Peyton30419822017-05-12 18:01:32 +00002687kmp_uint32 __kmp_eq_4(kmp_uint32 value, kmp_uint32 checker) {
2688 return value == checker;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002689}
2690
Jonathan Peyton30419822017-05-12 18:01:32 +00002691kmp_uint32 __kmp_neq_4(kmp_uint32 value, kmp_uint32 checker) {
2692 return value != checker;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002693}
2694
Jonathan Peyton30419822017-05-12 18:01:32 +00002695kmp_uint32 __kmp_lt_4(kmp_uint32 value, kmp_uint32 checker) {
2696 return value < checker;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002697}
2698
Jonathan Peyton30419822017-05-12 18:01:32 +00002699kmp_uint32 __kmp_ge_4(kmp_uint32 value, kmp_uint32 checker) {
2700 return value >= checker;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002701}
2702
Jonathan Peyton30419822017-05-12 18:01:32 +00002703kmp_uint32 __kmp_le_4(kmp_uint32 value, kmp_uint32 checker) {
2704 return value <= checker;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002705}
Jim Cownie5e8470a2013-09-27 10:38:44 +00002706
2707kmp_uint32
Jonathan Peyton30419822017-05-12 18:01:32 +00002708__kmp_wait_yield_4(volatile kmp_uint32 *spinner, kmp_uint32 checker,
2709 kmp_uint32 (*pred)(kmp_uint32, kmp_uint32),
2710 void *obj // Higher-level synchronization object, or NULL.
2711 ) {
2712 // note: we may not belong to a team at this point
2713 register volatile kmp_uint32 *spin = spinner;
2714 register kmp_uint32 check = checker;
2715 register kmp_uint32 spins;
2716 register kmp_uint32 (*f)(kmp_uint32, kmp_uint32) = pred;
2717 register kmp_uint32 r;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002718
Jonathan Peyton30419822017-05-12 18:01:32 +00002719 KMP_FSYNC_SPIN_INIT(obj, (void *)spin);
2720 KMP_INIT_YIELD(spins);
2721 // main wait spin loop
2722 while (!f(r = TCR_4(*spin), check)) {
2723 KMP_FSYNC_SPIN_PREPARE(obj);
2724 /* GEH - remove this since it was accidentally introduced when kmp_wait was
2725 split. It causes problems with infinite recursion because of exit lock */
2726 /* if ( TCR_4(__kmp_global.g.g_done) && __kmp_global.g.g_abort)
2727 __kmp_abort_thread(); */
Jim Cownie5e8470a2013-09-27 10:38:44 +00002728
Jonathan Peyton30419822017-05-12 18:01:32 +00002729 /* if we have waited a bit, or are oversubscribed, yield */
2730 /* pause is in the following code */
2731 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2732 KMP_YIELD_SPIN(spins);
2733 }
2734 KMP_FSYNC_SPIN_ACQUIRED(obj);
2735 return r;
Jim Cownie5e8470a2013-09-27 10:38:44 +00002736}
2737
Jonathan Peyton30419822017-05-12 18:01:32 +00002738void __kmp_wait_yield_4_ptr(
2739 void *spinner, kmp_uint32 checker, kmp_uint32 (*pred)(void *, kmp_uint32),
2740 void *obj // Higher-level synchronization object, or NULL.
2741 ) {
2742 // note: we may not belong to a team at this point
2743 register void *spin = spinner;
2744 register kmp_uint32 check = checker;
2745 register kmp_uint32 spins;
2746 register kmp_uint32 (*f)(void *, kmp_uint32) = pred;
Paul Osmialowskif7cc6af2016-05-31 20:20:32 +00002747
Jonathan Peyton30419822017-05-12 18:01:32 +00002748 KMP_FSYNC_SPIN_INIT(obj, spin);
2749 KMP_INIT_YIELD(spins);
2750 // main wait spin loop
2751 while (!f(spin, check)) {
2752 KMP_FSYNC_SPIN_PREPARE(obj);
2753 /* if we have waited a bit, or are oversubscribed, yield */
2754 /* pause is in the following code */
2755 KMP_YIELD(TCR_4(__kmp_nth) > __kmp_avail_proc);
2756 KMP_YIELD_SPIN(spins);
2757 }
2758 KMP_FSYNC_SPIN_ACQUIRED(obj);
Paul Osmialowskif7cc6af2016-05-31 20:20:32 +00002759}
2760
Jim Cownie5e8470a2013-09-27 10:38:44 +00002761} // extern "C"
2762
2763#ifdef KMP_GOMP_COMPAT
2764
Jonathan Peyton30419822017-05-12 18:01:32 +00002765void __kmp_aux_dispatch_init_4(ident_t *loc, kmp_int32 gtid,
2766 enum sched_type schedule, kmp_int32 lb,
2767 kmp_int32 ub, kmp_int32 st, kmp_int32 chunk,
2768 int push_ws) {
2769 __kmp_dispatch_init<kmp_int32>(loc, gtid, schedule, lb, ub, st, chunk,
2770 push_ws);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002771}
2772
Jonathan Peyton30419822017-05-12 18:01:32 +00002773void __kmp_aux_dispatch_init_4u(ident_t *loc, kmp_int32 gtid,
2774 enum sched_type schedule, kmp_uint32 lb,
2775 kmp_uint32 ub, kmp_int32 st, kmp_int32 chunk,
2776 int push_ws) {
2777 __kmp_dispatch_init<kmp_uint32>(loc, gtid, schedule, lb, ub, st, chunk,
2778 push_ws);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002779}
2780
Jonathan Peyton30419822017-05-12 18:01:32 +00002781void __kmp_aux_dispatch_init_8(ident_t *loc, kmp_int32 gtid,
2782 enum sched_type schedule, kmp_int64 lb,
2783 kmp_int64 ub, kmp_int64 st, kmp_int64 chunk,
2784 int push_ws) {
2785 __kmp_dispatch_init<kmp_int64>(loc, gtid, schedule, lb, ub, st, chunk,
2786 push_ws);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002787}
2788
Jonathan Peyton30419822017-05-12 18:01:32 +00002789void __kmp_aux_dispatch_init_8u(ident_t *loc, kmp_int32 gtid,
2790 enum sched_type schedule, kmp_uint64 lb,
2791 kmp_uint64 ub, kmp_int64 st, kmp_int64 chunk,
2792 int push_ws) {
2793 __kmp_dispatch_init<kmp_uint64>(loc, gtid, schedule, lb, ub, st, chunk,
2794 push_ws);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002795}
2796
Jonathan Peyton30419822017-05-12 18:01:32 +00002797void __kmp_aux_dispatch_fini_chunk_4(ident_t *loc, kmp_int32 gtid) {
2798 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002799}
2800
Jonathan Peyton30419822017-05-12 18:01:32 +00002801void __kmp_aux_dispatch_fini_chunk_8(ident_t *loc, kmp_int32 gtid) {
2802 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002803}
2804
Jonathan Peyton30419822017-05-12 18:01:32 +00002805void __kmp_aux_dispatch_fini_chunk_4u(ident_t *loc, kmp_int32 gtid) {
2806 __kmp_dispatch_finish_chunk<kmp_uint32>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002807}
2808
Jonathan Peyton30419822017-05-12 18:01:32 +00002809void __kmp_aux_dispatch_fini_chunk_8u(ident_t *loc, kmp_int32 gtid) {
2810 __kmp_dispatch_finish_chunk<kmp_uint64>(gtid, loc);
Jim Cownie5e8470a2013-09-27 10:38:44 +00002811}
2812
2813#endif /* KMP_GOMP_COMPAT */
2814
2815/* ------------------------------------------------------------------------ */