1 /*
2 * kmp_barrier.cpp
3 */
4
5
6//===----------------------------------------------------------------------===//
7//
8// The LLVM Compiler Infrastructure
9//
10// This file is dual licensed under the MIT and the University of Illinois Open
11// Source Licenses. See LICENSE.txt for details.
12//
13//===----------------------------------------------------------------------===//
14
15
16#include "kmp.h"
17#include "kmp_wait_release.h"
18#include "kmp_stats.h"
19#include "kmp_itt.h"
20#include "kmp_os.h"
21
22
23#if KMP_MIC
24#include <immintrin.h>
25#define USE_NGO_STORES 1
26#endif // KMP_MIC
27
28#if KMP_MIC && USE_NGO_STORES
29// ICV copying
30#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
31#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
32#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
33#define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
34#else
35#define ngo_load(src) ((void)0)
36#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
37#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
38#define ngo_sync() ((void)0)
39#endif /* KMP_MIC && USE_NGO_STORES */
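// Note on the ngo_* helpers above (roughly): on KMP_MIC the ICV block plus the go flag fit in
// one 64-byte cache line, so they can be copied with a single 512-bit non-globally-ordered
// (streaming) store; ngo_sync() then uses the "lock; addl $0,(%rsp)" idiom as a cheap full
// memory fence so those weakly-ordered stores become globally visible before the caller
// proceeds. On other targets the macros fall back to plain copy_icvs()/KMP_MEMCPY().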
40
41void __kmp_print_structure(void); // Forward declaration
42
43// ---------------------------- Barrier Algorithms ----------------------------
44
45// Linear Barrier
46static void
47__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
48 void (*reduce)(void *, void *)
49 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
50{
51 KMP_TIME_DEVELOPER_BLOCK(KMP_linear_gather);
52 register kmp_team_t *team = this_thr->th.th_team;
53 register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
54 register kmp_info_t **other_threads = team->t.t_threads;
55
56 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
57 gtid, team->t.t_id, tid, bt));
58 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
59
60#if USE_ITT_BUILD && USE_ITT_NOTIFY
61 // Barrier imbalance - save arrive time to the thread
62 if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
63 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
64 }
65#endif
66 // We now perform a linear reduction to signal that all of the threads have arrived.
67 if (!KMP_MASTER_TID(tid)) {
68 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
69 "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
70 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
71 thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
72 // Mark arrival to master thread
73 /* After performing this write, a worker thread may not assume that the team is valid
74 any more - it could be deallocated by the master thread at any time. */
75 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
76 flag.release();
77 } else {
78 register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
79 register int nproc = this_thr->th.th_team_nproc;
80 register int i;
81 // No sleep-bit handling or atomics needed here: this is the team-level counter, touched only by the master
82 register kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
83
84 // Collect all the worker team member threads.
85 for (i=1; i<nproc; ++i) {
86#if KMP_CACHE_MANAGE
87 // Prefetch next thread's arrived count
88 if (i+1 < nproc)
89 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
90#endif /* KMP_CACHE_MANAGE */
91 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
92 "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
93 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
94 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
95
96 // Wait for worker thread to arrive
97 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
98 flag.wait(this_thr, FALSE
99 USE_ITT_BUILD_ARG(itt_sync_obj) );
100#if USE_ITT_BUILD && USE_ITT_NOTIFY
101 // Barrier imbalance - write min of the thread time and the other thread time to the thread.
102 if (__kmp_forkjoin_frames_mode == 2) {
103 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
104 other_threads[i]->th.th_bar_min_time);
105 }
106#endif
107 if (reduce) {
108 KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
109 team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
110 (*reduce)(this_thr->th.th_local.reduce_data,
111 other_threads[i]->th.th_local.reduce_data);
112 }
113 }
114 // No sleep-bit handling or atomics needed here: this is the team-level counter, written only by the master
115 team_bar->b_arrived = new_state;
116 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
117 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
118 }
119 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
120 gtid, team->t.t_id, tid, bt));
121}
122
123static void
124__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
125 int propagate_icvs
126 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
127{
128 KMP_TIME_DEVELOPER_BLOCK(KMP_linear_release);
129 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
130 register kmp_team_t *team;
131
132 if (KMP_MASTER_TID(tid)) {
133 register unsigned int i;
134 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
135 register kmp_info_t **other_threads;
136
137 team = __kmp_threads[gtid]->th.th_team;
138 KMP_DEBUG_ASSERT(team != NULL);
139 other_threads = team->t.t_threads;
140
141 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
142 gtid, team->t.t_id, tid, bt));
143
144 if (nproc > 1) {
145#if KMP_BARRIER_ICV_PUSH
146 {
147 KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
148 if (propagate_icvs) {
149 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
150 for (i=1; i<nproc; ++i) {
151 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
152 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
153 &team->t.t_implicit_task_taskdata[0].td_icvs);
154 }
155 ngo_sync();
156 }
157 }
158#endif // KMP_BARRIER_ICV_PUSH
159
160 // Now, release all of the worker threads
161 for (i=1; i<nproc; ++i) {
162#if KMP_CACHE_MANAGE
163 // Prefetch next thread's go flag
164 if (i+1 < nproc)
165 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
166#endif /* KMP_CACHE_MANAGE */
167 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
168 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
169 other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
170 &other_threads[i]->th.th_bar[bt].bb.b_go,
171 other_threads[i]->th.th_bar[bt].bb.b_go,
172 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
173 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
174 flag.release();
175 }
176 }
177 } else { // Wait for the MASTER thread to release us
178 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
179 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
180 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
181 flag.wait(this_thr, TRUE
182 USE_ITT_BUILD_ARG(itt_sync_obj) );
183#if USE_ITT_BUILD && USE_ITT_NOTIFY
184 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
185 // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
186 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
187 // Cancel wait on previous parallel region...
188 __kmp_itt_task_starting(itt_sync_obj);
189
190 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
191 return;
192
193 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
194 if (itt_sync_obj != NULL)
195 // Call prepare as early as possible for "new" barrier
196 __kmp_itt_task_finished(itt_sync_obj);
197 } else
198#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
199 // Early exit for reaping threads releasing forkjoin barrier
200 if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
201 return;
202 // The worker thread may now assume that the team is valid.
203#ifdef KMP_DEBUG
204 tid = __kmp_tid_from_gtid(gtid);
205 team = __kmp_threads[gtid]->th.th_team;
206#endif
207 KMP_DEBUG_ASSERT(team != NULL);
208 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
209 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
210 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
211 KMP_MB(); // Flush all pending memory write invalidates.
212 }
213 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
214 gtid, team->t.t_id, tid, bt));
215}
216
217// Tree barrier
218static void
219__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
220 void (*reduce)(void *, void *)
221 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
222{
223 KMP_TIME_DEVELOPER_BLOCK(KMP_tree_gather);
224 register kmp_team_t *team = this_thr->th.th_team;
225 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
226 register kmp_info_t **other_threads = team->t.t_threads;
227 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
228 register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
229 register kmp_uint32 branch_factor = 1 << branch_bits;
230 register kmp_uint32 child;
231 register kmp_uint32 child_tid;
232 register kmp_uint64 new_state;
233
234 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
235 gtid, team->t.t_id, tid, bt));
236 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
237
238#if USE_ITT_BUILD && USE_ITT_NOTIFY
239 // Barrier imbalance - save arrive time to the thread
240 if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
241 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
242 }
243#endif
244 // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
245 child_tid = (tid << branch_bits) + 1;
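    // With branch_bits b, thread tid's children in the tree are tids (tid<<b)+1 .. (tid<<b)+branch_factor,
    // clipped to nproc. E.g. for branch_bits==2 (branch_factor 4), tid 0 gathers from tids 1-4, tid 1 from tids 5-8.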
246 if (child_tid < nproc) {
247 // Parent threads wait for all their children to arrive
248 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
249 child = 1;
250 do {
251 register kmp_info_t *child_thr = other_threads[child_tid];
252 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
253#if KMP_CACHE_MANAGE
254 // Prefetch next thread's arrived count
255 if (child+1 <= branch_factor && child_tid+1 < nproc)
256 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
257#endif /* KMP_CACHE_MANAGE */
258 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
259 "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
260 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
261 &child_bar->b_arrived, new_state));
262 // Wait for child to arrive
263 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
264 flag.wait(this_thr, FALSE
265 USE_ITT_BUILD_ARG(itt_sync_obj) );
266#if USE_ITT_BUILD && USE_ITT_NOTIFY
267 // Barrier imbalance - write min of the thread time and a child time to the thread.
268 if (__kmp_forkjoin_frames_mode == 2) {
269 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
270 child_thr->th.th_bar_min_time);
271 }
272#endif
273 if (reduce) {
274 KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
275 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
276 team->t.t_id, child_tid));
277 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
278 }
279 child++;
280 child_tid++;
281 }
282 while (child <= branch_factor && child_tid < nproc);
283 }
284
285 if (!KMP_MASTER_TID(tid)) { // Worker threads
286 register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
287
288 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
289 "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
290 __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
291 &thr_bar->b_arrived, thr_bar->b_arrived,
292 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
293
294 // Mark arrival to parent thread
295 /* After performing this write, a worker thread may not assume that the team is valid
296 any more - it could be deallocated by the master thread at any time. */
297 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
298 flag.release();
299 } else {
300 // Need to update the team arrived pointer if we are the master thread
301 if (nproc > 1) // New value was already computed above
302 team->t.t_bar[bt].b_arrived = new_state;
303 else
304 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
305 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
306 gtid, team->t.t_id, tid, team->t.t_id,
307 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
308 }
309 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
310 gtid, team->t.t_id, tid, bt));
311}
312
313static void
314__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
315 int propagate_icvs
316 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
317{
318 KMP_TIME_DEVELOPER_BLOCK(KMP_tree_release);
319 register kmp_team_t *team;
320 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
321 register kmp_uint32 nproc;
322 register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
323 register kmp_uint32 branch_factor = 1 << branch_bits;
324 register kmp_uint32 child;
325 register kmp_uint32 child_tid;
326
327 // Perform a tree release for all of the threads that have been gathered
328 if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
329 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
330 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
331 // Wait for parent thread to release us
332 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
333 flag.wait(this_thr, TRUE
334 USE_ITT_BUILD_ARG(itt_sync_obj) );
335#if USE_ITT_BUILD && USE_ITT_NOTIFY
336 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
337 // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
338 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
339 // Cancel wait on previous parallel region...
340 __kmp_itt_task_starting(itt_sync_obj);
341
342 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
343 return;
344
345 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
346 if (itt_sync_obj != NULL)
347 // Call prepare as early as possible for "new" barrier
348 __kmp_itt_task_finished(itt_sync_obj);
349 } else
350#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
351 // Early exit for reaping threads releasing forkjoin barrier
352 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
353 return;
354
355 // The worker thread may now assume that the team is valid.
356 team = __kmp_threads[gtid]->th.th_team;
357 KMP_DEBUG_ASSERT(team != NULL);
358 tid = __kmp_tid_from_gtid(gtid);
359
360 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
361 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
362 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
363 KMP_MB(); // Flush all pending memory write invalidates.
364 } else {
365 team = __kmp_threads[gtid]->th.th_team;
366 KMP_DEBUG_ASSERT(team != NULL);
367 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
368 gtid, team->t.t_id, tid, bt));
369 }
370 nproc = this_thr->th.th_team_nproc;
371 child_tid = (tid << branch_bits) + 1;
372
373 if (child_tid < nproc) {
374 register kmp_info_t **other_threads = team->t.t_threads;
375 child = 1;
376 // Parent threads release all their children
377 do {
378 register kmp_info_t *child_thr = other_threads[child_tid];
379 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
380#if KMP_CACHE_MANAGE
381 // Prefetch next thread's go count
382 if (child+1 <= branch_factor && child_tid+1 < nproc)
383 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
384#endif /* KMP_CACHE_MANAGE */
385
386#if KMP_BARRIER_ICV_PUSH
387 {
388 KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
389 if (propagate_icvs) {
390 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
391 team, child_tid, FALSE);
392 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
393 &team->t.t_implicit_task_taskdata[0].td_icvs);
394 }
395 }
396#endif // KMP_BARRIER_ICV_PUSH
397 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
398 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
399 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
400 child_tid, &child_bar->b_go, child_bar->b_go,
401 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
402 // Release child from barrier
403 kmp_flag_64 flag(&child_bar->b_go, child_thr);
404 flag.release();
405 child++;
406 child_tid++;
407 }
408 while (child <= branch_factor && child_tid < nproc);
409 }
410 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
411 gtid, team->t.t_id, tid, bt));
412}
413
414
415// Hyper Barrier
416static void
417__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
418 void (*reduce)(void *, void *)
419 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
420{
421 KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_gather);
422 register kmp_team_t *team = this_thr->th.th_team;
423 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
424 register kmp_info_t **other_threads = team->t.t_threads;
425 register kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
426 register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
427 register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
428 register kmp_uint32 branch_factor = 1 << branch_bits;
429 register kmp_uint32 offset;
430 register kmp_uint32 level;
431
432 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
433 gtid, team->t.t_id, tid, bt));
434
435 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
436
437#if USE_ITT_BUILD && USE_ITT_NOTIFY
438 // Barrier imbalance - save arrive time to the thread
439 if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
440 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
441 }
442#endif
443 /* Perform a hypercube-embedded tree gather to wait until all of the threads have
444 arrived, and reduce any required data as we go. */
445 kmp_flag_64 p_flag(&thr_bar->b_arrived);
446 for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
447 {
448 register kmp_uint32 child;
449 register kmp_uint32 child_tid;
450
451 if (((tid >> level) & (branch_factor - 1)) != 0) {
452 register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);
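        // The parent is found by clearing the low (level + branch_bits) bits of tid. E.g. with
        // branch_bits==2: at level 0, tid 6 reports to tid 4; at level 2, tid 4 reports to tid 0.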
453
454 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
455 "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
456 __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
457 &thr_bar->b_arrived, thr_bar->b_arrived,
458 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
459 // Mark arrival to parent thread
460 /* After performing this write (in the last iteration of the enclosing for loop),
461 a worker thread may not assume that the team is valid any more - it could be
462 deallocated by the master thread at any time. */
463 p_flag.set_waiter(other_threads[parent_tid]);
464 p_flag.release();
465 break;
466 }
467
468 // Parent threads wait for children to arrive
469 if (new_state == KMP_BARRIER_UNUSED_STATE)
470 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
471 for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
472 child++, child_tid+=(1 << level))
473 {
474 register kmp_info_t *child_thr = other_threads[child_tid];
475 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
476#if KMP_CACHE_MANAGE
477 register kmp_uint32 next_child_tid = child_tid + (1 << level);
478 // Prefetch next thread's arrived count
479 if (child+1 < branch_factor && next_child_tid < num_threads)
480 KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
481#endif /* KMP_CACHE_MANAGE */
482 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
483 "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
484 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
485 &child_bar->b_arrived, new_state));
486 // Wait for child to arrive
487 kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
488 c_flag.wait(this_thr, FALSE
489 USE_ITT_BUILD_ARG(itt_sync_obj) );
490#if USE_ITT_BUILD && USE_ITT_NOTIFY
491 // Barrier imbalance - write min of the thread time and a child time to the thread.
492 if (__kmp_forkjoin_frames_mode == 2) {
493 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
494 child_thr->th.th_bar_min_time);
495 }
496#endif
497 if (reduce) {
498 KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
499 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
500 team->t.t_id, child_tid));
501 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
502 }
503 }
504 }
505
506 if (KMP_MASTER_TID(tid)) {
507 // Need to update the team arrived pointer if we are the master thread
508 if (new_state == KMP_BARRIER_UNUSED_STATE)
509 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
510 else
511 team->t.t_bar[bt].b_arrived = new_state;
512 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
513 gtid, team->t.t_id, tid, team->t.t_id,
514 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
515 }
516 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
517 gtid, team->t.t_id, tid, bt));
518}
519
520// The reverse versions seem to beat the forward versions overall
521#define KMP_REVERSE_HYPER_BAR
522static void
523__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
524 int propagate_icvs
525 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
526{
527 KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_release);
528 register kmp_team_t *team;
529 register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
530 register kmp_info_t **other_threads;
531 register kmp_uint32 num_threads;
532 register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ];
533 register kmp_uint32 branch_factor = 1 << branch_bits;
534 register kmp_uint32 child;
535 register kmp_uint32 child_tid;
536 register kmp_uint32 offset;
537 register kmp_uint32 level;
538
539 /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
540 If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
541 order of the corresponding gather, otherwise threads are released in the same order. */
542 if (KMP_MASTER_TID(tid)) { // master
543 team = __kmp_threads[gtid]->th.th_team;
544 KMP_DEBUG_ASSERT(team != NULL);
545 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
546 gtid, team->t.t_id, tid, bt));
547#if KMP_BARRIER_ICV_PUSH
548 if (propagate_icvs) { // master already has ICVs in final destination; copy
549 copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
550 }
551#endif
552 }
553 else { // Handle fork barrier workers who aren't part of a team yet
554 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
555 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
556 // Wait for parent thread to release us
557 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
558 flag.wait(this_thr, TRUE
559 USE_ITT_BUILD_ARG(itt_sync_obj) );
560#if USE_ITT_BUILD && USE_ITT_NOTIFY
561 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
562 // In fork barrier where we could not get the object reliably
563 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
564 // Cancel wait on previous parallel region...
565 __kmp_itt_task_starting(itt_sync_obj);
566
567 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
568 return;
569
570 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
571 if (itt_sync_obj != NULL)
572 // Call prepare as early as possible for "new" barrier
573 __kmp_itt_task_finished(itt_sync_obj);
574 } else
575#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
576 // Early exit for reaping threads releasing forkjoin barrier
577 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
578 return;
579
580 // The worker thread may now assume that the team is valid.
581 team = __kmp_threads[gtid]->th.th_team;
582 KMP_DEBUG_ASSERT(team != NULL);
583 tid = __kmp_tid_from_gtid(gtid);
584
585 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
586 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
587 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
588 KMP_MB(); // Flush all pending memory write invalidates.
589 }
590 num_threads = this_thr->th.th_team_nproc;
591 other_threads = team->t.t_threads;
592
593#ifdef KMP_REVERSE_HYPER_BAR
594 // Count up to correct level for parent
595 for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
596 level+=branch_bits, offset<<=branch_bits);
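    // The loop above climbs while tid is a subtree root at each level (its low bits are zero),
    // stopping at the highest level this thread owns; the descent below then releases children
    // at the highest level first, i.e. the reverse of the order used by the gather.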
597
598 // Now go down from there
599 for (level-=branch_bits, offset>>=branch_bits; offset != 0;
600 level-=branch_bits, offset>>=branch_bits)
601#else
602 // Go down the tree, level by level
603 for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
604#endif // KMP_REVERSE_HYPER_BAR
605 {
606#ifdef KMP_REVERSE_HYPER_BAR
607 /* Now go in reverse order through the children, highest to lowest.
608 Initial setting of child is conservative here. */
609 child = num_threads >> ((level==0)?level:level-1);
610 for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
611 child>=1; child--, child_tid-=(1<<level))
612#else
613 if (((tid >> level) & (branch_factor - 1)) != 0)
614 // No need to go lower than this, since this is the level at which this thread was notified by its parent
615 break;
616 // Iterate through children on this level of the tree
617 for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
618 child++, child_tid+=(1<<level))
619#endif // KMP_REVERSE_HYPER_BAR
620 {
621 if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
622 else {
623 register kmp_info_t *child_thr = other_threads[child_tid];
624 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
625#if KMP_CACHE_MANAGE
626 register kmp_uint32 next_child_tid = child_tid - (1 << level);
627 // Prefetch next thread's go count
628# ifdef KMP_REVERSE_HYPER_BAR
629 if (child-1 >= 1 && next_child_tid < num_threads)
630# else
631 if (child+1 < branch_factor && next_child_tid < num_threads)
632# endif // KMP_REVERSE_HYPER_BAR
633 KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
634#endif /* KMP_CACHE_MANAGE */
635
636#if KMP_BARRIER_ICV_PUSH
637 if (propagate_icvs) // push my fixed ICVs to my child
638 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
639#endif // KMP_BARRIER_ICV_PUSH
640
641 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
642 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
643 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
644 child_tid, &child_bar->b_go, child_bar->b_go,
645 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
646 // Release child from barrier
647 kmp_flag_64 flag(&child_bar->b_go, child_thr);
648 flag.release();
649 }
650 }
651 }
652#if KMP_BARRIER_ICV_PUSH
653 if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
654 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
655 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
656 }
657#endif
658 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
659 gtid, team->t.t_id, tid, bt));
660}
661
662// Hierarchical Barrier
663
664// Initialize thread barrier data
665/* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
666 minimum amount of initialization required based on how the team has changed. Returns true if
667 leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
668 team size increases, threads already in the team will respond to on-core wakeup on their parent
669 thread, but threads newly added to the team will only be listening on their local b_go. */
670static bool
671__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
672 int gtid, int tid, kmp_team_t *team)
673{
674 // Checks to determine if (re-)initialization is needed
675 bool uninitialized = thr_bar->team == NULL;
676 bool team_changed = team != thr_bar->team;
677 bool team_sz_changed = nproc != thr_bar->nproc;
678 bool tid_changed = tid != thr_bar->old_tid;
679 bool retval = false;
680
681 if (uninitialized || team_sz_changed) {
682 __kmp_get_hierarchy(nproc, thr_bar);
683 }
684
685 if (uninitialized || team_sz_changed || tid_changed) {
686 thr_bar->my_level = thr_bar->depth-1; // default for master
687 thr_bar->parent_tid = -1; // default for master
688 if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
689 kmp_uint32 d=0;
690 while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
691 kmp_uint32 rem;
692 if (d == thr_bar->depth-2) { // reached level right below the master
693 thr_bar->parent_tid = 0;
694 thr_bar->my_level = d;
695 break;
696 }
697 else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
698 // thread is not a subtree root at the next level, so this is its max level
699 thr_bar->parent_tid = tid - rem;
700 thr_bar->my_level = d;
701 break;
702 }
703 ++d;
704 }
705 }
706 thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
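        // Roughly: the parent's 64-bit flag doubles as 8 one-byte slots for its on-core (leaf)
        // children; a child that is the k-th thread after its parent (k = tid - parent_tid - 1)
        // signals and waits on byte 7-k, so the first child uses byte 7, the next byte 6, and so
        // on, leaving the low bytes for the barrier's epoch count.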
707 thr_bar->old_tid = tid;
708 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
709 thr_bar->team = team;
710 thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
711 }
712 if (uninitialized || team_changed || tid_changed) {
713 thr_bar->team = team;
714 thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
715 retval = true;
716 }
717 if (uninitialized || team_sz_changed || tid_changed) {
718 thr_bar->nproc = nproc;
719 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
720 if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
721 if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
722 thr_bar->leaf_kids = nproc - tid - 1;
723 thr_bar->leaf_state = 0;
724 for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
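        // E.g. with 3 leaf kids, bytes 7, 6 and 5 of leaf_state are set; the gather later
        // completes only once each of those children has written its byte into this thread's
        // b_arrived flag.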
725 }
726 return retval;
727}
728
729static void
730__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
731 int gtid, int tid, void (*reduce) (void *, void *)
732 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
733{
734 KMP_TIME_DEVELOPER_BLOCK(KMP_hier_gather);
735 register kmp_team_t *team = this_thr->th.th_team;
736 register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
737 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
738 register kmp_info_t **other_threads = team->t.t_threads;
739 register kmp_uint64 new_state;
740
741 int level = team->t.t_level;
742#if OMP_40_ENABLED
743 if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
744 if (this_thr->th.th_teams_size.nteams > 1)
745 ++level; // level was not increased in teams construct for team_of_masters
746#endif
747 if (level == 1) thr_bar->use_oncore_barrier = 1;
748 else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
749
750 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
751 gtid, team->t.t_id, tid, bt));
752 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
753
754#if USE_ITT_BUILD && USE_ITT_NOTIFY
755 // Barrier imbalance - save arrive time to the thread
756 if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
757 this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
758 }
759#endif
760
761 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
762
763 if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
764 register kmp_int32 child_tid;
765 new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
766 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
767 if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
768 kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
769 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting for leaf kids\n",
770 gtid, team->t.t_id, tid));
771 kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
772 flag.wait(this_thr, FALSE
773 USE_ITT_BUILD_ARG(itt_sync_obj) );
774 if (reduce) {
775 for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
776 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
777 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
778 team->t.t_id, child_tid));
779 (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
780 }
781 }
782 (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
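                // The atomic AND clears the per-child bytes again, so b_arrived holds only the
                // plain epoch value before this thread waits on its non-leaf children below.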
783 }
784 // Next, wait for higher level children on each child's b_arrived flag
785 for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
786 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
787 if (last > nproc) last = nproc;
788 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
789 register kmp_info_t *child_thr = other_threads[child_tid];
790 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
791 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
792 "arrived(%p) == %llu\n",
793 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
794 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
795 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
796 flag.wait(this_thr, FALSE
797 USE_ITT_BUILD_ARG(itt_sync_obj) );
798 if (reduce) {
799 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
800 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
801 team->t.t_id, child_tid));
802 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
803 }
804 }
805 }
806 }
807 else { // Blocktime is not infinite
808 for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
809 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
810 if (last > nproc) last = nproc;
811 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
812 register kmp_info_t *child_thr = other_threads[child_tid];
813 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
814 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
815 "arrived(%p) == %llu\n",
816 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
817 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
818 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
819 flag.wait(this_thr, FALSE
820 USE_ITT_BUILD_ARG(itt_sync_obj) );
821 if (reduce) {
822 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
823 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
824 team->t.t_id, child_tid));
825 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
826 }
827 }
828 }
829 }
830 }
831 // All subordinates are gathered; now release parent if not master thread
832
833 if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
834 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
835 "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
836 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
837 &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
838 /* Mark arrival to parent: After performing this write, a worker thread may not assume that
839 the team is valid any more - it could be deallocated by the master thread at any time. */
840 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
841 || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
842 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
843 flag.release();
844 }
845 else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
846 thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
847 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
848 flag.set_waiter(other_threads[thr_bar->parent_tid]);
849 flag.release();
850 }
851 } else { // Master thread needs to update the team's b_arrived value
852 team->t.t_bar[bt].b_arrived = new_state;
853 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
854 gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
855 }
856 // Is the team access below unsafe or just technically invalid?
857 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
858 gtid, team->t.t_id, tid, bt));
859}
860
861static void
862__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
863 int propagate_icvs
864 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
865{
866 KMP_TIME_DEVELOPER_BLOCK(KMP_hier_release);
867 register kmp_team_t *team;
868 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
869 register kmp_uint32 nproc;
870 bool team_change = false; // indicates on-core barrier shouldn't be used
871
872 if (KMP_MASTER_TID(tid)) {
873 team = __kmp_threads[gtid]->th.th_team;
874 KMP_DEBUG_ASSERT(team != NULL);
875 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
876 gtid, team->t.t_id, tid, bt));
877 }
878 else { // Worker threads
879 // Wait for parent thread to release me
880 if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
881 || thr_bar->my_level != 0 || thr_bar->team == NULL) {
882 // Use traditional method of waiting on my own b_go flag
883 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
884 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
885 flag.wait(this_thr, TRUE
886 USE_ITT_BUILD_ARG(itt_sync_obj) );
887 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
888 }
889 else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
890 // Wait on my "offset" bits on parent's b_go flag
891 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
892 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
893 bt, this_thr
894 USE_ITT_BUILD_ARG(itt_sync_obj) );
895 flag.wait(this_thr, TRUE);
896 if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
897 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
898 }
899 else { // Reset my bits on parent's b_go flag
900 ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
901 }
902 }
903 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
904 // Early exit for reaping threads releasing forkjoin barrier
905 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
906 return;
907 // The worker thread may now assume that the team is valid.
908 team = __kmp_threads[gtid]->th.th_team;
909 KMP_DEBUG_ASSERT(team != NULL);
910 tid = __kmp_tid_from_gtid(gtid);
911
912 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
913 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
914 KMP_MB(); // Flush all pending memory write invalidates.
915 }
916
917 nproc = this_thr->th.th_team_nproc;
918 int level = team->t.t_level;
919#if OMP_40_ENABLED
920 if (team->t.t_threads[0]->th.th_teams_microtask ) { // are we inside the teams construct?
921 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
922 ++level; // level was not increased in teams construct for team_of_workers
923 if( this_thr->th.th_teams_size.nteams > 1 )
924 ++level; // level was not increased in teams construct for team_of_masters
925 }
926#endif
927 if (level == 1) thr_bar->use_oncore_barrier = 1;
928 else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
929
930 // If the team size has increased, we still communicate with old leaves via oncore barrier.
931 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
932 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
933 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
934 // But if the entire team changes, we won't use oncore barrier at all
935 if (team_change) old_leaf_kids = 0;
936
937#if KMP_BARRIER_ICV_PUSH
938 if (propagate_icvs) {
939 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
940 if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
941 copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
942 }
943 else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
944 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
945 // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
946 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
947 &thr_bar->parent_bar->th_fixed_icvs);
948 // non-leaves will get ICVs piggybacked with b_go via NGO store
949 }
950 else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
951 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
952 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
953 else // leaves copy parent's fixed ICVs directly to local ICV store
954 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
955 &thr_bar->parent_bar->th_fixed_icvs);
956 }
957 }
958#endif // KMP_BARRIER_ICV_PUSH
959
960 // Now, release my children
961 if (thr_bar->my_level) { // not a leaf
962 register kmp_int32 child_tid;
963 kmp_uint32 last;
964 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
965 if (KMP_MASTER_TID(tid)) { // do a flat release
966 // Set local b_go to bump children via NGO store of the cache line containing IVCs and b_go.
967 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
968 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
969 ngo_load(&thr_bar->th_fixed_icvs);
970 // This loops over all the threads skipping only the leaf nodes in the hierarchy
971 for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
972 register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
973 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
974 " go(%p): %u => %u\n",
975 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
976 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
977 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
978 // Use ngo store (if available) to both store ICVs and release child via child's b_go
979 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
980 }
981 ngo_sync();
982 }
983 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
984 // Now, release leaf children
985 if (thr_bar->leaf_kids) { // if there are any
986 // We test team_change on the off-chance that the level 1 team changed.
987 if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
988 if (old_leaf_kids) { // release old leaf kids
989 thr_bar->b_go |= old_leaf_state;
990 }
991 // Release new leaf kids
992 last = tid+thr_bar->skip_per_level[1];
993 if (last > nproc) last = nproc;
994 for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
995 register kmp_info_t *child_thr = team->t.t_threads[child_tid];
996 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
997 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
998 " T#%d(%d:%d) go(%p): %u => %u\n",
999 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1000 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1001 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1002 // Release child using child's b_go flag
1003 kmp_flag_64 flag(&child_bar->b_go, child_thr);
1004 flag.release();
1005 }
1006 }
1007 else { // Release all children at once with leaf_state bits on my own b_go flag
1008 thr_bar->b_go |= thr_bar->leaf_state;
1009 }
1010 }
1011 }
1012 else { // Blocktime is not infinite; do a simple hierarchical release
1013 for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
1014 last = tid+thr_bar->skip_per_level[d+1];
1015 kmp_uint32 skip = thr_bar->skip_per_level[d];
1016 if (last > nproc) last = nproc;
1017 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
1018 register kmp_info_t *child_thr = team->t.t_threads[child_tid];
1019 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1020 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
1021 " go(%p): %u => %u\n",
1022 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1023 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1024 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1025 // Release child using child's b_go flag
1026 kmp_flag_64 flag(&child_bar->b_go, child_thr);
1027 flag.release();
1028 }
1029 }
1030 }
1031#if KMP_BARRIER_ICV_PUSH
1032 if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
1033 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
1034#endif // KMP_BARRIER_ICV_PUSH
1035 }
1036 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1037 gtid, team->t.t_id, tid, bt));
1038}
1039
1040// ---------------------------- End of Barrier Algorithms ----------------------------
1041
1042// Internal function to do a barrier.
1043/* If is_split is true, do a split barrier, otherwise, do a plain barrier
1044 If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier
1045 Returns 0 if master thread, 1 if worker thread. */
1046int
1047__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1048 void *reduce_data, void (*reduce)(void *, void *))
1049{
1050 KMP_TIME_DEVELOPER_BLOCK(KMP_barrier);
1051 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1052 KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
1053 register int tid = __kmp_tid_from_gtid(gtid);
1054 register kmp_info_t *this_thr = __kmp_threads[gtid];
1055 register kmp_team_t *team = this_thr->th.th_team;
1056 register int status = 0;
1057 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1058#if OMPT_SUPPORT
1059 ompt_task_id_t my_task_id;
1060 ompt_parallel_id_t my_parallel_id;
1061#endif
1062
1063 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1064 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1065
1066#if OMPT_SUPPORT
1067 if (ompt_enabled) {
1068#if OMPT_BLAME
1069 my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
1070 my_parallel_id = team->t.ompt_team_info.parallel_id;
1071
1072#if OMPT_TRACE
1073 if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
1074 if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
1075 ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
1076 my_parallel_id, my_task_id);
1077 }
1078 }
1079#endif
1080 if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1081 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1082 my_parallel_id, my_task_id);
1083 }
1084#endif
1085 // It is OK to report the barrier state after the barrier begin callback.
1086 // According to the OMPT specification, a compliant implementation may
1087 // even delay reporting this state until the barrier begins to wait.
1088 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1089 }
1090#endif
1091
1092 if (! team->t.t_serialized) {
1093#if USE_ITT_BUILD
1094 // This value will be used in itt notify events below.
1095 void *itt_sync_obj = NULL;
1096# if USE_ITT_NOTIFY
1097 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1098 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1099# endif
1100#endif /* USE_ITT_BUILD */
1101 if (__kmp_tasking_mode == tskm_extra_barrier) {
1102 __kmp_tasking_barrier(team, this_thr, gtid);
1103 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1104 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1105 }
1106
1107 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1108 the team struct is not guaranteed to exist. */
1109 // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1110 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1111 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1112 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1113 }
1114
1115#if USE_ITT_BUILD
1116 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1117 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1118#endif /* USE_ITT_BUILD */
1119#if USE_DEBUGGER
1120 // Let the debugger know: the thread arrived to the barrier and waiting.
1121 if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1122 team->t.t_bar[bt].b_master_arrived += 1;
1123 } else {
1124 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1125 } // if
1126#endif /* USE_DEBUGGER */
1127 if (reduce != NULL) {
1128 //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1129 this_thr->th.th_local.reduce_data = reduce_data;
1130 }
1131
1132 if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1133 __kmp_task_team_setup(this_thr, team, 0); // use 0 to only setup the current team if nthreads > 1
1134
1135 switch (__kmp_barrier_gather_pattern[bt]) {
1136 case bp_hyper_bar: {
1137 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1138 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1139 USE_ITT_BUILD_ARG(itt_sync_obj) );
1140 break;
1141 }
1142 case bp_hierarchical_bar: {
1143 __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1144 USE_ITT_BUILD_ARG(itt_sync_obj));
1145 break;
1146 }
1147 case bp_tree_bar: {
1148 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1149 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1150 USE_ITT_BUILD_ARG(itt_sync_obj) );
1151 break;
1152 }
1153 default: {
1154 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1155 USE_ITT_BUILD_ARG(itt_sync_obj) );
1156 }
1157 }
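        /* Note (illustrative): the algorithm chosen above per barrier type comes from
           __kmp_barrier_gather_pattern[] (and, for the release below,
           __kmp_barrier_release_pattern[]), which the runtime initializes from environment
           variables such as KMP_PLAIN_BARRIER_PATTERN and KMP_FORKJOIN_BARRIER_PATTERN with
           values like "linear", "tree", "hyper" or "hierarchical", e.g.
               KMP_PLAIN_BARRIER_PATTERN=hyper,hyper ./a.out
           The exact variable names and accepted spellings are assumptions here; see
           kmp_settings.c for the authoritative list. */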
1158
1159 KMP_MB();
1160
1161 if (KMP_MASTER_TID(tid)) {
1162 status = 0;
1163 if (__kmp_tasking_mode != tskm_immediate_exec) {
1164 __kmp_task_team_wait(this_thr, team
1165 USE_ITT_BUILD_ARG(itt_sync_obj) );
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001166 }
Jonathan Peyton8fbb49a2015-07-09 18:16:58 +00001167#if USE_DEBUGGER
1168            // Let the debugger know: all threads have arrived and are starting to leave the barrier.
1169 team->t.t_bar[bt].b_team_arrived += 1;
1170#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001171
1172#if USE_ITT_BUILD
1173            /* TODO: In the case of a split reduction barrier, the master thread may send the
1174               acquired event early, before the final summation into the shared variable is done
1175               (the final summation can be a long operation for array reductions). */
1176 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1177 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1178#endif /* USE_ITT_BUILD */
1179#if USE_ITT_BUILD && USE_ITT_NOTIFY
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001180 // Barrier - report frame end (only if active_level == 1)
1181 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1182#if OMP_40_ENABLED
1183 this_thr->th.th_teams_microtask == NULL &&
1184#endif
1185 team->t.t_active_level == 1)
1186 {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001187 kmp_uint64 cur_time = __itt_get_timestamp();
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001188 kmp_info_t **other_threads = team->t.t_threads;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001189 int nproc = this_thr->th.th_team_nproc;
1190 int i;
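            /* Roughly: __kmp_forkjoin_frames_mode (assumed to be set from
               KMP_FORKJOIN_FRAMES_MODE; see kmp_settings.c) selects what is reported to
               ITT here. Mode 1 submits a frame from th_frame_time up to this barrier,
               mode 2 submits a frame starting at the earliest barrier arrival time
               (th_bar_min_time), and mode 3 additionally reports per-team imbalance
               metadata computed from the threads' arrival times. */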
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001191 switch(__kmp_forkjoin_frames_mode) {
1192 case 1:
1193 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1194 this_thr->th.th_frame_time = cur_time;
1195 break;
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001196 case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001197 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1198 break;
1199 case 3:
1200 if( __itt_metadata_add_ptr ) {
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001201 // Initialize with master's wait time
1202 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
Jonathan Peyton99ef4d02016-04-14 16:06:49 +00001203 // Set arrive time to zero to be able to check it in __kmp_invoke_task(); the same is done inside the loop below
1204 this_thr->th.th_bar_arrive_time = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001205 for (i=1; i<nproc; ++i) {
1206 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
Jonathan Peyton99ef4d02016-04-14 16:06:49 +00001207 other_threads[i]->th.th_bar_arrive_time = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001208 }
1209 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1210 }
1211 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1212 this_thr->th.th_frame_time = cur_time;
1213 break;
1214 }
1215 }
1216#endif /* USE_ITT_BUILD */
1217 } else {
1218 status = 1;
1219#if USE_ITT_BUILD
1220 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1221 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1222#endif /* USE_ITT_BUILD */
1223 }
1224 if (status == 1 || ! is_split) {
1225 switch (__kmp_barrier_release_pattern[bt]) {
1226 case bp_hyper_bar: {
1227 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1228 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1229 USE_ITT_BUILD_ARG(itt_sync_obj) );
1230 break;
1231 }
1232 case bp_hierarchical_bar: {
1233 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1234 USE_ITT_BUILD_ARG(itt_sync_obj) );
1235 break;
1236 }
1237 case bp_tree_bar: {
1238 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1239 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1240 USE_ITT_BUILD_ARG(itt_sync_obj) );
1241 break;
1242 }
1243 default: {
1244 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1245 USE_ITT_BUILD_ARG(itt_sync_obj) );
1246 }
1247 }
1248 if (__kmp_tasking_mode != tskm_immediate_exec) {
1249 __kmp_task_team_sync(this_thr, team);
1250 }
1251 }
1252
1253#if USE_ITT_BUILD
1254 /* GEH: TODO: Move this under if-condition above and also include in
1255 __kmp_end_split_barrier(). This will more accurately represent the actual release time
1256 of the threads for split barriers. */
1257 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1258 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1259#endif /* USE_ITT_BUILD */
1260 } else { // Team is serialized.
1261 status = 0;
1262 if (__kmp_tasking_mode != tskm_immediate_exec) {
Jonathan Peytondf6818b2016-06-14 17:57:47 +00001263#if OMP_45_ENABLED
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001264 if ( this_thr->th.th_task_team != NULL ) {
1265 void *itt_sync_obj = NULL;
1266#if USE_ITT_NOTIFY
1267 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1268 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1269 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1270 }
1271#endif
1272
Jonathan Peytonfe9a1d72015-08-26 19:58:48 +00001273 KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE);
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001274 __kmp_task_team_wait(this_thr, team
1275 USE_ITT_BUILD_ARG(itt_sync_obj));
Jonathan Peyton54127982015-11-04 21:37:48 +00001276 __kmp_task_team_setup(this_thr, team, 0);
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001277
1278#if USE_ITT_BUILD
1279 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1280 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1281#endif /* USE_ITT_BUILD */
1282 }
1283#else
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001284 // The task team should be NULL for serialized code (tasks will be executed immediately)
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001285 KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001286 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001287#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001288 }
1289 }
1290 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1291 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001292
1293#if OMPT_SUPPORT
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001294 if (ompt_enabled) {
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001295#if OMPT_BLAME
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001296 if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001297 ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1298 my_parallel_id, my_task_id);
1299 }
1300#endif
1301 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1302 }
1303#endif
1304
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001305 return status;
1306}
1307
1308
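/* __kmp_end_split_barrier completes the release half of a split barrier: when
   __kmp_barrier() above is called with is_split=TRUE, the master skips the release
   pattern (workers remain parked on their b_go flags), does the work the split was for
   (e.g. the final reduction), and then calls this routine to let the workers go.
   Hedged caller-side sketch on the master thread, with the __kmp_barrier() argument
   list reproduced from its uses rather than verified against the prototype in kmp.h:

       __kmp_barrier(bs_plain_barrier, gtid, TRUE, reduce_size, reduce_data, reduce_func);
       // ... master combines partial results here ...
       __kmp_end_split_barrier(bs_plain_barrier, gtid);
*/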
1309void
1310__kmp_end_split_barrier(enum barrier_type bt, int gtid)
1311{
Jonathan Peyton45be4502015-08-11 21:36:41 +00001312 KMP_TIME_DEVELOPER_BLOCK(KMP_end_split_barrier);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001313 int tid = __kmp_tid_from_gtid(gtid);
1314 kmp_info_t *this_thr = __kmp_threads[gtid];
1315 kmp_team_t *team = this_thr->th.th_team;
1316
1317 if (!team->t.t_serialized) {
1318 if (KMP_MASTER_GTID(gtid)) {
1319 switch (__kmp_barrier_release_pattern[bt]) {
1320 case bp_hyper_bar: {
1321 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1322 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1323 USE_ITT_BUILD_ARG(NULL) );
1324 break;
1325 }
1326 case bp_hierarchical_bar: {
1327 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1328 USE_ITT_BUILD_ARG(NULL));
1329 break;
1330 }
1331 case bp_tree_bar: {
1332 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1333 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1334 USE_ITT_BUILD_ARG(NULL) );
1335 break;
1336 }
1337 default: {
1338 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1339 USE_ITT_BUILD_ARG(NULL) );
1340 }
1341 }
1342 if (__kmp_tasking_mode != tskm_immediate_exec) {
1343 __kmp_task_team_sync(this_thr, team);
1344 } // if
1345 }
1346 }
1347}
1348
1349
1350void
1351__kmp_join_barrier(int gtid)
1352{
Jonathan Peyton11dc82f2016-05-05 16:15:57 +00001353 KMP_TIME_PARTITIONED_BLOCK(OMP_fork_join_barrier);
1354 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
Jonathan Peyton45be4502015-08-11 21:36:41 +00001355 KMP_TIME_DEVELOPER_BLOCK(KMP_join_barrier);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001356 register kmp_info_t *this_thr = __kmp_threads[gtid];
1357 register kmp_team_t *team;
1358 register kmp_uint nproc;
1359 kmp_info_t *master_thread;
1360 int tid;
1361#ifdef KMP_DEBUG
1362 int team_id;
1363#endif /* KMP_DEBUG */
1364#if USE_ITT_BUILD
1365 void *itt_sync_obj = NULL;
1366# if USE_ITT_NOTIFY
1367 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1368 // Get object created at fork_barrier
1369 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1370# endif
1371#endif /* USE_ITT_BUILD */
1372 KMP_MB();
1373
1374 // Get current info
1375 team = this_thr->th.th_team;
1376 nproc = this_thr->th.th_team_nproc;
1377 KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1378 tid = __kmp_tid_from_gtid(gtid);
1379#ifdef KMP_DEBUG
1380 team_id = team->t.t_id;
1381#endif /* KMP_DEBUG */
1382 master_thread = this_thr->th.th_team_master;
1383#ifdef KMP_DEBUG
1384 if (master_thread != team->t.t_threads[0]) {
1385 __kmp_print_structure();
1386 }
1387#endif /* KMP_DEBUG */
1388 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1389 KMP_MB();
1390
1391 // Verify state
1392 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1393 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1394 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1395 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1396 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1397
Jonathan Peyton61118492016-05-20 19:03:38 +00001398#if OMPT_SUPPORT
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001399#if OMPT_TRACE
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001400 if (ompt_enabled &&
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001401 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1402 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1403 team->t.ompt_team_info.parallel_id,
1404 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1405 }
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001406#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001407 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1408#endif
1409
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001410 if (__kmp_tasking_mode == tskm_extra_barrier) {
1411 __kmp_tasking_barrier(team, this_thr, gtid);
1412        KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
1413 }
1414# ifdef KMP_DEBUG
1415 if (__kmp_tasking_mode != tskm_immediate_exec) {
1416 KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001417 __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001418 this_thr->th.th_task_team));
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001419 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001420 }
1421# endif /* KMP_DEBUG */
1422
1423    /* Copy the blocktime info to the thread, so that __kmp_wait_template() can access it even when
1424       the team struct is no longer guaranteed to exist. Doing these loads causes a cache miss that
1425       slows down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1426       since the values are not used by __kmp_wait_template() in that case. */
1427 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1428 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1429 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1430 }
1431
1432#if USE_ITT_BUILD
1433 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1434 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1435#endif /* USE_ITT_BUILD */
1436
1437 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1438 case bp_hyper_bar: {
1439 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1440 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1441 USE_ITT_BUILD_ARG(itt_sync_obj) );
1442 break;
1443 }
1444 case bp_hierarchical_bar: {
1445 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1446 USE_ITT_BUILD_ARG(itt_sync_obj) );
1447 break;
1448 }
1449 case bp_tree_bar: {
1450 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1451 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1452 USE_ITT_BUILD_ARG(itt_sync_obj) );
1453 break;
1454 }
1455 default: {
1456 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1457 USE_ITT_BUILD_ARG(itt_sync_obj) );
1458 }
1459 }
1460
1461 /* From this point on, the team data structure may be deallocated at any time by the
1462 master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1463 data items that need to be referenced before the end of the barrier should be moved to
1464 the kmp_task_team_t structs. */
1465 if (KMP_MASTER_TID(tid)) {
1466 if (__kmp_tasking_mode != tskm_immediate_exec) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001467 __kmp_task_team_wait(this_thr, team
1468 USE_ITT_BUILD_ARG(itt_sync_obj) );
1469 }
Jonathan Peyton11dc82f2016-05-05 16:15:57 +00001470#if KMP_STATS_ENABLED
1471        // Have the master thread flag the workers to indicate that they are now waiting for the
1472        // next parallel region. Also wake them up so they switch their timers to idle.
1473 for (int i=0; i<team->t.t_nproc; ++i) {
1474 kmp_info_t* team_thread = team->t.t_threads[i];
1475 if (team_thread == this_thr)
1476 continue;
1477 team_thread->th.th_stats->setIdleFlag();
1478 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && team_thread->th.th_sleep_loc != NULL)
1479 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread), team_thread->th.th_sleep_loc);
1480 }
1481#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001482#if USE_ITT_BUILD
1483 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1484 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1485#endif /* USE_ITT_BUILD */
1486
1487# if USE_ITT_BUILD && USE_ITT_NOTIFY
1488 // Join barrier - report frame end
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001489 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1490#if OMP_40_ENABLED
1491 this_thr->th.th_teams_microtask == NULL &&
1492#endif
1493 team->t.t_active_level == 1)
1494 {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001495 kmp_uint64 cur_time = __itt_get_timestamp();
1496 ident_t * loc = team->t.t_ident;
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001497 kmp_info_t **other_threads = team->t.t_threads;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001498 int nproc = this_thr->th.th_team_nproc;
1499 int i;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001500 switch(__kmp_forkjoin_frames_mode) {
1501 case 1:
1502 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1503 break;
1504 case 2:
1505 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1506 break;
1507 case 3:
1508 if( __itt_metadata_add_ptr ) {
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001509 // Initialize with master's wait time
1510 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
Jonathan Peyton99ef4d02016-04-14 16:06:49 +00001511 // Set arrive time to zero to be able to check it in __kmp_invoke_task(); the same is done inside the loop below
1512 this_thr->th.th_bar_arrive_time = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001513 for (i=1; i<nproc; ++i) {
1514 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
Jonathan Peyton99ef4d02016-04-14 16:06:49 +00001515 other_threads[i]->th.th_bar_arrive_time = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001516 }
1517 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1518 }
1519 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1520 this_thr->th.th_frame_time = cur_time;
1521 break;
1522 }
1523 }
1524# endif /* USE_ITT_BUILD */
1525 }
1526#if USE_ITT_BUILD
1527 else {
1528 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1529 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1530 }
1531#endif /* USE_ITT_BUILD */
1532
1533#if KMP_DEBUG
1534 if (KMP_MASTER_TID(tid)) {
1535 KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1536 gtid, team_id, tid, nproc));
1537 }
1538#endif /* KMP_DEBUG */
1539
1540 // TODO now, mark worker threads as done so they may be disbanded
1541 KMP_MB(); // Flush all pending memory write invalidates.
1542 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001543
1544#if OMPT_SUPPORT
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001545 if (ompt_enabled) {
Jonathan Peytoncab67cc2015-09-18 16:24:46 +00001546#if OMPT_BLAME
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001547 if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001548 ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1549 team->t.ompt_team_info.parallel_id,
1550 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001551 }
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001552#endif
1553
1554 // return to default state
1555 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1556 }
1557#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001558}
1559
1560
1561// TODO release worker threads' fork barriers as we are ready instead of all at once
1562void
1563__kmp_fork_barrier(int gtid, int tid)
1564{
Jonathan Peyton11dc82f2016-05-05 16:15:57 +00001565 KMP_TIME_PARTITIONED_BLOCK(OMP_fork_join_barrier);
1566 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
Jonathan Peyton45be4502015-08-11 21:36:41 +00001567 KMP_TIME_DEVELOPER_BLOCK(KMP_fork_barrier);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001568 kmp_info_t *this_thr = __kmp_threads[gtid];
1569 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1570#if USE_ITT_BUILD
1571 void * itt_sync_obj = NULL;
1572#endif /* USE_ITT_BUILD */
1573
1574 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1575 gtid, (team != NULL) ? team->t.t_id : -1, tid));
1576
1577    // The th_team pointer is only valid for the master thread here
1578 if (KMP_MASTER_TID(tid)) {
1579#if USE_ITT_BUILD && USE_ITT_NOTIFY
1580 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1581 // Create itt barrier object
1582 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1583 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1584 }
1585#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1586
1587#ifdef KMP_DEBUG
1588 register kmp_info_t **other_threads = team->t.t_threads;
1589 register int i;
1590
1591 // Verify state
1592 KMP_MB();
1593
1594 for(i=1; i<team->t.t_nproc; ++i) {
1595 KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1596 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1597 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1598 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1599 KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1600 & ~(KMP_BARRIER_SLEEP_STATE))
1601 == KMP_INIT_BARRIER_STATE);
1602 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1603 }
1604#endif
1605
1606 if (__kmp_tasking_mode != tskm_immediate_exec) {
Jonathan Peyton54127982015-11-04 21:37:48 +00001607        __kmp_task_team_setup(this_thr, team, 0); // 0 indicates: set up the current task team if nthreads > 1
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001608 }
1609
1610 /* The master thread may have changed its blocktime between the join barrier and the
1611 fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1612 access it when the team struct is not guaranteed to exist. */
1613 // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1614 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1615 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1616 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1617 }
1618 } // master
1619
1620 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1621 case bp_hyper_bar: {
1622 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1623 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1624 USE_ITT_BUILD_ARG(itt_sync_obj) );
1625 break;
1626 }
1627 case bp_hierarchical_bar: {
1628 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1629 USE_ITT_BUILD_ARG(itt_sync_obj) );
1630 break;
1631 }
1632 case bp_tree_bar: {
1633 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1634 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1635 USE_ITT_BUILD_ARG(itt_sync_obj) );
1636 break;
1637 }
1638 default: {
1639 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1640 USE_ITT_BUILD_ARG(itt_sync_obj) );
1641 }
1642 }
1643
1644 // Early exit for reaping threads releasing forkjoin barrier
1645 if (TCR_4(__kmp_global.g.g_done)) {
Jonathan Peyton54127982015-11-04 21:37:48 +00001646 this_thr->th.th_task_team = NULL;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001647
1648#if USE_ITT_BUILD && USE_ITT_NOTIFY
1649 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1650 if (!KMP_MASTER_TID(tid)) {
1651 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1652 if (itt_sync_obj)
1653 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1654 }
1655 }
1656#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1657 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1658 return;
1659 }
1660
1661 /* We can now assume that a valid team structure has been allocated by the master and
1662 propagated to all worker threads. The current thread, however, may not be part of the
1663 team, so we can't blindly assume that the team pointer is non-null. */
1664 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1665 KMP_DEBUG_ASSERT(team != NULL);
1666 tid = __kmp_tid_from_gtid(gtid);
1667
1668
1669#if KMP_BARRIER_ICV_PULL
1670 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1671 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1672 this data before this function is called. We cannot modify __kmp_fork_call() to look at
1673 the fixed ICVs in the master's thread struct, because it is not always the case that the
1674 threads arrays have been allocated when __kmp_fork_call() is executed. */
Jonathan Peyton45be4502015-08-11 21:36:41 +00001675 {
1676 KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
1677 if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1678 // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1679 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1680 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1681 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1682 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1683 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001684 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001685#endif // KMP_BARRIER_ICV_PULL
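    /* Two ICV propagation schemes coexist in this file: with KMP_BARRIER_ICV_PULL
       (above) each worker copies the ICVs out of the master's th_fixed_icvs after it is
       released, while with KMP_BARRIER_ICV_PUSH the master pushes them to the workers as
       part of the fork-barrier release itself. If neither is enabled, the ICVs are
       copied linearly to every worker in __kmp_setup_icv_copy() below. */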
1686
1687 if (__kmp_tasking_mode != tskm_immediate_exec) {
1688 __kmp_task_team_sync(this_thr, team);
1689 }
1690
1691#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1692 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1693 if (proc_bind == proc_bind_intel) {
1694#endif
Andrey Churbanovf28f6132015-01-13 14:54:00 +00001695#if KMP_AFFINITY_SUPPORTED
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001696 // Call dynamic affinity settings
1697 if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1698 __kmp_balanced_affinity(tid, team->t.t_nproc);
1699 }
Andrey Churbanovf28f6132015-01-13 14:54:00 +00001700#endif // KMP_AFFINITY_SUPPORTED
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001701#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1702 }
Andrey Churbanov94e569e2015-03-10 09:19:47 +00001703 else if (proc_bind != proc_bind_false) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001704 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1705 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1706 __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1707 }
1708 else {
1709 __kmp_affinity_set_place(gtid);
1710 }
1711 }
1712#endif
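    /* Affinity on the fork path, illustratively: with the Intel-style
       KMP_AFFINITY=balanced setting, a team whose size changed is re-balanced across the
       machine (__kmp_balanced_affinity above), while an explicit OpenMP binding policy
       such as OMP_PROC_BIND=spread or close only moves a thread whose assigned place
       changed (__kmp_affinity_set_place). The environment-variable spellings are the
       customary ones and are not re-verified here. */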
1713
1714#if USE_ITT_BUILD && USE_ITT_NOTIFY
1715 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1716 if (!KMP_MASTER_TID(tid)) {
1717 // Get correct barrier object
1718 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1719 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1720 } // (prepare called inside barrier_release)
1721 }
1722#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1723 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1724}
1725
1726
1727void
1728__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1729{
Jonathan Peyton45be4502015-08-11 21:36:41 +00001730 KMP_TIME_DEVELOPER_BLOCK(KMP_setup_icv_copy);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001731
1732 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1733 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1734
1735 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1736 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1737 this data before this function is called. */
1738#if KMP_BARRIER_ICV_PULL
1739 /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains untouched), where
1740 all of the worker threads can access them and make their own copies after the barrier. */
1741 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1742 copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1743 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1744 0, team->t.t_threads[0], team));
1745#elif KMP_BARRIER_ICV_PUSH
1746 // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1747 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1748 0, team->t.t_threads[0], team));
1749#else
1750 // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
1751 ngo_load(new_icvs);
1752 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
Jonathan Peyton91b78702015-06-08 19:39:07 +00001753 for (int f=1; f<new_nproc; ++f) { // Skip the master thread
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001754 // TODO: GEH - pass in better source location info since usually NULL here
1755 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1756 f, team->t.t_threads[f], team));
1757 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1758 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1759 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1760 f, team->t.t_threads[f], team));
1761 }
1762 ngo_sync();
1763#endif // KMP_BARRIER_ICV_PULL
1764}