/*
 * kmp_barrier.cpp
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


#include "kmp.h"
#include "kmp_wait_release.h"
#include "kmp_stats.h"
#include "kmp_itt.h"
#include "kmp_os.h"


#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#include "tsan_annotations.h"

#if KMP_MIC && USE_NGO_STORES
// ICV copying
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src) ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync() ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */
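// Note (explanatory, not part of the original): on KMP_MIC the ngo_* macros above copy a
// cache line of ICVs with 512-bit stores; _mm512_storenrngo_pd is a non-globally-ordered
// store, so ngo_sync() issues a locked add (a full fence) to publish those stores before
// the released thread can observe its go flag. On other targets they are plain copies.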

void __kmp_print_structure(void); // Forward declaration

// ---------------------------- Barrier Algorithms ----------------------------

// Linear Barrier
static void
__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            void (*reduce)(void *, void *)
                            USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_linear_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // We now perform a linear reduction to signal that all of the threads have arrived.
    if (!KMP_MASTER_TID(tid)) {
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
                      thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
        // Mark arrival to master thread
        /* After performing this write, a worker thread may not assume that the team is valid
           any more - it could be deallocated by the master thread at any time. */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
        flag.release();
    } else {
        register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
        register int nproc = this_thr->th.th_team_nproc;
        register int i;
        // Don't have to worry about sleep bit here or atomic since team setting
        register kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

        // Collect all the worker team member threads.
        for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (i+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                          &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

            // Wait for worker thread to arrive
            kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and the other thread time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       other_threads[i]->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
                               team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
                ANNOTATE_REDUCE_AFTER(reduce);
                (*reduce)(this_thr->th.th_local.reduce_data,
                          other_threads[i]->th.th_local.reduce_data);
                ANNOTATE_REDUCE_BEFORE(reduce);
                ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
            }
        }
        // Don't have to worry about sleep bit here or atomic since team setting
        team_bar->b_arrived = new_state;
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
    }
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

static void
__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                             int propagate_icvs
                             USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_linear_release);
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_team_t *team;

    if (KMP_MASTER_TID(tid)) {
        register unsigned int i;
        register kmp_uint32 nproc = this_thr->th.th_team_nproc;
        register kmp_info_t **other_threads;

        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        other_threads = team->t.t_threads;

        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));

        if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
            {
                KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
                if (propagate_icvs) {
                    ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
                    for (i=1; i<nproc; ++i) {
                        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
                        ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                                       &team->t.t_implicit_task_taskdata[0].td_icvs);
                    }
                    ngo_sync();
                }
            }
#endif // KMP_BARRIER_ICV_PUSH

            // Now, release all of the worker threads
            for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
                // Prefetch next thread's go flag
                if (i+1 < nproc)
                    KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
                KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
                              &other_threads[i]->th.th_bar[bt].bb.b_go,
                              other_threads[i]->th.th_bar[bt].bb.b_go,
                              other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
                kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
                flag.release();
            }
        }
    } else { // Wait for the MASTER thread to release us
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
                return;
        // The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
        tid = __kmp_tid_from_gtid(gtid);
        team = __kmp_threads[gtid]->th.th_team;
#endif
        KMP_DEBUG_ASSERT(team != NULL);
        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// Tree barrier
static void
__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                          void (*reduce)(void *, void *)
                          USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_tree_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint64 new_state;

    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
    child_tid = (tid << branch_bits) + 1;
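    // With branching factor 2^branch_bits, the children of tid are
    // (tid << branch_bits) + 1 .. (tid << branch_bits) + branch_factor,
    // so tid 0 is the root and each thread has at most branch_factor children.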
    if (child_tid < nproc) {
        // Parent threads wait for all their children to arrive
        new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        child = 1;
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 flag(&child_bar->b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                ANNOTATE_REDUCE_AFTER(reduce);
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                ANNOTATE_REDUCE_BEFORE(reduce);
                ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
            }
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }

    if (!KMP_MASTER_TID(tid)) { // Worker threads
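        // Inverse of the child mapping above: the parent of tid is (tid - 1) >> branch_bits.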
        register kmp_int32 parent_tid = (tid - 1) >> branch_bits;

        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived,
                      thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

        // Mark arrival to parent thread
        /* After performing this write, a worker thread may not assume that the team is valid
           any more - it could be deallocated by the master thread at any time. */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
        flag.release();
    } else {
        // Need to update the team arrived pointer if we are the master thread
        if (nproc > 1) // New value was already computed above
            team->t.t_bar[bt].b_arrived = new_state;
        else
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

static void
__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           int propagate_icvs
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_tree_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;

    // Perform a tree release for all of the threads that have been gathered
    if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    } else {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    nproc = this_thr->th.th_team_nproc;
    child_tid = (tid << branch_bits) + 1;

    if (child_tid < nproc) {
        register kmp_info_t **other_threads = team->t.t_threads;
        child = 1;
        // Parent threads release all their children
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's go count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
            {
                KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
                if (propagate_icvs) {
                    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
                                             team, child_tid, FALSE);
                    copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                              &team->t.t_implicit_task_taskdata[0].td_icvs);
                }
            }
#endif // KMP_BARRIER_ICV_PUSH
            KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                          "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                          child_tid, &child_bar->b_go, child_bar->b_go,
                          child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child from barrier
            kmp_flag_64 flag(&child_bar->b_go, child_thr);
            flag.release();
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}


// Hyper Barrier
static void
__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           void (*reduce)(void *, void *)
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
    register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    /* Perform a hypercube-embedded tree gather to wait until all of the threads have
       arrived, and reduce any required data as we go. */
    kmp_flag_64 p_flag(&thr_bar->b_arrived);
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
    {
        register kmp_uint32 child;
        register kmp_uint32 child_tid;

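        // At each level, a thread whose tid has a nonzero digit in the current
        // branch_bits-wide position acts as a child: it signals its parent (its tid
        // with the low level+branch_bits bits cleared) and leaves the loop. Threads
        // with a zero digit stay on as parents and collect their children below.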
        if (((tid >> level) & (branch_factor - 1)) != 0) {
            register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);

            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                          "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                          &thr_bar->b_arrived, thr_bar->b_arrived,
                          thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
            // Mark arrival to parent thread
            /* After performing this write (in the last iteration of the enclosing for loop),
               a worker thread may not assume that the team is valid any more - it could be
               deallocated by the master thread at any time. */
            p_flag.set_waiter(other_threads[parent_tid]);
            p_flag.release();
            break;
        }

        // Parent threads wait for children to arrive
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1 << level))
        {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            register kmp_uint32 next_child_tid = child_tid + (1 << level);
            // Prefetch next thread's arrived count
            if (child+1 < branch_factor && next_child_tid < num_threads)
                KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
            c_flag.wait(this_thr, FALSE
                        USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                ANNOTATE_REDUCE_AFTER(reduce);
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                ANNOTATE_REDUCE_BEFORE(reduce);
                ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
            }
        }
    }

    if (KMP_MASTER_TID(tid)) {
        // Need to update the team arrived pointer if we are the master thread
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        else
            team->t.t_bar[bt].b_arrived = new_state;
        KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// The reverse versions seem to beat the forward versions overall
#define KMP_REVERSE_HYPER_BAR
static void
__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            int propagate_icvs
                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads;
    register kmp_uint32 num_threads;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
       If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
       order of the corresponding gather, otherwise threads are released in the same order. */
    if (KMP_MASTER_TID(tid)) { // master
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) { // master already has ICVs in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
#endif
    }
    else { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }
    num_threads = this_thr->th.th_team_nproc;
    other_threads = team->t.t_threads;

#ifdef KMP_REVERSE_HYPER_BAR
    // Count up to correct level for parent
    for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
         level+=branch_bits, offset<<=branch_bits);
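    /* After this loop, "level" is the lowest level at which tid has a nonzero digit,
       i.e. the level at which its parent signalled it (or past the top for tid 0);
       the loop below walks back down from level-branch_bits, releasing this thread's
       own children from the highest level to the leaves (reverse of the gather order). */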

    // Now go down from there
    for (level-=branch_bits, offset>>=branch_bits; offset != 0;
         level-=branch_bits, offset>>=branch_bits)
#else
    // Go down the tree, level by level
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
    {
#ifdef KMP_REVERSE_HYPER_BAR
        /* Now go in reverse order through the children, highest to lowest.
           Initial setting of child is conservative here. */
        child = num_threads >> ((level==0)?level:level-1);
        for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
             child>=1; child--, child_tid-=(1<<level))
#else
        if (((tid >> level) & (branch_factor - 1)) != 0)
            // No need to go lower than this, since this is the level parent would be notified
            break;
        // Iterate through children on this level of the tree
        for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1<<level))
#endif // KMP_REVERSE_HYPER_BAR
        {
            if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
            else {
                register kmp_info_t *child_thr = other_threads[child_tid];
                register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
                register kmp_uint32 next_child_tid = child_tid - (1 << level);
                // Prefetch next thread's go count
# ifdef KMP_REVERSE_HYPER_BAR
                if (child-1 >= 1 && next_child_tid < num_threads)
# else
                if (child+1 < branch_factor && next_child_tid < num_threads)
# endif // KMP_REVERSE_HYPER_BAR
                    KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
                if (propagate_icvs) // push my fixed ICVs to my child
                    copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

                KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                              child_tid, &child_bar->b_go, child_bar->b_go,
                              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                // Release child from barrier
                kmp_flag_64 flag(&child_bar->b_go, child_thr);
                flag.release();
            }
        }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
    }
#endif
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// Hierarchical Barrier

// Initialize thread barrier data
/* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
   minimum amount of initialization required based on how the team has changed. Returns true if
   leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
   team size increases, threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on their local b_go. */
static bool
__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
                                       int gtid, int tid, kmp_team_t *team)
{
    // Checks to determine if (re-)initialization is needed
    bool uninitialized = thr_bar->team == NULL;
    bool team_changed = team != thr_bar->team;
    bool team_sz_changed = nproc != thr_bar->nproc;
    bool tid_changed = tid != thr_bar->old_tid;
    bool retval = false;

    if (uninitialized || team_sz_changed) {
        __kmp_get_hierarchy(nproc, thr_bar);
    }

    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->my_level = thr_bar->depth-1; // default for master
        thr_bar->parent_tid = -1; // default for master
        if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
            kmp_uint32 d=0;
            while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
                kmp_uint32 rem;
                if (d == thr_bar->depth-2) { // reached level right below the master
                    thr_bar->parent_tid = 0;
                    thr_bar->my_level = d;
                    break;
                }
                else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
                    // thread is not a subtree root at next level, so this is max
                    thr_bar->parent_tid = tid - rem;
                    thr_bar->my_level = d;
                    break;
                }
                ++d;
            }
        }
        thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
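        /* Each child of this thread's parent owns one byte of the parent's 64-bit flag
           word; offset selects that byte, counting down from 7 for the first child
           (this matches the leaf_state bytes set below and the kmp_flag_oncore offset). */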
        thr_bar->old_tid = tid;
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
        thr_bar->team = team;
        thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
    }
    if (uninitialized || team_changed || tid_changed) {
        thr_bar->team = team;
        thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
        retval = true;
    }
    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->nproc = nproc;
        thr_bar->leaf_kids = thr_bar->base_leaf_kids;
        if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
        if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
            thr_bar->leaf_kids = nproc - tid - 1;
        thr_bar->leaf_state = 0;
        for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
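        // e.g. with 3 leaf kids on a little-endian target, bytes 7, 6 and 5 are set,
        // so leaf_state == 0x0101010000000000: one nonzero byte per leaf child.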
    }
    return retval;
}

static void
__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
                                  int gtid, int tid, void (*reduce) (void *, void *)
                                  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_hier_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint64 new_state;

    int level = team->t.t_level;
#if OMP_40_ENABLED
    if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
        if (this_thr->th.th_teams_size.nteams > 1)
            ++level; // level was not increased in teams construct for team_of_masters
#endif
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
    }
#endif

    (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);

    if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
        register kmp_int32 child_tid;
        new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
                kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
                KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting for leaf kids\n",
                              gtid, team->t.t_id, tid));
                kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
                flag.wait(this_thr, FALSE
                          USE_ITT_BUILD_ARG(itt_sync_obj) );
                if (reduce) {
                    ANNOTATE_REDUCE_AFTER(reduce);
                    for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
                    }
                    ANNOTATE_REDUCE_BEFORE(reduce);
                    ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
                }
                (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
            }
            // Next, wait for higher level children on each child's b_arrived flag
            for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %llu\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        ANNOTATE_REDUCE_AFTER(reduce);
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                        ANNOTATE_REDUCE_BEFORE(reduce);
                        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
                    }
                }
            }
        }
        else { // Blocktime is not infinite
            for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %llu\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        ANNOTATE_REDUCE_AFTER(reduce);
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                        ANNOTATE_REDUCE_BEFORE(reduce);
                        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
                    }
                }
            }
        }
    }
    // All subordinates are gathered; now release parent if not master thread

    if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
        /* Mark arrival to parent: After performing this write, a worker thread may not assume that
           the team is valid any more - it could be deallocated by the master thread at any time. */
        if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
            kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
            flag.release();
        }
        else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
            thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
            flag.set_waiter(other_threads[thr_bar->parent_tid]);
            flag.release();
        }
    } else { // Master thread needs to update the team's b_arrived value
        team->t.t_bar[bt].b_arrived = new_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    // Is the team access below unsafe or just technically invalid?
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

static void
__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                                   int propagate_icvs
                                   USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_BLOCK(KMP_hier_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    bool team_change = false; // indicates on-core barrier shouldn't be used

    if (KMP_MASTER_TID(tid)) {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    else { // Worker threads
        // Wait for parent thread to release me
        if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || thr_bar->my_level != 0 || thr_bar->team == NULL) {
            // Use traditional method of waiting on my own b_go flag
            thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
            kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
            flag.wait(this_thr, TRUE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
        }
        else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
            // Wait on my "offset" bits on parent's b_go flag
            thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
                                 bt, this_thr
                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
            flag.wait(this_thr, TRUE);
            if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
                TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            }
            else { // Reset my bits on parent's b_go flag
                ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
            }
        }
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;
        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }

    nproc = this_thr->th.th_team_nproc;
    int level = team->t.t_level;
#if OMP_40_ENABLED
    if (team->t.t_threads[0]->th.th_teams_microtask ) { // are we inside the teams construct?
        if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
            ++level; // level was not increased in teams construct for team_of_workers
        if( this_thr->th.th_teams_size.nteams > 1 )
            ++level; // level was not increased in teams construct for team_of_masters
    }
#endif
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

    // If the team size has increased, we still communicate with old leaves via oncore barrier.
    unsigned short int old_leaf_kids = thr_bar->leaf_kids;
    kmp_uint64 old_leaf_state = thr_bar->leaf_state;
    team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
    // But if the entire team changes, we won't use oncore barrier at all
    if (team_change) old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) {
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
        else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
            if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
                // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
            // non-leaves will get ICVs piggybacked with b_go via NGO store
        }
        else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
            if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
                copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
            else // leaves copy parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
        }
    }
#endif // KMP_BARRIER_ICV_PUSH

    // Now, release my children
    if (thr_bar->my_level) { // not a leaf
        register kmp_int32 child_tid;
        kmp_uint32 last;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (KMP_MASTER_TID(tid)) { // do a flat release
                // Set local b_go to bump children via NGO store of the cache line containing ICVs and b_go.
                thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
                // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
                ngo_load(&thr_bar->th_fixed_icvs);
                // This loops over all the threads skipping only the leaf nodes in the hierarchy
                for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
                    register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Use ngo store (if available) to both store ICVs and release child via child's b_go
                    ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
                }
                ngo_sync();
            }
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            // Now, release leaf children
            if (thr_bar->leaf_kids) { // if there are any
                // We test team_change on the off-chance that the level 1 team changed.
                if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
                    if (old_leaf_kids) { // release old leaf kids
                        thr_bar->b_go |= old_leaf_state;
                    }
                    // Release new leaf kids
                    last = tid+thr_bar->skip_per_level[1];
                    if (last > nproc) last = nproc;
                    for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
                        register kmp_info_t *child_thr = team->t.t_threads[child_tid];
                        register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                                      " T#%d(%d:%d) go(%p): %u => %u\n",
                                      gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                      team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                      child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                        // Release child using child's b_go flag
                        kmp_flag_64 flag(&child_bar->b_go, child_thr);
                        flag.release();
                    }
                }
                else { // Release all children at once with leaf_state bits on my own b_go flag
                    thr_bar->b_go |= thr_bar->leaf_state;
                }
            }
        }
        else { // Blocktime is not infinite; do a simple hierarchical release
            for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
                last = tid+thr_bar->skip_per_level[d+1];
                kmp_uint32 skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = team->t.t_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Release child using child's b_go flag
                    kmp_flag_64 flag(&child_bar->b_go, child_thr);
                    flag.release();
                }
            }
        }
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
            copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
    }
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}
1059
1060// ---------------------------- End of Barrier Algorithms ----------------------------
1061
1062// Internal function to do a barrier.
1063/* If is_split is true, do a split barrier, otherwise, do a plain barrier
1064 If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier
1065 Returns 0 if master thread, 1 if worker thread. */
1066int
1067__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1068 void *reduce_data, void (*reduce)(void *, void *))
1069{
Jonathan Peyton45be4502015-08-11 21:36:41 +00001070 KMP_TIME_DEVELOPER_BLOCK(KMP_barrier);
Jonathan Peyton11dc82f2016-05-05 16:15:57 +00001071 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
1072 KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001073 register int tid = __kmp_tid_from_gtid(gtid);
1074 register kmp_info_t *this_thr = __kmp_threads[gtid];
1075 register kmp_team_t *team = this_thr->th.th_team;
1076 register int status = 0;
1077 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001078#if OMPT_SUPPORT
1079 ompt_task_id_t my_task_id;
1080 ompt_parallel_id_t my_parallel_id;
1081#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001082
1083 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1084 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1085
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001086 ANNOTATE_NEW_BARRIER_BEGIN(&team->t.t_bar);
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001087#if OMPT_SUPPORT
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001088 if (ompt_enabled) {
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001089#if OMPT_BLAME
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001090 my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
1091 my_parallel_id = team->t.ompt_team_info.parallel_id;
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001092
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001093#if OMPT_TRACE
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001094 if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
1095 if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
1096 ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001097 my_parallel_id, my_task_id);
1098 }
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001099 }
1100#endif
1101 if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1102 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1103 my_parallel_id, my_task_id);
1104 }
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001105#endif
1106 // It is OK to report the barrier state after the barrier begin callback.
1107 // According to the OMPT specification, a compliant implementation may
1108 // even delay reporting this state until the barrier begins to wait.
1109 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001110 }
1111#endif
1112
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001113 if (! team->t.t_serialized) {
1114#if USE_ITT_BUILD
1115 // This value will be used in itt notify events below.
1116 void *itt_sync_obj = NULL;
1117# if USE_ITT_NOTIFY
1118 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1119 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1120# endif
1121#endif /* USE_ITT_BUILD */
1122 if (__kmp_tasking_mode == tskm_extra_barrier) {
1123 __kmp_tasking_barrier(team, this_thr, gtid);
1124 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1125 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1126 }
1127
1128 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1129 the team struct is not guaranteed to exist. */
1130 // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1131 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
Jonathan Peytone1c7c132016-10-07 18:12:19 +00001132#if KMP_USE_MONITOR
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001133 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
Jonathan Peytone1c7c132016-10-07 18:12:19 +00001134#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001135 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1136 }
1137
1138#if USE_ITT_BUILD
1139 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1140 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1141#endif /* USE_ITT_BUILD */
Jonathan Peyton8fbb49a2015-07-09 18:16:58 +00001142#if USE_DEBUGGER
1143 // Let the debugger know: the thread has arrived at the barrier and is waiting.
1144 if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1145 team->t.t_bar[bt].b_master_arrived += 1;
1146 } else {
1147 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1148 } // if
1149#endif /* USE_DEBUGGER */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001150 if (reduce != NULL) {
1151 //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1152 this_thr->th.th_local.reduce_data = reduce_data;
1153 }
Jonathan Peytonb0b83c82015-11-09 16:28:32 +00001154
1155 if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1156 __kmp_task_team_setup(this_thr, team, 0); // pass 0 to set up only the current task team, and only if nthreads > 1
1157
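 /* Gather phase: the arrival algorithm (hyper-cube, hierarchical, tree, or
    plain linear) is selected per barrier type through
    __kmp_barrier_gather_pattern[]; the linear algorithm is the default/fallback
    case below. */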
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001158 switch (__kmp_barrier_gather_pattern[bt]) {
1159 case bp_hyper_bar: {
1160 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1161 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1162 USE_ITT_BUILD_ARG(itt_sync_obj) );
1163 break;
1164 }
1165 case bp_hierarchical_bar: {
1166 __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1167 USE_ITT_BUILD_ARG(itt_sync_obj));
1168 break;
1169 }
1170 case bp_tree_bar: {
1171 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1172 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1173 USE_ITT_BUILD_ARG(itt_sync_obj) );
1174 break;
1175 }
1176 default: {
1177 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1178 USE_ITT_BUILD_ARG(itt_sync_obj) );
1179 }
1180 }
1181
1182 KMP_MB();
1183
1184 if (KMP_MASTER_TID(tid)) {
1185 status = 0;
1186 if (__kmp_tasking_mode != tskm_immediate_exec) {
1187 __kmp_task_team_wait(this_thr, team
1188 USE_ITT_BUILD_ARG(itt_sync_obj) );
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001189 }
Jonathan Peyton8fbb49a2015-07-09 18:16:58 +00001190#if USE_DEBUGGER
1191 // Let the debugger know: all threads have arrived and are starting to leave the barrier.
1192 team->t.t_bar[bt].b_team_arrived += 1;
1193#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001194
1195#if USE_ITT_BUILD
1196 /* TODO: In case of split reduction barrier, master thread may send acquired event early,
1197 before the final summation into the shared variable is done (final summation can be a
1198 long operation for array reductions). */
1199 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1200 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1201#endif /* USE_ITT_BUILD */
1202#if USE_ITT_BUILD && USE_ITT_NOTIFY
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001203 // Barrier - report frame end (only if active_level == 1)
1204 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1205#if OMP_40_ENABLED
1206 this_thr->th.th_teams_microtask == NULL &&
1207#endif
1208 team->t.t_active_level == 1)
1209 {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001210 kmp_uint64 cur_time = __itt_get_timestamp();
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001211 kmp_info_t **other_threads = team->t.t_threads;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001212 int nproc = this_thr->th.th_team_nproc;
1213 int i;
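 /* Frame reporting modes (descriptive): mode 1 submits the frame spanning from
    the previous frame point up to this barrier, mode 2 submits the barrier
    wait interval (from the earliest arrival), and mode 3 additionally records
    per-thread arrival imbalance as ITT metadata before submitting the frame. */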
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001214 switch(__kmp_forkjoin_frames_mode) {
1215 case 1:
1216 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1217 this_thr->th.th_frame_time = cur_time;
1218 break;
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001219 case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001220 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1221 break;
1222 case 3:
1223 if( __itt_metadata_add_ptr ) {
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001224 // Initialize with master's wait time
1225 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
Jonathan Peyton99ef4d02016-04-14 16:06:49 +00001226 // Set arrive time to zero to be able to check it in __kmp_invoke_task(); the same is done inside the loop below
1227 this_thr->th.th_bar_arrive_time = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001228 for (i=1; i<nproc; ++i) {
1229 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
Jonathan Peyton99ef4d02016-04-14 16:06:49 +00001230 other_threads[i]->th.th_bar_arrive_time = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001231 }
1232 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1233 }
1234 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1235 this_thr->th.th_frame_time = cur_time;
1236 break;
1237 }
1238 }
1239#endif /* USE_ITT_BUILD */
1240 } else {
1241 status = 1;
1242#if USE_ITT_BUILD
1243 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1244 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1245#endif /* USE_ITT_BUILD */
1246 }
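 /* Release phase: workers (status == 1) always go through the release
    algorithm and wait there for the go signal; the master does so only for a
    non-split barrier, since for a split barrier its release is deferred to
    __kmp_end_split_barrier(). */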
1247 if (status == 1 || ! is_split) {
1248 switch (__kmp_barrier_release_pattern[bt]) {
1249 case bp_hyper_bar: {
1250 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1251 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1252 USE_ITT_BUILD_ARG(itt_sync_obj) );
1253 break;
1254 }
1255 case bp_hierarchical_bar: {
1256 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1257 USE_ITT_BUILD_ARG(itt_sync_obj) );
1258 break;
1259 }
1260 case bp_tree_bar: {
1261 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1262 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1263 USE_ITT_BUILD_ARG(itt_sync_obj) );
1264 break;
1265 }
1266 default: {
1267 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1268 USE_ITT_BUILD_ARG(itt_sync_obj) );
1269 }
1270 }
1271 if (__kmp_tasking_mode != tskm_immediate_exec) {
1272 __kmp_task_team_sync(this_thr, team);
1273 }
1274 }
1275
1276#if USE_ITT_BUILD
1277 /* GEH: TODO: Move this under if-condition above and also include in
1278 __kmp_end_split_barrier(). This will more accurately represent the actual release time
1279 of the threads for split barriers. */
1280 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1281 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1282#endif /* USE_ITT_BUILD */
1283 } else { // Team is serialized.
1284 status = 0;
1285 if (__kmp_tasking_mode != tskm_immediate_exec) {
Jonathan Peytondf6818b2016-06-14 17:57:47 +00001286#if OMP_45_ENABLED
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001287 if ( this_thr->th.th_task_team != NULL ) {
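 /* With OMP 4.5 proxy tasks, even a serialized team may still own a task team
    (the assert below checks tt_found_proxy_tasks); wait for those tasks to
    complete, then re-arm the task team for the next region. */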
1288 void *itt_sync_obj = NULL;
1289#if USE_ITT_NOTIFY
1290 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1291 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1292 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1293 }
1294#endif
1295
Jonathan Peytonfe9a1d72015-08-26 19:58:48 +00001296 KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE);
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001297 __kmp_task_team_wait(this_thr, team
1298 USE_ITT_BUILD_ARG(itt_sync_obj));
Jonathan Peyton54127982015-11-04 21:37:48 +00001299 __kmp_task_team_setup(this_thr, team, 0);
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001300
1301#if USE_ITT_BUILD
1302 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1303 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1304#endif /* USE_ITT_BUILD */
1305 }
1306#else
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001307 // The task team should be NULL for serialized code (tasks will be executed immediately)
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001308 KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001309 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001310#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001311 }
1312 }
1313 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1314 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001315
1316#if OMPT_SUPPORT
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001317 if (ompt_enabled) {
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001318#if OMPT_BLAME
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001319 if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001320 ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1321 my_parallel_id, my_task_id);
1322 }
1323#endif
1324 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1325 }
1326#endif
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001327 ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001328
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001329 return status;
1330}
1331
1332
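/* Second half of a split barrier: only the master does real work here, issuing
   the deferred release so that workers still waiting in the release algorithm
   (entered from __kmp_barrier() with is_split=TRUE) can proceed; other callers
   effectively fall through. */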
1333void
1334__kmp_end_split_barrier(enum barrier_type bt, int gtid)
1335{
Jonathan Peyton45be4502015-08-11 21:36:41 +00001336 KMP_TIME_DEVELOPER_BLOCK(KMP_end_split_barrier);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001337 int tid = __kmp_tid_from_gtid(gtid);
1338 kmp_info_t *this_thr = __kmp_threads[gtid];
1339 kmp_team_t *team = this_thr->th.th_team;
1340
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001341 ANNOTATE_NEW_BARRIER_BEGIN(&team->t.t_bar);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001342 if (!team->t.t_serialized) {
1343 if (KMP_MASTER_GTID(gtid)) {
1344 switch (__kmp_barrier_release_pattern[bt]) {
1345 case bp_hyper_bar: {
1346 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1347 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1348 USE_ITT_BUILD_ARG(NULL) );
1349 break;
1350 }
1351 case bp_hierarchical_bar: {
1352 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1353 USE_ITT_BUILD_ARG(NULL));
1354 break;
1355 }
1356 case bp_tree_bar: {
1357 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1358 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1359 USE_ITT_BUILD_ARG(NULL) );
1360 break;
1361 }
1362 default: {
1363 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1364 USE_ITT_BUILD_ARG(NULL) );
1365 }
1366 }
1367 if (__kmp_tasking_mode != tskm_immediate_exec) {
1368 __kmp_task_team_sync(this_thr, team);
1369 } // if
1370 }
1371 }
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001372 ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001373}
1374
1375
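/* Join barrier, executed at the end of a parallel region: gather-only. Workers
   merely signal arrival and later wait in the fork barrier for their next
   release; the master collects all arrivals, drains outstanding tasks, and may
   then safely dismantle the team. */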
1376void
1377__kmp_join_barrier(int gtid)
1378{
Jonathan Peyton11dc82f2016-05-05 16:15:57 +00001379 KMP_TIME_PARTITIONED_BLOCK(OMP_fork_join_barrier);
1380 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
Jonathan Peyton45be4502015-08-11 21:36:41 +00001381 KMP_TIME_DEVELOPER_BLOCK(KMP_join_barrier);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001382 register kmp_info_t *this_thr = __kmp_threads[gtid];
1383 register kmp_team_t *team;
1384 register kmp_uint nproc;
1385 kmp_info_t *master_thread;
1386 int tid;
1387#ifdef KMP_DEBUG
1388 int team_id;
1389#endif /* KMP_DEBUG */
1390#if USE_ITT_BUILD
1391 void *itt_sync_obj = NULL;
1392# if USE_ITT_NOTIFY
1393 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1394 // Get object created at fork_barrier
1395 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1396# endif
1397#endif /* USE_ITT_BUILD */
1398 KMP_MB();
1399
1400 // Get current info
1401 team = this_thr->th.th_team;
1402 nproc = this_thr->th.th_team_nproc;
1403 KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1404 tid = __kmp_tid_from_gtid(gtid);
1405#ifdef KMP_DEBUG
1406 team_id = team->t.t_id;
1407#endif /* KMP_DEBUG */
1408 master_thread = this_thr->th.th_team_master;
1409#ifdef KMP_DEBUG
1410 if (master_thread != team->t.t_threads[0]) {
1411 __kmp_print_structure();
1412 }
1413#endif /* KMP_DEBUG */
1414 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1415 KMP_MB();
1416
1417 // Verify state
1418 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1419 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1420 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1421 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1422 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1423
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001424 ANNOTATE_NEW_BARRIER_BEGIN(&team->t.t_bar);
Jonathan Peyton61118492016-05-20 19:03:38 +00001425#if OMPT_SUPPORT
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001426#if OMPT_TRACE
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001427 if (ompt_enabled &&
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001428 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1429 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1430 team->t.ompt_team_info.parallel_id,
1431 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1432 }
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001433#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001434 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1435#endif
1436
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001437 if (__kmp_tasking_mode == tskm_extra_barrier) {
1438 __kmp_tasking_barrier(team, this_thr, gtid);
1439 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
1440 }
1441# ifdef KMP_DEBUG
1442 if (__kmp_tasking_mode != tskm_immediate_exec) {
1443 KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001444 __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001445 this_thr->th.th_task_team));
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001446 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001447 }
1448# endif /* KMP_DEBUG */
1449
1450 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
1451 team struct is not guaranteed to exist. Doing these loads causes a cache miss that slows
1452 down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1453 since the values are not used by __kmp_wait_template() in that case. */
1454 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
Jonathan Peytone1c7c132016-10-07 18:12:19 +00001455#if KMP_USE_MONITOR
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001456 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
Jonathan Peytone1c7c132016-10-07 18:12:19 +00001457#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001458 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1459 }
1460
1461#if USE_ITT_BUILD
1462 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1463 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1464#endif /* USE_ITT_BUILD */
1465
1466 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1467 case bp_hyper_bar: {
1468 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1469 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1470 USE_ITT_BUILD_ARG(itt_sync_obj) );
1471 break;
1472 }
1473 case bp_hierarchical_bar: {
1474 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1475 USE_ITT_BUILD_ARG(itt_sync_obj) );
1476 break;
1477 }
1478 case bp_tree_bar: {
1479 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1480 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1481 USE_ITT_BUILD_ARG(itt_sync_obj) );
1482 break;
1483 }
1484 default: {
1485 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1486 USE_ITT_BUILD_ARG(itt_sync_obj) );
1487 }
1488 }
1489
1490 /* From this point on, the team data structure may be deallocated at any time by the
1491 master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1492 data items that need to be referenced before the end of the barrier should be moved to
1493 the kmp_task_team_t structs. */
1494 if (KMP_MASTER_TID(tid)) {
1495 if (__kmp_tasking_mode != tskm_immediate_exec) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001496 __kmp_task_team_wait(this_thr, team
1497 USE_ITT_BUILD_ARG(itt_sync_obj) );
1498 }
Jonathan Peyton11dc82f2016-05-05 16:15:57 +00001499#if KMP_STATS_ENABLED
1500 // Have master thread flag the workers to indicate they are now waiting for
1501 // the next parallel region; also wake them up so they switch their timers to idle.
1502 for (int i=0; i<team->t.t_nproc; ++i) {
1503 kmp_info_t* team_thread = team->t.t_threads[i];
1504 if (team_thread == this_thr)
1505 continue;
1506 team_thread->th.th_stats->setIdleFlag();
1507 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && team_thread->th.th_sleep_loc != NULL)
1508 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread), team_thread->th.th_sleep_loc);
1509 }
1510#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001511#if USE_ITT_BUILD
1512 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1513 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1514#endif /* USE_ITT_BUILD */
1515
1516# if USE_ITT_BUILD && USE_ITT_NOTIFY
1517 // Join barrier - report frame end
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001518 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1519#if OMP_40_ENABLED
1520 this_thr->th.th_teams_microtask == NULL &&
1521#endif
1522 team->t.t_active_level == 1)
1523 {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001524 kmp_uint64 cur_time = __itt_get_timestamp();
1525 ident_t * loc = team->t.t_ident;
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001526 kmp_info_t **other_threads = team->t.t_threads;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001527 int nproc = this_thr->th.th_team_nproc;
1528 int i;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001529 switch(__kmp_forkjoin_frames_mode) {
1530 case 1:
1531 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1532 break;
1533 case 2:
1534 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1535 break;
1536 case 3:
1537 if( __itt_metadata_add_ptr ) {
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001538 // Initialize with master's wait time
1539 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
Jonathan Peyton99ef4d02016-04-14 16:06:49 +00001540 // Set arrive time to zero to be able to check it in __kmp_invoke_task(); the same is done inside the loop below
1541 this_thr->th.th_bar_arrive_time = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001542 for (i=1; i<nproc; ++i) {
1543 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
Jonathan Peyton99ef4d02016-04-14 16:06:49 +00001544 other_threads[i]->th.th_bar_arrive_time = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001545 }
1546 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1547 }
1548 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1549 this_thr->th.th_frame_time = cur_time;
1550 break;
1551 }
1552 }
1553# endif /* USE_ITT_BUILD */
1554 }
1555#if USE_ITT_BUILD
1556 else {
1557 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1558 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1559 }
1560#endif /* USE_ITT_BUILD */
1561
1562#if KMP_DEBUG
1563 if (KMP_MASTER_TID(tid)) {
1564 KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1565 gtid, team_id, tid, nproc));
1566 }
1567#endif /* KMP_DEBUG */
1568
1569 // TODO now, mark worker threads as done so they may be disbanded
1570 KMP_MB(); // Flush all pending memory write invalidates.
1571 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001572
1573#if OMPT_SUPPORT
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001574 if (ompt_enabled) {
Jonathan Peytoncab67cc2015-09-18 16:24:46 +00001575#if OMPT_BLAME
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001576 if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001577 ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1578 team->t.ompt_team_info.parallel_id,
1579 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001580 }
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001581#endif
1582
1583 // return to default state
1584 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1585 }
1586#endif
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001587 ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001588}
1589
1590
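/* Fork barrier, executed at the start of a parallel region (and at worker
   thread startup): release-only. The master prepares the task team and
   blocktime settings, then all threads run the release algorithm; on the way
   out the workers pick up the new team pointer, task team, and (with ICV_PULL)
   the master's ICVs. */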
1591// TODO release worker threads' fork barriers as we are ready instead of all at once
1592void
1593__kmp_fork_barrier(int gtid, int tid)
1594{
Jonathan Peyton11dc82f2016-05-05 16:15:57 +00001595 KMP_TIME_PARTITIONED_BLOCK(OMP_fork_join_barrier);
1596 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
Jonathan Peyton45be4502015-08-11 21:36:41 +00001597 KMP_TIME_DEVELOPER_BLOCK(KMP_fork_barrier);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001598 kmp_info_t *this_thr = __kmp_threads[gtid];
1599 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1600#if USE_ITT_BUILD
1601 void * itt_sync_obj = NULL;
1602#endif /* USE_ITT_BUILD */
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001603 if (team)
1604 ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001605
1606 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1607 gtid, (team != NULL) ? team->t.t_id : -1, tid));
1608
1609 // th_team pointer only valid for master thread here
1610 if (KMP_MASTER_TID(tid)) {
1611#if USE_ITT_BUILD && USE_ITT_NOTIFY
1612 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1613 // Create itt barrier object
1614 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1615 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1616 }
1617#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1618
1619#ifdef KMP_DEBUG
1620 register kmp_info_t **other_threads = team->t.t_threads;
1621 register int i;
1622
1623 // Verify state
1624 KMP_MB();
1625
1626 for(i=1; i<team->t.t_nproc; ++i) {
1627 KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1628 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1629 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1630 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1631 KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1632 & ~(KMP_BARRIER_SLEEP_STATE))
1633 == KMP_INIT_BARRIER_STATE);
1634 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1635 }
1636#endif
1637
1638 if (__kmp_tasking_mode != tskm_immediate_exec) {
Jonathan Peyton54127982015-11-04 21:37:48 +00001639 __kmp_task_team_setup(this_thr, team, 0); // 0 indicates setup current task team if nthreads > 1
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001640 }
1641
1642 /* The master thread may have changed its blocktime between the join barrier and the
1643 fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1644 access it when the team struct is not guaranteed to exist. */
1645 // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1646 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
Jonathan Peytone1c7c132016-10-07 18:12:19 +00001647#if KMP_USE_MONITOR
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001648 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
Jonathan Peytone1c7c132016-10-07 18:12:19 +00001649#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001650 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1651 }
1652 } // master
1653
1654 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1655 case bp_hyper_bar: {
1656 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1657 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1658 USE_ITT_BUILD_ARG(itt_sync_obj) );
1659 break;
1660 }
1661 case bp_hierarchical_bar: {
1662 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1663 USE_ITT_BUILD_ARG(itt_sync_obj) );
1664 break;
1665 }
1666 case bp_tree_bar: {
1667 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1668 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1669 USE_ITT_BUILD_ARG(itt_sync_obj) );
1670 break;
1671 }
1672 default: {
1673 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1674 USE_ITT_BUILD_ARG(itt_sync_obj) );
1675 }
1676 }
1677
1678 // Early exit for reaping threads releasing forkjoin barrier
1679 if (TCR_4(__kmp_global.g.g_done)) {
Jonathan Peyton54127982015-11-04 21:37:48 +00001680 this_thr->th.th_task_team = NULL;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001681
1682#if USE_ITT_BUILD && USE_ITT_NOTIFY
1683 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1684 if (!KMP_MASTER_TID(tid)) {
1685 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1686 if (itt_sync_obj)
1687 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1688 }
1689 }
1690#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1691 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1692 return;
1693 }
1694
1695 /* We can now assume that a valid team structure has been allocated by the master and
1696 propagated to all worker threads. The current thread, however, may not be part of the
1697 team, so we can't blindly assume that the team pointer is non-null. */
1698 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1699 KMP_DEBUG_ASSERT(team != NULL);
1700 tid = __kmp_tid_from_gtid(gtid);
1701
1702
1703#if KMP_BARRIER_ICV_PULL
1704 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1705 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1706 this data before this function is called. We cannot modify __kmp_fork_call() to look at
1707 the fixed ICVs in the master's thread struct, because it is not always the case that the
1708 threads arrays have been allocated when __kmp_fork_call() is executed. */
Jonathan Peyton45be4502015-08-11 21:36:41 +00001709 {
1710 KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
1711 if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1712 // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1713 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1714 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1715 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1716 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1717 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001718 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001719#endif // KMP_BARRIER_ICV_PULL
1720
1721 if (__kmp_tasking_mode != tskm_immediate_exec) {
1722 __kmp_task_team_sync(this_thr, team);
1723 }
1724
1725#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1726 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1727 if (proc_bind == proc_bind_intel) {
1728#endif
Andrey Churbanovf28f6132015-01-13 14:54:00 +00001729#if KMP_AFFINITY_SUPPORTED
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001730 // Call dynamic affinity settings
1731 if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1732 __kmp_balanced_affinity(tid, team->t.t_nproc);
1733 }
Andrey Churbanovf28f6132015-01-13 14:54:00 +00001734#endif // KMP_AFFINITY_SUPPORTED
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001735#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1736 }
Andrey Churbanov94e569e2015-03-10 09:19:47 +00001737 else if (proc_bind != proc_bind_false) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001738 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1739 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1740 __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1741 }
1742 else {
1743 __kmp_affinity_set_place(gtid);
1744 }
1745 }
1746#endif
1747
1748#if USE_ITT_BUILD && USE_ITT_NOTIFY
1749 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1750 if (!KMP_MASTER_TID(tid)) {
1751 // Get correct barrier object
1752 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1753 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1754 } // (prepare called inside barrier_release)
1755 }
1756#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001757 ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001758 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1759}
1760
1761
1762void
1763__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1764{
Jonathan Peyton45be4502015-08-11 21:36:41 +00001765 KMP_TIME_DEVELOPER_BLOCK(KMP_setup_icv_copy);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001766
1767 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1768 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1769
1770 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1771 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1772 this data before this function is called. */
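/* Three mutually exclusive propagation strategies are compiled in below:
   ICV_PULL parks a single master copy in th_fixed_icvs and lets each worker
   copy it during the fork barrier; ICV_PUSH pushes the values down the fork
   barrier's release tree; otherwise the default path copies the ICVs linearly
   into every worker's implicit task right here. */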
1773#if KMP_BARRIER_ICV_PULL
1774 /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains untouched), where
1775 all of the worker threads can access them and make their own copies after the barrier. */
1776 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1777 copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1778 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1779 0, team->t.t_threads[0], team));
1780#elif KMP_BARRIER_ICV_PUSH
1781 // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1782 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1783 0, team->t.t_threads[0], team));
1784#else
1785 // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
1786 ngo_load(new_icvs);
1787 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
Jonathan Peyton91b78702015-06-08 19:39:07 +00001788 for (int f=1; f<new_nproc; ++f) { // Skip the master thread
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001789 // TODO: GEH - pass in better source location info since usually NULL here
1790 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1791 f, team->t.t_threads[f], team));
1792 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1793 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1794 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1795 f, team->t.t_threads[f], team));
1796 }
1797 ngo_sync();
1798#endif // KMP_BARRIER_ICV_PULL
1799}