/*
 * kmp_barrier.cpp
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


#include "kmp.h"
#include "kmp_wait_release.h"
#include "kmp_stats.h"
#include "kmp_itt.h"
#include "kmp_os.h"


#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#include "tsan_annotations.h"

#if KMP_MIC && USE_NGO_STORES
// ICV copying
#define ngo_load(src)            __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src)   _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync()               __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src)            ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src)   KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync()               ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */
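
// Note: on KMP_MIC builds the ngo_* macros above use _mm512_storenrngo_pd, a
// non-globally-ordered streaming store of a full cache line, which is why ngo_sync()
// issues a locked instruction to act as a store fence before other threads are allowed
// to observe the stored line. On all other builds they fall back to plain copies and a
// no-op sync.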

void __kmp_print_structure(void); // Forward declaration

// ---------------------------- Barrier Algorithms ----------------------------

// Linear Barrier
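// Overview: in the linear gather each worker releases its own b_arrived flag and the
// master waits on every worker's flag in turn, so arrival costs O(nproc) serialized
// waits on the master; the linear release mirrors this, with workers spinning on their
// private b_go flag until the master bumps each one. For example, with nproc == 4 the
// master waits on threads 1, 2 and 3 in order and later releases them in the same order.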
static void
__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            void (*reduce)(void *, void *)
                            USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // We now perform a linear reduction to signal that all of the threads have arrived.
    if (!KMP_MASTER_TID(tid)) {
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
                      thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
        // Mark arrival to master thread
        /* After performing this write, a worker thread may not assume that the team is valid
           any more - it could be deallocated by the master thread at any time. */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
        flag.release();
    } else {
        register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
        register int nproc = this_thr->th.th_team_nproc;
        register int i;
        // Don't have to worry about sleep bit here or atomic since team setting
        register kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

        // Collect all the worker team member threads.
        for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (i+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                          &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

            // Wait for worker thread to arrive
            kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and the other thread time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       other_threads[i]->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
                               team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
                ANNOTATE_REDUCE_AFTER(reduce);
                (*reduce)(this_thr->th.th_local.reduce_data,
                          other_threads[i]->th.th_local.reduce_data);
                ANNOTATE_REDUCE_BEFORE(reduce);
                ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
            }
        }
        // Don't have to worry about sleep bit here or atomic since team setting
        team_bar->b_arrived = new_state;
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
    }
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

static void
__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                             int propagate_icvs
                             USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_team_t *team;

    if (KMP_MASTER_TID(tid)) {
        register unsigned int i;
        register kmp_uint32 nproc = this_thr->th.th_team_nproc;
        register kmp_info_t **other_threads;

        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        other_threads = team->t.t_threads;

        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));

        if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
            {
                KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
                if (propagate_icvs) {
                    ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
                    for (i=1; i<nproc; ++i) {
                        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
                        ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                                       &team->t.t_implicit_task_taskdata[0].td_icvs);
                    }
                    ngo_sync();
                }
            }
#endif // KMP_BARRIER_ICV_PUSH

            // Now, release all of the worker threads
            for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
                // Prefetch next thread's go flag
                if (i+1 < nproc)
                    KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
                KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
                              &other_threads[i]->th.th_bar[bt].bb.b_go,
                              other_threads[i]->th.th_bar[bt].bb.b_go,
                              other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
                kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
                flag.release();
            }
        }
    } else { // Wait for the MASTER thread to release us
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
                return;
        // The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
        tid = __kmp_tid_from_gtid(gtid);
        team = __kmp_threads[gtid]->th.th_team;
#endif
        KMP_DEBUG_ASSERT(team != NULL);
        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// Tree barrier
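// Overview: threads form a tree with branch_factor = 1 << branch_bits children per node.
// In the gather, thread tid first waits for its children, tids (tid << branch_bits) + 1
// through (tid << branch_bits) + branch_factor (clipped to nproc), then signals its
// parent, (tid - 1) >> branch_bits. For example, with branch_bits == 1, thread 3 waits
// on threads 7 and 8 (if they exist) and then reports its arrival to thread 1.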
static void
__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                          void (*reduce)(void *, void *)
                          USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint64 new_state;

    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
    child_tid = (tid << branch_bits) + 1;
    if (child_tid < nproc) {
        // Parent threads wait for all their children to arrive
        new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        child = 1;
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 flag(&child_bar->b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                ANNOTATE_REDUCE_AFTER(reduce);
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                ANNOTATE_REDUCE_BEFORE(reduce);
                ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
            }
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }

    if (!KMP_MASTER_TID(tid)) { // Worker threads
        register kmp_int32 parent_tid = (tid - 1) >> branch_bits;

        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived,
                      thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

        // Mark arrival to parent thread
        /* After performing this write, a worker thread may not assume that the team is valid
           any more - it could be deallocated by the master thread at any time. */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
        flag.release();
    } else {
        // Need to update the team arrived pointer if we are the master thread
        if (nproc > 1) // New value was already computed above
            team->t.t_bar[bt].b_arrived = new_state;
        else
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

static void
__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           int propagate_icvs
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;

    // Perform a tree release for all of the threads that have been gathered
    if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    } else {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    nproc = this_thr->th.th_team_nproc;
    child_tid = (tid << branch_bits) + 1;

    if (child_tid < nproc) {
        register kmp_info_t **other_threads = team->t.t_threads;
        child = 1;
        // Parent threads release all their children
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's go count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
            {
                KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
                if (propagate_icvs) {
                    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
                                             team, child_tid, FALSE);
                    copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                              &team->t.t_implicit_task_taskdata[0].td_icvs);
                }
            }
#endif // KMP_BARRIER_ICV_PUSH
            KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                          "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                          child_tid, &child_bar->b_go, child_bar->b_go,
                          child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child from barrier
            kmp_flag_64 flag(&child_bar->b_go, child_thr);
            flag.release();
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}


// Hyper Barrier
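// Overview: the gather embeds a tree in a hypercube. At step 'level' (measured in units
// of branch_bits), a thread whose bits below level + branch_bits are non-zero signals the
// parent obtained by clearing those bits, tid & ~((1 << (level + branch_bits)) - 1), and
// drops out; threads that remain wait for up to branch_factor - 1 peers spaced 1 << level
// apart. For example, with branch_bits == 2, thread 5 signals thread 4 at level 0, while
// thread 4 first collects threads 5, 6 and 7 and then signals thread 0.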
static void
__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           void (*reduce)(void *, void *)
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
    register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    /* Perform a hypercube-embedded tree gather to wait until all of the threads have
       arrived, and reduce any required data as we go. */
    kmp_flag_64 p_flag(&thr_bar->b_arrived);
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
    {
        register kmp_uint32 child;
        register kmp_uint32 child_tid;

        if (((tid >> level) & (branch_factor - 1)) != 0) {
            register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);

            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                          "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                          &thr_bar->b_arrived, thr_bar->b_arrived,
                          thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
            // Mark arrival to parent thread
            /* After performing this write (in the last iteration of the enclosing for loop),
               a worker thread may not assume that the team is valid any more - it could be
               deallocated by the master thread at any time. */
            p_flag.set_waiter(other_threads[parent_tid]);
            p_flag.release();
            break;
        }

        // Parent threads wait for children to arrive
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1 << level))
        {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            register kmp_uint32 next_child_tid = child_tid + (1 << level);
            // Prefetch next thread's arrived count
            if (child+1 < branch_factor && next_child_tid < num_threads)
                KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
            c_flag.wait(this_thr, FALSE
                        USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                ANNOTATE_REDUCE_AFTER(reduce);
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                ANNOTATE_REDUCE_BEFORE(reduce);
                ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
            }
        }
    }

    if (KMP_MASTER_TID(tid)) {
        // Need to update the team arrived pointer if we are the master thread
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        else
            team->t.t_bar[bt].b_arrived = new_state;
        KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// The reverse versions seem to beat the forward versions overall
#define KMP_REVERSE_HYPER_BAR
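// Note: with KMP_REVERSE_HYPER_BAR defined, the release walks the hypercube from the
// highest level downward and visits children from the highest tid to the lowest, i.e. in
// the reverse of the gather order. The comment above only records that this ordering
// measured faster; a plausible explanation is that the most distant subtrees are woken
// earliest, overlapping their wake-up latency with the rest of the release.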
static void
__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            int propagate_icvs
                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads;
    register kmp_uint32 num_threads;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
       If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
       order of the corresponding gather, otherwise threads are released in the same order. */
    if (KMP_MASTER_TID(tid)) { // master
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) { // master already has ICVs in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
#endif
    }
    else { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }
    num_threads = this_thr->th.th_team_nproc;
    other_threads = team->t.t_threads;

#ifdef KMP_REVERSE_HYPER_BAR
    // Count up to correct level for parent
    for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
         level+=branch_bits, offset<<=branch_bits);

    // Now go down from there
    for (level-=branch_bits, offset>>=branch_bits; offset != 0;
         level-=branch_bits, offset>>=branch_bits)
#else
    // Go down the tree, level by level
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
    {
#ifdef KMP_REVERSE_HYPER_BAR
        /* Now go in reverse order through the children, highest to lowest.
           Initial setting of child is conservative here. */
        child = num_threads >> ((level==0)?level:level-1);
        for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
             child>=1; child--, child_tid-=(1<<level))
#else
        if (((tid >> level) & (branch_factor - 1)) != 0)
            // No need to go lower than this, since this is the level parent would be notified
            break;
        // Iterate through children on this level of the tree
        for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1<<level))
#endif // KMP_REVERSE_HYPER_BAR
        {
            if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
            else {
                register kmp_info_t *child_thr = other_threads[child_tid];
                register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
                register kmp_uint32 next_child_tid = child_tid - (1 << level);
                // Prefetch next thread's go count
# ifdef KMP_REVERSE_HYPER_BAR
                if (child-1 >= 1 && next_child_tid < num_threads)
# else
                if (child+1 < branch_factor && next_child_tid < num_threads)
# endif // KMP_REVERSE_HYPER_BAR
                    KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
                if (propagate_icvs) // push my fixed ICVs to my child
                    copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

                KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                              child_tid, &child_bar->b_go, child_bar->b_go,
                              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                // Release child from barrier
                kmp_flag_64 flag(&child_bar->b_go, child_thr);
                flag.release();
            }
        }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
    }
#endif
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}


// Hierarchical Barrier

// Initialize thread barrier data
/* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
   minimum amount of initialization required based on how the team has changed. Returns true if
   leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
   team size increases, threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on their local b_go. */
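/* Layout note: the on-core ("oncore") path packs one flag byte per leaf child into the
   parent's single 64-bit flag word, which is why a node can have at most 8 on-core leaf
   children. A leaf's byte index inside its parent's flag is offset = 7-(tid-parent_tid-1),
   so the parent's first leaf child (tid == parent_tid+1) owns byte 7, the next owns byte 6,
   and so on; leaf_state is the mask of all bytes the parent expects to be set. */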
static bool
__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
                                       int gtid, int tid, kmp_team_t *team)
{
    // Checks to determine if (re-)initialization is needed
    bool uninitialized = thr_bar->team == NULL;
    bool team_changed = team != thr_bar->team;
    bool team_sz_changed = nproc != thr_bar->nproc;
    bool tid_changed = tid != thr_bar->old_tid;
    bool retval = false;

    if (uninitialized || team_sz_changed) {
        __kmp_get_hierarchy(nproc, thr_bar);
    }

    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->my_level = thr_bar->depth-1; // default for master
        thr_bar->parent_tid = -1; // default for master
        if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
            kmp_uint32 d=0;
            while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
                kmp_uint32 rem;
                if (d == thr_bar->depth-2) { // reached level right below the master
                    thr_bar->parent_tid = 0;
                    thr_bar->my_level = d;
                    break;
                }
                else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
                    // thread is not a subtree root at next level, so this is max
                    thr_bar->parent_tid = tid - rem;
                    thr_bar->my_level = d;
                    break;
                }
                ++d;
            }
        }
        thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
        thr_bar->old_tid = tid;
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
        thr_bar->team = team;
        thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
    }
    if (uninitialized || team_changed || tid_changed) {
        thr_bar->team = team;
        thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
        retval = true;
    }
    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->nproc = nproc;
        thr_bar->leaf_kids = thr_bar->base_leaf_kids;
        if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
        if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
            thr_bar->leaf_kids = nproc - tid - 1;
        thr_bar->leaf_state = 0;
        for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
    }
    return retval;
}

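// Overview of the hierarchical gather below: when blocktime is infinite and the on-core
// barrier is usable, a parent first waits for all of its leaf children, which check in by
// setting their byte (see leaf_state above) inside the parent's 64-bit b_arrived flag, and
// then waits on the ordinary b_arrived flag of each non-leaf child, level by level. A
// worker finally reports to its own parent either through its b_arrived flag or, if it is
// a leaf on the on-core path, through its offset byte in the parent's flag.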
static void
__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
                                  int gtid, int tid, void (*reduce) (void *, void *)
                                  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint64 new_state;

    int level = team->t.t_level;
#if OMP_40_ENABLED
    if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
        if (this_thr->th.th_teams_size.nteams > 1)
            ++level; // level was not increased in teams construct for team_of_masters
#endif
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
    }
#endif

    (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);

    if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
        register kmp_int32 child_tid;
        new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
                kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
                KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting for leaf kids\n",
                              gtid, team->t.t_id, tid));
                kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
                flag.wait(this_thr, FALSE
                          USE_ITT_BUILD_ARG(itt_sync_obj) );
                if (reduce) {
                    ANNOTATE_REDUCE_AFTER(reduce);
                    for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
                    }
                    ANNOTATE_REDUCE_BEFORE(reduce);
                    ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
                }
                (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
            }
            // Next, wait for higher level children on each child's b_arrived flag
            for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %llu\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        ANNOTATE_REDUCE_AFTER(reduce);
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                        ANNOTATE_REDUCE_BEFORE(reduce);
                        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
                    }
                }
            }
        }
        else { // Blocktime is not infinite
            for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %llu\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        ANNOTATE_REDUCE_AFTER(reduce);
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                        ANNOTATE_REDUCE_BEFORE(reduce);
                        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
                    }
                }
            }
        }
    }
    // All subordinates are gathered; now release parent if not master thread

    if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
        /* Mark arrival to parent: After performing this write, a worker thread may not assume that
           the team is valid any more - it could be deallocated by the master thread at any time. */
        if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
            kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
            flag.release();
        }
        else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
            thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
            flag.set_waiter(other_threads[thr_bar->parent_tid]);
            flag.release();
        }
    } else { // Master thread needs to update the team's b_arrived value
        team->t.t_bar[bt].b_arrived = new_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    // Is the team access below unsafe or just technically invalid?
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

static void
__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                                   int propagate_icvs
                                   USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    bool team_change = false; // indicates on-core barrier shouldn't be used

    if (KMP_MASTER_TID(tid)) {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    else { // Worker threads
        // Wait for parent thread to release me
        if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || thr_bar->my_level != 0 || thr_bar->team == NULL) {
            // Use traditional method of waiting on my own b_go flag
            thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
            kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
            flag.wait(this_thr, TRUE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
        }
        else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
            // Wait on my "offset" bits on parent's b_go flag
            thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
                                 bt, this_thr
                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
            flag.wait(this_thr, TRUE);
            if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
                TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            }
            else { // Reset my bits on parent's b_go flag
                ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
            }
        }
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;
        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }

    nproc = this_thr->th.th_team_nproc;
    int level = team->t.t_level;
#if OMP_40_ENABLED
    if (team->t.t_threads[0]->th.th_teams_microtask ) { // are we inside the teams construct?
        if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
            ++level; // level was not increased in teams construct for team_of_workers
        if( this_thr->th.th_teams_size.nteams > 1 )
            ++level; // level was not increased in teams construct for team_of_masters
    }
#endif
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

    // If the team size has increased, we still communicate with old leaves via oncore barrier.
    unsigned short int old_leaf_kids = thr_bar->leaf_kids;
    kmp_uint64 old_leaf_state = thr_bar->leaf_state;
    team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
    // But if the entire team changes, we won't use oncore barrier at all
    if (team_change) old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) {
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
        else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
            if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
                // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
            // non-leaves will get ICVs piggybacked with b_go via NGO store
        }
        else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
            if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
                copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
            else // leaves copy parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
        }
    }
#endif // KMP_BARRIER_ICV_PUSH

    // Now, release my children
    if (thr_bar->my_level) { // not a leaf
        register kmp_int32 child_tid;
        kmp_uint32 last;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (KMP_MASTER_TID(tid)) { // do a flat release
                // Set local b_go to bump children via NGO store of the cache line containing IVCs and b_go.
                thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
                // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
                ngo_load(&thr_bar->th_fixed_icvs);
                // This loops over all the threads skipping only the leaf nodes in the hierarchy
                for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
                    register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Use ngo store (if available) to both store ICVs and release child via child's b_go
                    ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
                }
                ngo_sync();
            }
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            // Now, release leaf children
            if (thr_bar->leaf_kids) { // if there are any
                // We test team_change on the off-chance that the level 1 team changed.
                if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
                    if (old_leaf_kids) { // release old leaf kids
                        thr_bar->b_go |= old_leaf_state;
                    }
                    // Release new leaf kids
                    last = tid+thr_bar->skip_per_level[1];
                    if (last > nproc) last = nproc;
                    for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
                        register kmp_info_t *child_thr = team->t.t_threads[child_tid];
                        register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                                      " T#%d(%d:%d) go(%p): %u => %u\n",
                                      gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                      team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                      child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                        // Release child using child's b_go flag
                        kmp_flag_64 flag(&child_bar->b_go, child_thr);
                        flag.release();
                    }
                }
                else { // Release all children at once with leaf_state bits on my own b_go flag
                    thr_bar->b_go |= thr_bar->leaf_state;
                }
            }
        }
        else { // Blocktime is not infinite; do a simple hierarchical release
            for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
                last = tid+thr_bar->skip_per_level[d+1];
                kmp_uint32 skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = team->t.t_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Release child using child's b_go flag
                    kmp_flag_64 flag(&child_bar->b_go, child_thr);
                    flag.release();
                }
            }
        }
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
            copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
    }
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// ---------------------------- End of Barrier Algorithms ----------------------------

// Internal function to do a barrier.
/* If is_split is true, do a split barrier, otherwise, do a plain barrier
   If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier
   Returns 0 if master thread, 1 if worker thread. */
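// Typical call sites (a sketch, not code quoted from this file): the user-visible entry
// point __kmpc_barrier ends up invoking
//     __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL);
// while a reduction entry point passes is_split = TRUE together with reduce_size,
// reduce_data and a reduce callback so the gather phase can combine per-thread partial
// results.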
1066int
1067__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1068 void *reduce_data, void (*reduce)(void *, void *))
1069{
Jonathan Peyton11dc82f2016-05-05 16:15:57 +00001070 KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
Jonathan Peyton5375fe82016-11-14 21:13:44 +00001071 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001072 register int tid = __kmp_tid_from_gtid(gtid);
1073 register kmp_info_t *this_thr = __kmp_threads[gtid];
1074 register kmp_team_t *team = this_thr->th.th_team;
1075 register int status = 0;
1076 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001077#if OMPT_SUPPORT
1078 ompt_task_id_t my_task_id;
1079 ompt_parallel_id_t my_parallel_id;
1080#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001081
1082 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1083 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1084
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001085 ANNOTATE_NEW_BARRIER_BEGIN(&team->t.t_bar);
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001086#if OMPT_SUPPORT
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001087 if (ompt_enabled) {
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001088#if OMPT_BLAME
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001089 my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
1090 my_parallel_id = team->t.ompt_team_info.parallel_id;
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001091
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001092#if OMPT_TRACE
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001093 if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
1094 if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
1095 ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001096 my_parallel_id, my_task_id);
1097 }
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001098 }
1099#endif
1100 if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1101 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1102 my_parallel_id, my_task_id);
1103 }
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001104#endif
1105 // It is OK to report the barrier state after the barrier begin callback.
1106 // According to the OMPT specification, a compliant implementation may
1107 // even delay reporting this state until the barrier begins to wait.
1108 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001109 }
1110#endif
1111
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001112 if (! team->t.t_serialized) {
1113#if USE_ITT_BUILD
1114 // This value will be used in itt notify events below.
1115 void *itt_sync_obj = NULL;
1116# if USE_ITT_NOTIFY
1117 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1118 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1119# endif
1120#endif /* USE_ITT_BUILD */
1121 if (__kmp_tasking_mode == tskm_extra_barrier) {
1122 __kmp_tasking_barrier(team, this_thr, gtid);
1123 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1124 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1125 }
1126
1127 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1128 the team struct is not guaranteed to exist. */
1129 // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1130 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
Jonathan Peytone1c7c132016-10-07 18:12:19 +00001131#if KMP_USE_MONITOR
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001132 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
Jonathan Peytone1c7c132016-10-07 18:12:19 +00001133#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001134 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1135 }
1136
1137#if USE_ITT_BUILD
1138 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1139 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1140#endif /* USE_ITT_BUILD */
Jonathan Peyton8fbb49a2015-07-09 18:16:58 +00001141#if USE_DEBUGGER
1142        // Let the debugger know: the thread has arrived at the barrier and is waiting.
1143 if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1144 team->t.t_bar[bt].b_master_arrived += 1;
1145 } else {
1146 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1147 } // if
1148#endif /* USE_DEBUGGER */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001149 if (reduce != NULL) {
1150 //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1151 this_thr->th.th_local.reduce_data = reduce_data;
1152 }
Jonathan Peytonb0b83c82015-11-09 16:28:32 +00001153
1154 if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1155            __kmp_task_team_setup(this_thr, team, 0); // 0 means set up only the current task team if nthreads > 1
1156
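        // Gather phase: dispatch on the pattern configured for this barrier type
        // (normally chosen via the KMP_{PLAIN,FORKJOIN,REDUCTION}_BARRIER_PATTERN
        // environment variables); the linear gather is the default case.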
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001157 switch (__kmp_barrier_gather_pattern[bt]) {
1158 case bp_hyper_bar: {
1159 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1160 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1161 USE_ITT_BUILD_ARG(itt_sync_obj) );
1162 break;
1163 }
1164 case bp_hierarchical_bar: {
1165 __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1166 USE_ITT_BUILD_ARG(itt_sync_obj));
1167 break;
1168 }
1169 case bp_tree_bar: {
1170 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1171 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1172 USE_ITT_BUILD_ARG(itt_sync_obj) );
1173 break;
1174 }
1175 default: {
1176 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1177 USE_ITT_BUILD_ARG(itt_sync_obj) );
1178 }
1179 }
1180
1181 KMP_MB();
1182
1183 if (KMP_MASTER_TID(tid)) {
1184 status = 0;
1185 if (__kmp_tasking_mode != tskm_immediate_exec) {
1186 __kmp_task_team_wait(this_thr, team
1187 USE_ITT_BUILD_ARG(itt_sync_obj) );
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001188 }
Jonathan Peyton8fbb49a2015-07-09 18:16:58 +00001189#if USE_DEBUGGER
1190            // Let the debugger know: all threads have arrived and are starting to leave the barrier.
1191 team->t.t_bar[bt].b_team_arrived += 1;
1192#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001193
1194#if USE_ITT_BUILD
1195 /* TODO: In case of split reduction barrier, master thread may send acquired event early,
1196 before the final summation into the shared variable is done (final summation can be a
1197 long operation for array reductions). */
1198 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1199 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1200#endif /* USE_ITT_BUILD */
1201#if USE_ITT_BUILD && USE_ITT_NOTIFY
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001202 // Barrier - report frame end (only if active_level == 1)
1203 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1204#if OMP_40_ENABLED
1205 this_thr->th.th_teams_microtask == NULL &&
1206#endif
1207 team->t.t_active_level == 1)
1208 {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001209 kmp_uint64 cur_time = __itt_get_timestamp();
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001210 kmp_info_t **other_threads = team->t.t_threads;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001211 int nproc = this_thr->th.th_team_nproc;
1212 int i;
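                // __kmp_forkjoin_frames_mode selects what is reported to ITT (an interpretation
                // of the cases below): 1 - the region frame, 2 - a frame covering the barrier
                // wait, 3 - the region frame plus per-thread imbalance metadata.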
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001213 switch(__kmp_forkjoin_frames_mode) {
1214 case 1:
1215 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1216 this_thr->th.th_frame_time = cur_time;
1217 break;
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001218 case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001219 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1220 break;
1221 case 3:
1222 if( __itt_metadata_add_ptr ) {
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001223 // Initialize with master's wait time
1224 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
Jonathan Peyton99ef4d02016-04-14 16:06:49 +00001225 // Set arrive time to zero to be able to check it in __kmp_invoke_task(); the same is done inside the loop below
1226 this_thr->th.th_bar_arrive_time = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001227 for (i=1; i<nproc; ++i) {
1228 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
Jonathan Peyton99ef4d02016-04-14 16:06:49 +00001229 other_threads[i]->th.th_bar_arrive_time = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001230 }
1231 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1232 }
1233 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1234 this_thr->th.th_frame_time = cur_time;
1235 break;
1236 }
1237 }
1238#endif /* USE_ITT_BUILD */
1239 } else {
1240 status = 1;
1241#if USE_ITT_BUILD
1242 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1243 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1244#endif /* USE_ITT_BUILD */
1245 }
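        // Release phase: workers (status == 1) always wait here for the go signal. For a
        // split barrier the master skips the release and returns immediately; the workers
        // are released later from __kmp_end_split_barrier().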
1246 if (status == 1 || ! is_split) {
1247 switch (__kmp_barrier_release_pattern[bt]) {
1248 case bp_hyper_bar: {
1249 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1250 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1251 USE_ITT_BUILD_ARG(itt_sync_obj) );
1252 break;
1253 }
1254 case bp_hierarchical_bar: {
1255 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1256 USE_ITT_BUILD_ARG(itt_sync_obj) );
1257 break;
1258 }
1259 case bp_tree_bar: {
1260 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1261 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1262 USE_ITT_BUILD_ARG(itt_sync_obj) );
1263 break;
1264 }
1265 default: {
1266 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1267 USE_ITT_BUILD_ARG(itt_sync_obj) );
1268 }
1269 }
1270 if (__kmp_tasking_mode != tskm_immediate_exec) {
1271 __kmp_task_team_sync(this_thr, team);
1272 }
1273 }
1274
1275#if USE_ITT_BUILD
1276 /* GEH: TODO: Move this under if-condition above and also include in
1277 __kmp_end_split_barrier(). This will more accurately represent the actual release time
1278 of the threads for split barriers. */
1279 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1280 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1281#endif /* USE_ITT_BUILD */
1282 } else { // Team is serialized.
1283 status = 0;
1284 if (__kmp_tasking_mode != tskm_immediate_exec) {
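            // A serialized team normally has no task team, but with OpenMP 4.5 proxy tasks
            // one may exist even here; it must be waited on and then set up again.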
Jonathan Peytondf6818b2016-06-14 17:57:47 +00001285#if OMP_45_ENABLED
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001286 if ( this_thr->th.th_task_team != NULL ) {
1287 void *itt_sync_obj = NULL;
1288#if USE_ITT_NOTIFY
1289 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1290 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1291 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1292 }
1293#endif
1294
Jonathan Peytonfe9a1d72015-08-26 19:58:48 +00001295 KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE);
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001296 __kmp_task_team_wait(this_thr, team
1297 USE_ITT_BUILD_ARG(itt_sync_obj));
Jonathan Peyton54127982015-11-04 21:37:48 +00001298 __kmp_task_team_setup(this_thr, team, 0);
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001299
1300#if USE_ITT_BUILD
1301 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1302 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1303#endif /* USE_ITT_BUILD */
1304 }
1305#else
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001306 // The task team should be NULL for serialized code (tasks will be executed immediately)
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001307 KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001308 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001309#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001310 }
1311 }
1312 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1313 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001314
1315#if OMPT_SUPPORT
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001316 if (ompt_enabled) {
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001317#if OMPT_BLAME
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001318 if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001319 ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1320 my_parallel_id, my_task_id);
1321 }
1322#endif
1323 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1324 }
1325#endif
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001326 ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001327
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001328 return status;
1329}
1330
1331
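/* Complete the release half of a split barrier: __kmp_barrier() returned to the master
   without releasing the workers, so they are still waiting on their go flags. Only the
   master does anything here; it is typically called after the master has finished the
   work (for example, combining reduction results) that motivated the split. */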
1332void
1333__kmp_end_split_barrier(enum barrier_type bt, int gtid)
1334{
Jonathan Peyton5375fe82016-11-14 21:13:44 +00001335 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1336 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001337 int tid = __kmp_tid_from_gtid(gtid);
1338 kmp_info_t *this_thr = __kmp_threads[gtid];
1339 kmp_team_t *team = this_thr->th.th_team;
1340
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001341 ANNOTATE_NEW_BARRIER_BEGIN(&team->t.t_bar);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001342 if (!team->t.t_serialized) {
1343 if (KMP_MASTER_GTID(gtid)) {
1344 switch (__kmp_barrier_release_pattern[bt]) {
1345 case bp_hyper_bar: {
1346 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1347 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1348 USE_ITT_BUILD_ARG(NULL) );
1349 break;
1350 }
1351 case bp_hierarchical_bar: {
1352 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1353 USE_ITT_BUILD_ARG(NULL));
1354 break;
1355 }
1356 case bp_tree_bar: {
1357 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1358 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1359 USE_ITT_BUILD_ARG(NULL) );
1360 break;
1361 }
1362 default: {
1363 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1364 USE_ITT_BUILD_ARG(NULL) );
1365 }
1366 }
1367 if (__kmp_tasking_mode != tskm_immediate_exec) {
1368 __kmp_task_team_sync(this_thr, team);
1369 } // if
1370 }
1371 }
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001372 ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001373}
1374
1375
1376void
1377__kmp_join_barrier(int gtid)
1378{
Jonathan Peyton5375fe82016-11-14 21:13:44 +00001379 KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
Jonathan Peyton11dc82f2016-05-05 16:15:57 +00001380 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001381 register kmp_info_t *this_thr = __kmp_threads[gtid];
1382 register kmp_team_t *team;
1383 register kmp_uint nproc;
1384 kmp_info_t *master_thread;
1385 int tid;
1386#ifdef KMP_DEBUG
1387 int team_id;
1388#endif /* KMP_DEBUG */
1389#if USE_ITT_BUILD
1390 void *itt_sync_obj = NULL;
1391# if USE_ITT_NOTIFY
1392 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1393 // Get object created at fork_barrier
1394 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1395# endif
1396#endif /* USE_ITT_BUILD */
1397 KMP_MB();
1398
1399 // Get current info
1400 team = this_thr->th.th_team;
1401 nproc = this_thr->th.th_team_nproc;
1402 KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1403 tid = __kmp_tid_from_gtid(gtid);
1404#ifdef KMP_DEBUG
1405 team_id = team->t.t_id;
1406#endif /* KMP_DEBUG */
1407 master_thread = this_thr->th.th_team_master;
1408#ifdef KMP_DEBUG
1409 if (master_thread != team->t.t_threads[0]) {
1410 __kmp_print_structure();
1411 }
1412#endif /* KMP_DEBUG */
1413 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1414 KMP_MB();
1415
1416 // Verify state
1417 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1418 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1419 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1420 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1421 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1422
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001423 ANNOTATE_NEW_BARRIER_BEGIN(&team->t.t_bar);
Jonathan Peyton61118492016-05-20 19:03:38 +00001424#if OMPT_SUPPORT
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001425#if OMPT_TRACE
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001426 if (ompt_enabled &&
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001427 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1428 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1429 team->t.ompt_team_info.parallel_id,
1430 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1431 }
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001432#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001433 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1434#endif
1435
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001436 if (__kmp_tasking_mode == tskm_extra_barrier) {
1437 __kmp_tasking_barrier(team, this_thr, gtid);
1438        KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
1439 }
1440# ifdef KMP_DEBUG
1441 if (__kmp_tasking_mode != tskm_immediate_exec) {
1442 KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001443 __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001444 this_thr->th.th_task_team));
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001445 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001446 }
1447# endif /* KMP_DEBUG */
1448
1449 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
1450       team struct is not guaranteed to exist. Doing these loads causes a cache miss and slows
1451 down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1452 since the values are not used by __kmp_wait_template() in that case. */
1453 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
Jonathan Peytone1c7c132016-10-07 18:12:19 +00001454#if KMP_USE_MONITOR
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001455 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
Jonathan Peytone1c7c132016-10-07 18:12:19 +00001456#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001457 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1458 }
1459
1460#if USE_ITT_BUILD
1461 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1462 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1463#endif /* USE_ITT_BUILD */
1464
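    // Gather phase of the join barrier: same pattern dispatch as __kmp_barrier(), always
    // for the fork/join barrier type and with no reduction callback (NULL).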
1465 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1466 case bp_hyper_bar: {
1467 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1468 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1469 USE_ITT_BUILD_ARG(itt_sync_obj) );
1470 break;
1471 }
1472 case bp_hierarchical_bar: {
1473 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1474 USE_ITT_BUILD_ARG(itt_sync_obj) );
1475 break;
1476 }
1477 case bp_tree_bar: {
1478 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1479 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1480 USE_ITT_BUILD_ARG(itt_sync_obj) );
1481 break;
1482 }
1483 default: {
1484 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1485 USE_ITT_BUILD_ARG(itt_sync_obj) );
1486 }
1487 }
1488
1489 /* From this point on, the team data structure may be deallocated at any time by the
1490 master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1491 data items that need to be referenced before the end of the barrier should be moved to
1492 the kmp_task_team_t structs. */
1493 if (KMP_MASTER_TID(tid)) {
1494 if (__kmp_tasking_mode != tskm_immediate_exec) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001495 __kmp_task_team_wait(this_thr, team
1496 USE_ITT_BUILD_ARG(itt_sync_obj) );
1497 }
Jonathan Peyton11dc82f2016-05-05 16:15:57 +00001498#if KMP_STATS_ENABLED
1499 // Have master thread flag the workers to indicate they are now waiting for
1500        // the next parallel region. Also wake them up so they switch their timers to idle.
1501 for (int i=0; i<team->t.t_nproc; ++i) {
1502 kmp_info_t* team_thread = team->t.t_threads[i];
1503 if (team_thread == this_thr)
1504 continue;
1505 team_thread->th.th_stats->setIdleFlag();
1506 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && team_thread->th.th_sleep_loc != NULL)
1507 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread), team_thread->th.th_sleep_loc);
1508 }
1509#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001510#if USE_ITT_BUILD
1511 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1512 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1513#endif /* USE_ITT_BUILD */
1514
1515# if USE_ITT_BUILD && USE_ITT_NOTIFY
1516 // Join barrier - report frame end
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001517 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1518#if OMP_40_ENABLED
1519 this_thr->th.th_teams_microtask == NULL &&
1520#endif
1521 team->t.t_active_level == 1)
1522 {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001523 kmp_uint64 cur_time = __itt_get_timestamp();
1524 ident_t * loc = team->t.t_ident;
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001525 kmp_info_t **other_threads = team->t.t_threads;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001526 int nproc = this_thr->th.th_team_nproc;
1527 int i;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001528 switch(__kmp_forkjoin_frames_mode) {
1529 case 1:
1530 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1531 break;
1532 case 2:
1533 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1534 break;
1535 case 3:
1536 if( __itt_metadata_add_ptr ) {
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001537 // Initialize with master's wait time
1538 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
Jonathan Peyton99ef4d02016-04-14 16:06:49 +00001539 // Set arrive time to zero to be able to check it in __kmp_invoke_task(); the same is done inside the loop below
1540 this_thr->th.th_bar_arrive_time = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001541 for (i=1; i<nproc; ++i) {
1542 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
Jonathan Peyton99ef4d02016-04-14 16:06:49 +00001543 other_threads[i]->th.th_bar_arrive_time = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001544 }
1545 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1546 }
1547 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1548 this_thr->th.th_frame_time = cur_time;
1549 break;
1550 }
1551 }
1552# endif /* USE_ITT_BUILD */
1553 }
1554#if USE_ITT_BUILD
1555 else {
1556 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1557 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1558 }
1559#endif /* USE_ITT_BUILD */
1560
1561#if KMP_DEBUG
1562 if (KMP_MASTER_TID(tid)) {
1563 KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1564 gtid, team_id, tid, nproc));
1565 }
1566#endif /* KMP_DEBUG */
1567
1568 // TODO now, mark worker threads as done so they may be disbanded
1569 KMP_MB(); // Flush all pending memory write invalidates.
1570 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001571
1572#if OMPT_SUPPORT
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001573 if (ompt_enabled) {
Jonathan Peytoncab67cc2015-09-18 16:24:46 +00001574#if OMPT_BLAME
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001575 if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001576 ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1577 team->t.ompt_team_info.parallel_id,
1578 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001579 }
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001580#endif
1581
1582 // return to default state
1583 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1584 }
1585#endif
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001586 ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001587}
1588
1589
1590// TODO release worker threads' fork barriers as we are ready instead of all at once
1591void
1592__kmp_fork_barrier(int gtid, int tid)
1593{
Jonathan Peyton5375fe82016-11-14 21:13:44 +00001594 KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
Jonathan Peyton11dc82f2016-05-05 16:15:57 +00001595 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001596 kmp_info_t *this_thr = __kmp_threads[gtid];
1597 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1598#if USE_ITT_BUILD
1599 void * itt_sync_obj = NULL;
1600#endif /* USE_ITT_BUILD */
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001601 if (team)
1602 ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001603
1604 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1605 gtid, (team != NULL) ? team->t.t_id : -1, tid));
1606
1607 // th_team pointer only valid for master thread here
1608 if (KMP_MASTER_TID(tid)) {
1609#if USE_ITT_BUILD && USE_ITT_NOTIFY
1610 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1611 // Create itt barrier object
1612 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1613 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1614 }
1615#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1616
1617#ifdef KMP_DEBUG
1618 register kmp_info_t **other_threads = team->t.t_threads;
1619 register int i;
1620
1621 // Verify state
1622 KMP_MB();
1623
1624 for(i=1; i<team->t.t_nproc; ++i) {
1625 KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1626 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1627 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1628 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1629 KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1630 & ~(KMP_BARRIER_SLEEP_STATE))
1631 == KMP_INIT_BARRIER_STATE);
1632 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1633 }
1634#endif
1635
1636 if (__kmp_tasking_mode != tskm_immediate_exec) {
Jonathan Peyton54127982015-11-04 21:37:48 +00001637            __kmp_task_team_setup(this_thr, team, 0); // 0 means set up the current task team if nthreads > 1
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001638 }
1639
1640 /* The master thread may have changed its blocktime between the join barrier and the
1641 fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1642 access it when the team struct is not guaranteed to exist. */
1643 // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1644 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
Jonathan Peytone1c7c132016-10-07 18:12:19 +00001645#if KMP_USE_MONITOR
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001646 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
Jonathan Peytone1c7c132016-10-07 18:12:19 +00001647#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001648 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1649 }
1650 } // master
1651
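    // Release phase of the fork barrier: workers wait on their go flags until the master
    // starts the next parallel region; the TRUE argument requests ICV propagation along
    // the release path when KMP_BARRIER_ICV_PUSH is enabled.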
1652 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1653 case bp_hyper_bar: {
1654 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1655 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1656 USE_ITT_BUILD_ARG(itt_sync_obj) );
1657 break;
1658 }
1659 case bp_hierarchical_bar: {
1660 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1661 USE_ITT_BUILD_ARG(itt_sync_obj) );
1662 break;
1663 }
1664 case bp_tree_bar: {
1665 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1666 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1667 USE_ITT_BUILD_ARG(itt_sync_obj) );
1668 break;
1669 }
1670 default: {
1671 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1672 USE_ITT_BUILD_ARG(itt_sync_obj) );
1673 }
1674 }
1675
1676 // Early exit for reaping threads releasing forkjoin barrier
1677 if (TCR_4(__kmp_global.g.g_done)) {
Jonathan Peyton54127982015-11-04 21:37:48 +00001678 this_thr->th.th_task_team = NULL;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001679
1680#if USE_ITT_BUILD && USE_ITT_NOTIFY
1681 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1682 if (!KMP_MASTER_TID(tid)) {
1683 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1684 if (itt_sync_obj)
1685 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1686 }
1687 }
1688#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1689 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1690 return;
1691 }
1692
1693 /* We can now assume that a valid team structure has been allocated by the master and
1694 propagated to all worker threads. The current thread, however, may not be part of the
1695 team, so we can't blindly assume that the team pointer is non-null. */
1696 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1697 KMP_DEBUG_ASSERT(team != NULL);
1698 tid = __kmp_tid_from_gtid(gtid);
1699
1700
1701#if KMP_BARRIER_ICV_PULL
1702 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1703 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1704 this data before this function is called. We cannot modify __kmp_fork_call() to look at
1705 the fixed ICVs in the master's thread struct, because it is not always the case that the
1706 threads arrays have been allocated when __kmp_fork_call() is executed. */
Jonathan Peyton45be4502015-08-11 21:36:41 +00001707 {
Jonathan Peyton5375fe82016-11-14 21:13:44 +00001708 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
Jonathan Peyton45be4502015-08-11 21:36:41 +00001709 if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1710 // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1711 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1712 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1713 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1714 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1715 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001716 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001717#endif // KMP_BARRIER_ICV_PULL
1718
1719 if (__kmp_tasking_mode != tskm_immediate_exec) {
1720 __kmp_task_team_sync(this_thr, team);
1721 }
1722
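    // Re-apply affinity at the fork barrier: with the default (proc_bind_intel) binding,
    // redo the balanced-affinity assignment if KMP_AFFINITY=balanced and the team size
    // changed; with an explicit proc_bind policy, move the thread to its new place.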
1723#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1724 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1725 if (proc_bind == proc_bind_intel) {
1726#endif
Andrey Churbanovf28f6132015-01-13 14:54:00 +00001727#if KMP_AFFINITY_SUPPORTED
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001728 // Call dynamic affinity settings
1729 if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1730 __kmp_balanced_affinity(tid, team->t.t_nproc);
1731 }
Andrey Churbanovf28f6132015-01-13 14:54:00 +00001732#endif // KMP_AFFINITY_SUPPORTED
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001733#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1734 }
Andrey Churbanov94e569e2015-03-10 09:19:47 +00001735 else if (proc_bind != proc_bind_false) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001736 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1737 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1738 __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1739 }
1740 else {
1741 __kmp_affinity_set_place(gtid);
1742 }
1743 }
1744#endif
1745
1746#if USE_ITT_BUILD && USE_ITT_NOTIFY
1747 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1748 if (!KMP_MASTER_TID(tid)) {
1749 // Get correct barrier object
1750 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1751 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1752 } // (prepare called inside barrier_release)
1753 }
1754#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001755 ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001756 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1757}
1758
1759
1760void
1761__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1762{
Jonathan Peyton5375fe82016-11-14 21:13:44 +00001763 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001764
1765 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1766 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1767
1768 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1769 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1770 this data before this function is called. */
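/* Three propagation strategies are compiled in alternatively: PULL (workers copy from the
   master's th_fixed_icvs at the fork barrier), PUSH (the ICVs travel with the fork-barrier
   release), and otherwise the O(nthreads) linear copy below, in which the master writes
   each worker's implicit task directly. */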
1771#if KMP_BARRIER_ICV_PULL
1772 /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains untouched), where
1773 all of the worker threads can access them and make their own copies after the barrier. */
1774 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1775 copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1776 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1777 0, team->t.t_threads[0], team));
1778#elif KMP_BARRIER_ICV_PUSH
1779 // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1780 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1781 0, team->t.t_threads[0], team));
1782#else
1783 // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
1784 ngo_load(new_icvs);
1785 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
Jonathan Peyton91b78702015-06-08 19:39:07 +00001786 for (int f=1; f<new_nproc; ++f) { // Skip the master thread
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001787 // TODO: GEH - pass in better source location info since usually NULL here
1788 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1789 f, team->t.t_threads[f], team));
1790 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1791 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1792 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1793 f, team->t.t_threads[f], team));
1794 }
1795 ngo_sync();
1796#endif // KMP_BARRIER_ICV_PULL
1797}