/*
 * kmp_barrier.cpp
 */


//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


#include "kmp.h"
#include "kmp_wait_release.h"
#include "kmp_stats.h"
#include "kmp_itt.h"
#include "kmp_os.h"


#if KMP_MIC
#include <immintrin.h>
#define USE_NGO_STORES 1
#endif // KMP_MIC

#include "tsan_annotations.h"

#if KMP_MIC && USE_NGO_STORES
// ICV copying
#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
#define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
#else
#define ngo_load(src) ((void)0)
#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
#define ngo_sync() ((void)0)
#endif /* KMP_MIC && USE_NGO_STORES */
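
// Illustrative note: the ngo_* macros are intended to be used as a load-once /
// store-per-child / sync-once sequence when a parent copies a cache line of ICVs to
// its children, as the release routines below do, roughly:
//   ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
//   for (i = 1; i < nproc; ++i)
//       ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
//                      &team->t.t_implicit_task_taskdata[0].td_icvs);
//   ngo_sync();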

void __kmp_print_structure(void); // Forward declaration

// ---------------------------- Barrier Algorithms ----------------------------

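// Common protocol of the gather/release pairs below (in rough terms): on arrival a
// worker bumps its 64-bit b_arrived flag by KMP_BARRIER_STATE_BUMP (or signals its
// parent's flag directly) and the waiting parent spins until each child's flag reaches
// the expected new value; on release the parent bumps each child's b_go flag, and the
// woken worker resets its own b_go to KMP_INIT_BARRIER_STATE for the next instance.
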
// Linear Barrier
static void
__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            void (*reduce)(void *, void *)
                            USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;

    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // We now perform a linear reduction to signal that all of the threads have arrived.
    if (!KMP_MASTER_TID(tid)) {
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d)"
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
                      thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
        // Mark arrival to master thread
        /* After performing this write, a worker thread may not assume that the team is valid
           any more - it could be deallocated by the master thread at any time. */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
        flag.release();
    } else {
        register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
        register int nproc = this_thr->th.th_team_nproc;
        register int i;
        // Don't have to worry about sleep bit here or atomic since team setting
        register kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;

        // Collect all the worker team member threads.
        for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (i+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                          &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));

            // Wait for worker thread to arrive
            kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and the other thread time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       other_threads[i]->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
                               team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
                ANNOTATE_REDUCE_AFTER(reduce);
                (*reduce)(this_thr->th.th_local.reduce_data,
                          other_threads[i]->th.th_local.reduce_data);
                ANNOTATE_REDUCE_BEFORE(reduce);
                ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
            }
        }
        // Don't have to worry about sleep bit here or atomic since team setting
        team_bar->b_arrived = new_state;
        KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
    }
    KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

static void
__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                             int propagate_icvs
                             USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_team_t *team;

    if (KMP_MASTER_TID(tid)) {
        register unsigned int i;
        register kmp_uint32 nproc = this_thr->th.th_team_nproc;
        register kmp_info_t **other_threads;

        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        other_threads = team->t.t_threads;

        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));

        if (nproc > 1) {
#if KMP_BARRIER_ICV_PUSH
            {
                KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
                if (propagate_icvs) {
                    ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
                    for (i=1; i<nproc; ++i) {
                        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
                        ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
                                       &team->t.t_implicit_task_taskdata[0].td_icvs);
                    }
                    ngo_sync();
                }
            }
#endif // KMP_BARRIER_ICV_PUSH

            // Now, release all of the worker threads
            for (i=1; i<nproc; ++i) {
#if KMP_CACHE_MANAGE
                // Prefetch next thread's go flag
                if (i+1 < nproc)
                    KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */
                KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
                              &other_threads[i]->th.th_bar[bt].bb.b_go,
                              other_threads[i]->th.th_bar[bt].bb.b_go,
                              other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
                kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
                flag.release();
            }
        }
    } else { // Wait for the MASTER thread to release us
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;
        // The worker thread may now assume that the team is valid.
#ifdef KMP_DEBUG
        tid = __kmp_tid_from_gtid(gtid);
        team = __kmp_threads[gtid]->th.th_team;
#endif
        KMP_DEBUG_ASSERT(team != NULL);
        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }
    KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// Tree barrier
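// Indexing sketch (derived from the code below): with branch_bits b, thread tid's
// children are tids (tid << b) + 1 .. (tid << b) + (1 << b), and a worker's parent is
// (tid - 1) >> b.  For example, with branch_bits = 2 and an 8-thread team, tid 0
// gathers from {1, 2, 3, 4}, tid 1 gathers from {5, 6, 7}, and tids 2..7 are leaves.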
static void
__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                          void (*reduce)(void *, void *)
                          USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint64 new_state;

    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
    child_tid = (tid << branch_bits) + 1;
    if (child_tid < nproc) {
        // Parent threads wait for all their children to arrive
        new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        child = 1;
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's arrived count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 flag(&child_bar->b_arrived, new_state);
            flag.wait(this_thr, FALSE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                ANNOTATE_REDUCE_AFTER(reduce);
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                ANNOTATE_REDUCE_BEFORE(reduce);
                ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
            }
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }

    if (!KMP_MASTER_TID(tid)) { // Worker threads
        register kmp_int32 parent_tid = (tid - 1) >> branch_bits;

        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived,
                      thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));

        // Mark arrival to parent thread
        /* After performing this write, a worker thread may not assume that the team is valid
           any more - it could be deallocated by the master thread at any time. */
        kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
        flag.release();
    } else {
        // Need to update the team arrived pointer if we are the master thread
        if (nproc > 1) // New value was already computed above
            team->t.t_bar[bt].b_arrived = new_state;
        else
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

static void
__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           int propagate_icvs
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;

    // Perform a tree release for all of the threads that have been gathered
    if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    } else {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    nproc = this_thr->th.th_team_nproc;
    child_tid = (tid << branch_bits) + 1;

    if (child_tid < nproc) {
        register kmp_info_t **other_threads = team->t.t_threads;
        child = 1;
        // Parent threads release all their children
        do {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            // Prefetch next thread's go count
            if (child+1 <= branch_factor && child_tid+1 < nproc)
                KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
            {
                KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
                if (propagate_icvs) {
                    __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
                                             team, child_tid, FALSE);
                    copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
                              &team->t.t_implicit_task_taskdata[0].td_icvs);
                }
            }
#endif // KMP_BARRIER_ICV_PUSH
            KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                          "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                          child_tid, &child_bar->b_go, child_bar->b_go,
                          child_bar->b_go + KMP_BARRIER_STATE_BUMP));
            // Release child from barrier
            kmp_flag_64 flag(&child_bar->b_go, child_thr);
            flag.release();
            child++;
            child_tid++;
        }
        while (child <= branch_factor && child_tid < nproc);
    }
    KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}


// Hyper Barrier
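// Pairing sketch (derived from the loops below): at level L, a thread for which
// ((tid >> L) & (branch_factor - 1)) != 0 signals its parent (tid with those bits
// cleared, i.e. tid & ~((1 << (L + branch_bits)) - 1)) and drops out; otherwise it
// waits for children tid + (1 << L), tid + 2*(1 << L), ...  With branch_bits = 1 and
// an 8-thread team: level 0 gathers 1->0, 3->2, 5->4, 7->6; level 1 gathers 2->0,
// 6->4; level 2 gathers 4->0.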
static void
__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                           void (*reduce)(void *, void *)
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
    register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
    register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));

    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
    }
#endif
    /* Perform a hypercube-embedded tree gather to wait until all of the threads have
       arrived, and reduce any required data as we go. */
    kmp_flag_64 p_flag(&thr_bar->b_arrived);
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
    {
        register kmp_uint32 child;
        register kmp_uint32 child_tid;

        if (((tid >> level) & (branch_factor - 1)) != 0) {
            register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);

            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                          "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
                          &thr_bar->b_arrived, thr_bar->b_arrived,
                          thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
            // Mark arrival to parent thread
            /* After performing this write (in the last iteration of the enclosing for loop),
               a worker thread may not assume that the team is valid any more - it could be
               deallocated by the master thread at any time. */
            p_flag.set_waiter(other_threads[parent_tid]);
            p_flag.release();
            break;
        }

        // Parent threads wait for children to arrive
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1 << level))
        {
            register kmp_info_t *child_thr = other_threads[child_tid];
            register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
            register kmp_uint32 next_child_tid = child_tid + (1 << level);
            // Prefetch next thread's arrived count
            if (child+1 < branch_factor && next_child_tid < num_threads)
                KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
            KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
                          "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
                          __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
                          &child_bar->b_arrived, new_state));
            // Wait for child to arrive
            kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
            c_flag.wait(this_thr, FALSE
                        USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
            // Barrier imbalance - write min of the thread time and a child time to the thread.
            if (__kmp_forkjoin_frames_mode == 2) {
                this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
                                                       child_thr->th.th_bar_min_time);
            }
#endif
            if (reduce) {
                KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
                               gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                               team->t.t_id, child_tid));
                ANNOTATE_REDUCE_AFTER(reduce);
                (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                ANNOTATE_REDUCE_BEFORE(reduce);
                ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
            }
        }
    }

    if (KMP_MASTER_TID(tid)) {
        // Need to update the team arrived pointer if we are the master thread
        if (new_state == KMP_BARRIER_UNUSED_STATE)
            team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
        else
            team->t.t_bar[bt].b_arrived = new_state;
        KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id,
                      &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// The reverse versions seem to beat the forward versions overall
#define KMP_REVERSE_HYPER_BAR
static void
__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                            int propagate_icvs
                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_info_t **other_threads;
    register kmp_uint32 num_threads;
    register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
    register kmp_uint32 branch_factor = 1 << branch_bits;
    register kmp_uint32 child;
    register kmp_uint32 child_tid;
    register kmp_uint32 offset;
    register kmp_uint32 level;

    /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
       If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
       order of the corresponding gather, otherwise threads are released in the same order. */
    if (KMP_MASTER_TID(tid)) { // master
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs) { // master already has ICVs in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
#endif
    }
    else { // Handle fork barrier workers who aren't part of a team yet
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
                      gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
        // Wait for parent thread to release us
        kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
        flag.wait(this_thr, TRUE
                  USE_ITT_BUILD_ARG(itt_sync_obj) );
#if USE_ITT_BUILD && USE_ITT_NOTIFY
        if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
            // In fork barrier where we could not get the object reliably
            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
            // Cancel wait on previous parallel region...
            __kmp_itt_task_starting(itt_sync_obj);

            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

            itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
            if (itt_sync_obj != NULL)
                // Call prepare as early as possible for "new" barrier
                __kmp_itt_task_finished(itt_sync_obj);
        } else
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
            // Early exit for reaping threads releasing forkjoin barrier
            if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
                return;

        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
        KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }
    num_threads = this_thr->th.th_team_nproc;
    other_threads = team->t.t_threads;

#ifdef KMP_REVERSE_HYPER_BAR
    // Count up to correct level for parent
    for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
         level+=branch_bits, offset<<=branch_bits);

    // Now go down from there
    for (level-=branch_bits, offset>>=branch_bits; offset != 0;
         level-=branch_bits, offset>>=branch_bits)
#else
    // Go down the tree, level by level
    for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
#endif // KMP_REVERSE_HYPER_BAR
    {
#ifdef KMP_REVERSE_HYPER_BAR
        /* Now go in reverse order through the children, highest to lowest.
           Initial setting of child is conservative here. */
        child = num_threads >> ((level==0)?level:level-1);
        for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
             child>=1; child--, child_tid-=(1<<level))
#else
        if (((tid >> level) & (branch_factor - 1)) != 0)
            // No need to go lower than this, since this is the level parent would be notified
            break;
        // Iterate through children on this level of the tree
        for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
             child++, child_tid+=(1<<level))
#endif // KMP_REVERSE_HYPER_BAR
        {
            if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
            else {
                register kmp_info_t *child_thr = other_threads[child_tid];
                register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
#if KMP_CACHE_MANAGE
                register kmp_uint32 next_child_tid = child_tid - (1 << level);
                // Prefetch next thread's go count
# ifdef KMP_REVERSE_HYPER_BAR
                if (child-1 >= 1 && next_child_tid < num_threads)
# else
                if (child+1 < branch_factor && next_child_tid < num_threads)
# endif // KMP_REVERSE_HYPER_BAR
                    KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
#endif /* KMP_CACHE_MANAGE */

#if KMP_BARRIER_ICV_PUSH
                if (propagate_icvs) // push my fixed ICVs to my child
                    copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH

                KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u)"
                              "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
                              __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
                              child_tid, &child_bar->b_go, child_bar->b_go,
                              child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                // Release child from barrier
                kmp_flag_64 flag(&child_bar->b_go, child_thr);
                flag.release();
            }
        }
    }
#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
    }
#endif
    KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// Hierarchical Barrier

// Initialize thread barrier data
/* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
   minimum amount of initialization required based on how the team has changed. Returns true if
   leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
   team size increases, threads already in the team will respond to on-core wakeup on their parent
   thread, but threads newly added to the team will only be listening on their local b_go. */
static bool
__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
                                       int gtid, int tid, kmp_team_t *team)
{
    // Checks to determine if (re-)initialization is needed
    bool uninitialized = thr_bar->team == NULL;
    bool team_changed = team != thr_bar->team;
    bool team_sz_changed = nproc != thr_bar->nproc;
    bool tid_changed = tid != thr_bar->old_tid;
    bool retval = false;

    if (uninitialized || team_sz_changed) {
        __kmp_get_hierarchy(nproc, thr_bar);
    }

    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->my_level = thr_bar->depth-1; // default for master
        thr_bar->parent_tid = -1; // default for master
        if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
            kmp_uint32 d=0;
            while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
                kmp_uint32 rem;
                if (d == thr_bar->depth-2) { // reached level right below the master
                    thr_bar->parent_tid = 0;
                    thr_bar->my_level = d;
                    break;
                }
                else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
                    // thread is not a subtree root at next level, so this is max
                    thr_bar->parent_tid = tid - rem;
                    thr_bar->my_level = d;
                    break;
                }
                ++d;
            }
        }
        thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
        thr_bar->old_tid = tid;
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
        thr_bar->team = team;
        thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
    }
    if (uninitialized || team_changed || tid_changed) {
        thr_bar->team = team;
        thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
        retval = true;
    }
    if (uninitialized || team_sz_changed || tid_changed) {
        thr_bar->nproc = nproc;
        thr_bar->leaf_kids = thr_bar->base_leaf_kids;
        if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
        if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
            thr_bar->leaf_kids = nproc - tid - 1;
        thr_bar->leaf_state = 0;
        for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
    }
    return retval;
}
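
// Layout note (derived from the init routine above and the gather/release below): a
// leaf child at tid = parent_tid + k signals by writing byte offset = 7 - (k - 1) of
// its parent's 64-bit flag, and the parent's leaf_state has bytes 7, 6, ... set for
// each of its leaf_kids.  So with 3 leaf kids the parent waits until bytes 7..5 of its
// b_arrived are set, and can later wake all three at once by OR-ing leaf_state into
// its b_go.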

static void
__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
                                  int gtid, int tid, void (*reduce) (void *, void *)
                                  USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
    register kmp_team_t *team = this_thr->th.th_team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc = this_thr->th.th_team_nproc;
    register kmp_info_t **other_threads = team->t.t_threads;
    register kmp_uint64 new_state;

    int level = team->t.t_level;
#if OMP_40_ENABLED
    if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
        if (this_thr->th.th_teams_size.nteams > 1)
            ++level; // level was not increased in teams construct for team_of_masters
#endif
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
    KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);

#if USE_ITT_BUILD && USE_ITT_NOTIFY
    // Barrier imbalance - save arrive time to the thread
    if (__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
        this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
    }
#endif

    (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);

    if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
        register kmp_int32 child_tid;
        new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
                kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
                KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) waiting for leaf kids\n",
                              gtid, team->t.t_id, tid));
                kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
                flag.wait(this_thr, FALSE
                          USE_ITT_BUILD_ARG(itt_sync_obj) );
                if (reduce) {
                    ANNOTATE_REDUCE_AFTER(reduce);
                    for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
                    }
                    ANNOTATE_REDUCE_BEFORE(reduce);
                    ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
                }
                (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
            }
            // Next, wait for higher level children on each child's b_arrived flag
            for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %llu\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        ANNOTATE_REDUCE_AFTER(reduce);
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                        ANNOTATE_REDUCE_BEFORE(reduce);
                        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
                    }
                }
            }
        }
        else { // Blocktime is not infinite
            for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
                kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = other_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
                                  "arrived(%p) == %llu\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
                    kmp_flag_64 flag(&child_bar->b_arrived, new_state);
                    flag.wait(this_thr, FALSE
                              USE_ITT_BUILD_ARG(itt_sync_obj) );
                    if (reduce) {
                        KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
                                       gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                       team->t.t_id, child_tid));
                        ANNOTATE_REDUCE_AFTER(reduce);
                        (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
                        ANNOTATE_REDUCE_BEFORE(reduce);
                        ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
                    }
                }
            }
        }
    }

    // All subordinates are gathered; now release parent if not master thread
    if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
                      "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
                      __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
                      &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
        /* Mark arrival to parent: After performing this write, a worker thread may not assume that
           the team is valid any more - it could be deallocated by the master thread at any time. */
        if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
            kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
            flag.release();
        }
        else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
            thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
            flag.set_waiter(other_threads[thr_bar->parent_tid]);
            flag.release();
        }
    } else { // Master thread needs to update the team's b_arrived value
        team->t.t_bar[bt].b_arrived = new_state;
        KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
                      gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
    }
    // Is the team access below unsafe or just technically invalid?
    KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

static void
__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
                                   int propagate_icvs
                                   USE_ITT_BUILD_ARG(void * itt_sync_obj) )
{
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
    register kmp_team_t *team;
    register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
    register kmp_uint32 nproc;
    bool team_change = false; // indicates on-core barrier shouldn't be used

    if (KMP_MASTER_TID(tid)) {
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
                      gtid, team->t.t_id, tid, bt));
    }
    else { // Worker threads
        // Wait for parent thread to release me
        if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
            || thr_bar->my_level != 0 || thr_bar->team == NULL) {
            // Use traditional method of waiting on my own b_go flag
            thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
            kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
            flag.wait(this_thr, TRUE
                      USE_ITT_BUILD_ARG(itt_sync_obj) );
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
        }
        else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
            // Wait on my "offset" bits on parent's b_go flag
            thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
            kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
                                 bt, this_thr
                                 USE_ITT_BUILD_ARG(itt_sync_obj) );
            flag.wait(this_thr, TRUE);
            if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
                TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            }
            else { // Reset my bits on parent's b_go flag
                ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
            }
        }
        thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
        // Early exit for reaping threads releasing forkjoin barrier
        if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
            return;
        // The worker thread may now assume that the team is valid.
        team = __kmp_threads[gtid]->th.th_team;
        KMP_DEBUG_ASSERT(team != NULL);
        tid = __kmp_tid_from_gtid(gtid);

        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
                      gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
        KMP_MB(); // Flush all pending memory write invalidates.
    }

    nproc = this_thr->th.th_team_nproc;
    int level = team->t.t_level;
#if OMP_40_ENABLED
    if (team->t.t_threads[0]->th.th_teams_microtask) { // are we inside the teams construct?
        if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
            ++level; // level was not increased in teams construct for team_of_workers
        if (this_thr->th.th_teams_size.nteams > 1)
            ++level; // level was not increased in teams construct for team_of_masters
    }
#endif
    if (level == 1) thr_bar->use_oncore_barrier = 1;
    else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested

    // If the team size has increased, we still communicate with old leaves via oncore barrier.
    unsigned short int old_leaf_kids = thr_bar->leaf_kids;
    kmp_uint64 old_leaf_state = thr_bar->leaf_state;
    team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
    // But if the entire team changes, we won't use oncore barrier at all
    if (team_change) old_leaf_kids = 0;

#if KMP_BARRIER_ICV_PUSH
    if (propagate_icvs) {
        __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
        if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
            copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
        }
        else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
            if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
                // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
            // non-leaves will get ICVs piggybacked with b_go via NGO store
        }
        else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
            if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
                copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
            else // leaves copy parent's fixed ICVs directly to local ICV store
                copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
                          &thr_bar->parent_bar->th_fixed_icvs);
        }
    }
#endif // KMP_BARRIER_ICV_PUSH

    // Now, release my children
    if (thr_bar->my_level) { // not a leaf
        register kmp_int32 child_tid;
        kmp_uint32 last;
        if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
            if (KMP_MASTER_TID(tid)) { // do a flat release
                // Set local b_go to bump children via NGO store of the cache line containing ICVs and b_go.
                thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
                // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
                ngo_load(&thr_bar->th_fixed_icvs);
                // This loops over all the threads skipping only the leaf nodes in the hierarchy
                for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
                    register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Use ngo store (if available) to both store ICVs and release child via child's b_go
                    ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
                }
                ngo_sync();
            }
            TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
            // Now, release leaf children
            if (thr_bar->leaf_kids) { // if there are any
                // We test team_change on the off-chance that the level 1 team changed.
                if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
                    if (old_leaf_kids) { // release old leaf kids
                        thr_bar->b_go |= old_leaf_state;
                    }
                    // Release new leaf kids
                    last = tid+thr_bar->skip_per_level[1];
                    if (last > nproc) last = nproc;
                    for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
                        register kmp_info_t *child_thr = team->t.t_threads[child_tid];
                        register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                        KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
                                      " T#%d(%d:%d) go(%p): %u => %u\n",
                                      gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                      team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                      child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                        // Release child using child's b_go flag
                        kmp_flag_64 flag(&child_bar->b_go, child_thr);
                        flag.release();
                    }
                }
                else { // Release all children at once with leaf_state bits on my own b_go flag
                    thr_bar->b_go |= thr_bar->leaf_state;
                }
            }
        }
        else { // Blocktime is not infinite; do a simple hierarchical release
            for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
                last = tid+thr_bar->skip_per_level[d+1];
                kmp_uint32 skip = thr_bar->skip_per_level[d];
                if (last > nproc) last = nproc;
                for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
                    register kmp_info_t *child_thr = team->t.t_threads[child_tid];
                    register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
                    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
                                  " go(%p): %u => %u\n",
                                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                                  team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
                                  child_bar->b_go + KMP_BARRIER_STATE_BUMP));
                    // Release child using child's b_go flag
                    kmp_flag_64 flag(&child_bar->b_go, child_thr);
                    flag.release();
                }
            }
        }
#if KMP_BARRIER_ICV_PUSH
        if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
            copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
#endif // KMP_BARRIER_ICV_PUSH
    }
    KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
                  gtid, team->t.t_id, tid, bt));
}

// ---------------------------- End of Barrier Algorithms ----------------------------

// Internal function to do a barrier.
/* If is_split is true, do a split barrier, otherwise, do a plain barrier
   If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier
   Returns 0 if master thread, 1 if worker thread. */
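// Dispatch note: the gather phase is selected per barrier type from
// __kmp_barrier_gather_pattern[bt] in the switch below; the default case falls back to
// the linear algorithm, and the hyper and tree cases assert that the configured branch
// bits are nonzero (branch bits of 0 would mean "use linear" instead).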
1066int
1067__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1068 void *reduce_data, void (*reduce)(void *, void *))
1069{
Jonathan Peyton11dc82f2016-05-05 16:15:57 +00001070 KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
Jonathan Peyton5375fe82016-11-14 21:13:44 +00001071 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001072 register int tid = __kmp_tid_from_gtid(gtid);
1073 register kmp_info_t *this_thr = __kmp_threads[gtid];
1074 register kmp_team_t *team = this_thr->th.th_team;
1075 register int status = 0;
1076 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001077#if OMPT_SUPPORT
1078 ompt_task_id_t my_task_id;
1079 ompt_parallel_id_t my_parallel_id;
1080#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001081
1082 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1083 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1084
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001085 ANNOTATE_NEW_BARRIER_BEGIN(&team->t.t_bar);
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001086#if OMPT_SUPPORT
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001087 if (ompt_enabled) {
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001088#if OMPT_BLAME
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001089 my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
1090 my_parallel_id = team->t.ompt_team_info.parallel_id;
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001091
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001092#if OMPT_TRACE
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001093 if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
1094 if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
1095 ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001096 my_parallel_id, my_task_id);
1097 }
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001098 }
1099#endif
1100 if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1101 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1102 my_parallel_id, my_task_id);
1103 }
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001104#endif
1105 // It is OK to report the barrier state after the barrier begin callback.
1106 // According to the OMPT specification, a compliant implementation may
1107 // even delay reporting this state until the barrier begins to wait.
1108 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001109 }
1110#endif
1111
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001112 if (! team->t.t_serialized) {
1113#if USE_ITT_BUILD
1114 // This value will be used in itt notify events below.
1115 void *itt_sync_obj = NULL;
1116# if USE_ITT_NOTIFY
1117 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1118 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1119# endif
1120#endif /* USE_ITT_BUILD */
1121 if (__kmp_tasking_mode == tskm_extra_barrier) {
1122 __kmp_tasking_barrier(team, this_thr, gtid);
1123 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1124 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1125 }
1126
1127 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1128 the team struct is not guaranteed to exist. */
1129 // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
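        // With KMP_USE_MONITOR the interval and bt_set are read from the implicit task's ICVs;
        // otherwise a single interval is derived via KMP_BLOCKTIME_INTERVAL().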
1130 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
Jonathan Peytone1c7c132016-10-07 18:12:19 +00001131#if KMP_USE_MONITOR
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001132 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1133 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
Jonathan Peyton2208a8512017-01-27 17:54:31 +00001134#else
1135 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL();
1136#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001137 }
1138
1139#if USE_ITT_BUILD
1140 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1141 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1142#endif /* USE_ITT_BUILD */
Jonathan Peyton8fbb49a2015-07-09 18:16:58 +00001143#if USE_DEBUGGER
1144 // Let the debugger know: the thread has arrived at the barrier and is waiting.
1145 if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1146 team->t.t_bar[bt].b_master_arrived += 1;
1147 } else {
1148 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1149 } // if
1150#endif /* USE_DEBUGGER */
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001151 if (reduce != NULL) {
1152 //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1153 this_thr->th.th_local.reduce_data = reduce_data;
1154 }
Jonathan Peytonb0b83c82015-11-09 16:28:32 +00001155
1156 if (KMP_MASTER_TID(tid) && __kmp_tasking_mode != tskm_immediate_exec)
1157 __kmp_task_team_setup(this_thr, team, 0); // use 0 to only setup the current team if nthreads > 1
1158
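        // Gather phase: the algorithm is selected per barrier type from __kmp_barrier_gather_pattern.
        // Hyper and tree gathers require non-zero branch bits; any other setting falls back to the linear gather.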
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001159 switch (__kmp_barrier_gather_pattern[bt]) {
1160 case bp_hyper_bar: {
1161 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1162 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1163 USE_ITT_BUILD_ARG(itt_sync_obj) );
1164 break;
1165 }
1166 case bp_hierarchical_bar: {
1167 __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1168 USE_ITT_BUILD_ARG(itt_sync_obj));
1169 break;
1170 }
1171 case bp_tree_bar: {
1172 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1173 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1174 USE_ITT_BUILD_ARG(itt_sync_obj) );
1175 break;
1176 }
1177 default: {
1178 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1179 USE_ITT_BUILD_ARG(itt_sync_obj) );
1180 }
1181 }
1182
1183 KMP_MB(); // Flush all pending memory write invalidates.
1184
1185 if (KMP_MASTER_TID(tid)) {
1186 status = 0;
1187 if (__kmp_tasking_mode != tskm_immediate_exec) {
1188 __kmp_task_team_wait(this_thr, team
1189 USE_ITT_BUILD_ARG(itt_sync_obj) );
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001190 }
Jonathan Peyton8fbb49a2015-07-09 18:16:58 +00001191#if USE_DEBUGGER
1192 // Let the debugger know: all threads have arrived and are starting to leave the barrier.
1193 team->t.t_bar[bt].b_team_arrived += 1;
1194#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001195
1196#if USE_ITT_BUILD
1197 /* TODO: In case of split reduction barrier, master thread may send acquired event early,
1198 before the final summation into the shared variable is done (final summation can be a
1199 long operation for array reductions). */
1200 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1201 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1202#endif /* USE_ITT_BUILD */
1203#if USE_ITT_BUILD && USE_ITT_NOTIFY
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001204 // Barrier - report frame end (only if active_level == 1)
1205 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1206#if OMP_40_ENABLED
1207 this_thr->th.th_teams_microtask == NULL &&
1208#endif
1209 team->t.t_active_level == 1)
1210 {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001211 kmp_uint64 cur_time = __itt_get_timestamp();
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001212 kmp_info_t **other_threads = team->t.t_threads;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001213 int nproc = this_thr->th.th_team_nproc;
1214 int i;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001215 switch(__kmp_forkjoin_frames_mode) {
1216 case 1:
1217 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1218 this_thr->th.th_frame_time = cur_time;
1219 break;
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001220 case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001221 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1222 break;
1223 case 3:
1224 if( __itt_metadata_add_ptr ) {
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001225 // Initialize with master's wait time
1226 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
Jonathan Peyton99ef4d02016-04-14 16:06:49 +00001227 // Set arrive time to zero to be able to check it in __kmp_invoke_task(); the same is done inside the loop below
1228 this_thr->th.th_bar_arrive_time = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001229 for (i=1; i<nproc; ++i) {
1230 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
Jonathan Peyton99ef4d02016-04-14 16:06:49 +00001231 other_threads[i]->th.th_bar_arrive_time = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001232 }
1233 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1234 }
1235 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1236 this_thr->th.th_frame_time = cur_time;
1237 break;
1238 }
1239 }
1240#endif /* USE_ITT_BUILD */
1241 } else {
1242 status = 1;
1243#if USE_ITT_BUILD
1244 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1245 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1246#endif /* USE_ITT_BUILD */
1247 }
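        // Release phase: workers (status == 1) always enter the release routine and wait there; the
        // master releases them immediately only for a non-split barrier. For a split barrier the
        // release is deferred to __kmp_end_split_barrier().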
1248 if (status == 1 || ! is_split) {
1249 switch (__kmp_barrier_release_pattern[bt]) {
1250 case bp_hyper_bar: {
1251 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1252 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1253 USE_ITT_BUILD_ARG(itt_sync_obj) );
1254 break;
1255 }
1256 case bp_hierarchical_bar: {
1257 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1258 USE_ITT_BUILD_ARG(itt_sync_obj) );
1259 break;
1260 }
1261 case bp_tree_bar: {
1262 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1263 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1264 USE_ITT_BUILD_ARG(itt_sync_obj) );
1265 break;
1266 }
1267 default: {
1268 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1269 USE_ITT_BUILD_ARG(itt_sync_obj) );
1270 }
1271 }
1272 if (__kmp_tasking_mode != tskm_immediate_exec) {
1273 __kmp_task_team_sync(this_thr, team);
1274 }
1275 }
1276
1277#if USE_ITT_BUILD
1278 /* GEH: TODO: Move this under if-condition above and also include in
1279 __kmp_end_split_barrier(). This will more accurately represent the actual release time
1280 of the threads for split barriers. */
1281 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1282 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1283#endif /* USE_ITT_BUILD */
1284 } else { // Team is serialized.
1285 status = 0;
1286 if (__kmp_tasking_mode != tskm_immediate_exec) {
Jonathan Peytondf6818b2016-06-14 17:57:47 +00001287#if OMP_45_ENABLED
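            // A serialized team can still own a task team when proxy tasks are in flight (OMP 4.5);
            // wait for those tasks to finish, then set the task team up again for the next region.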
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001288 if ( this_thr->th.th_task_team != NULL ) {
1289 void *itt_sync_obj = NULL;
1290#if USE_ITT_NOTIFY
1291 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1292 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1293 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1294 }
1295#endif
1296
Jonathan Peytonfe9a1d72015-08-26 19:58:48 +00001297 KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE);
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001298 __kmp_task_team_wait(this_thr, team
1299 USE_ITT_BUILD_ARG(itt_sync_obj));
Jonathan Peyton54127982015-11-04 21:37:48 +00001300 __kmp_task_team_setup(this_thr, team, 0);
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001301
1302#if USE_ITT_BUILD
1303 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1304 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1305#endif /* USE_ITT_BUILD */
1306 }
1307#else
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001308 // The task team should be NULL for serialized code (tasks will be executed immediately)
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001309 KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001310 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001311#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001312 }
1313 }
1314 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1315 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001316
1317#if OMPT_SUPPORT
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001318 if (ompt_enabled) {
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001319#if OMPT_BLAME
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001320 if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001321 ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1322 my_parallel_id, my_task_id);
1323 }
1324#endif
1325 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1326 }
1327#endif
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001328 ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001329
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001330 return status;
1331}
1332
1333
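// Complete a split barrier: the master runs the configured release pattern to free the workers
// still waiting in __kmp_barrier(), then re-synchronizes the task teams.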
1334void
1335__kmp_end_split_barrier(enum barrier_type bt, int gtid)
1336{
Jonathan Peyton5375fe82016-11-14 21:13:44 +00001337 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
1338 KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001339 int tid = __kmp_tid_from_gtid(gtid);
1340 kmp_info_t *this_thr = __kmp_threads[gtid];
1341 kmp_team_t *team = this_thr->th.th_team;
1342
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001343 ANNOTATE_NEW_BARRIER_BEGIN(&team->t.t_bar);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001344 if (!team->t.t_serialized) {
1345 if (KMP_MASTER_GTID(gtid)) {
1346 switch (__kmp_barrier_release_pattern[bt]) {
1347 case bp_hyper_bar: {
1348 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1349 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1350 USE_ITT_BUILD_ARG(NULL) );
1351 break;
1352 }
1353 case bp_hierarchical_bar: {
1354 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1355 USE_ITT_BUILD_ARG(NULL));
1356 break;
1357 }
1358 case bp_tree_bar: {
1359 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1360 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1361 USE_ITT_BUILD_ARG(NULL) );
1362 break;
1363 }
1364 default: {
1365 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1366 USE_ITT_BUILD_ARG(NULL) );
1367 }
1368 }
1369 if (__kmp_tasking_mode != tskm_immediate_exec) {
1370 __kmp_task_team_sync(this_thr, team);
1371 } // if
1372 }
1373 }
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001374 ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001375}
1376
1377
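// Join barrier at the end of a parallel region: all threads of the team gather here. Only the
// gather phase runs in this routine; workers are released from the fork barrier of the next
// parallel region (or at shutdown).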
1378void
1379__kmp_join_barrier(int gtid)
1380{
Jonathan Peyton5375fe82016-11-14 21:13:44 +00001381 KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
Jonathan Peyton11dc82f2016-05-05 16:15:57 +00001382 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001383 register kmp_info_t *this_thr = __kmp_threads[gtid];
1384 register kmp_team_t *team;
1385 register kmp_uint nproc;
1386 kmp_info_t *master_thread;
1387 int tid;
1388#ifdef KMP_DEBUG
1389 int team_id;
1390#endif /* KMP_DEBUG */
1391#if USE_ITT_BUILD
1392 void *itt_sync_obj = NULL;
1393# if USE_ITT_NOTIFY
1394 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1395 // Get object created at fork_barrier
1396 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1397# endif
1398#endif /* USE_ITT_BUILD */
1399 KMP_MB();
1400
1401 // Get current info
1402 team = this_thr->th.th_team;
1403 nproc = this_thr->th.th_team_nproc;
1404 KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1405 tid = __kmp_tid_from_gtid(gtid);
1406#ifdef KMP_DEBUG
1407 team_id = team->t.t_id;
1408#endif /* KMP_DEBUG */
1409 master_thread = this_thr->th.th_team_master;
1410#ifdef KMP_DEBUG
1411 if (master_thread != team->t.t_threads[0]) {
1412 __kmp_print_structure();
1413 }
1414#endif /* KMP_DEBUG */
1415 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1416 KMP_MB();
1417
1418 // Verify state
1419 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1420 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1421 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1422 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1423 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1424
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001425 ANNOTATE_NEW_BARRIER_BEGIN(&team->t.t_bar);
Jonathan Peyton61118492016-05-20 19:03:38 +00001426#if OMPT_SUPPORT
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001427#if OMPT_TRACE
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001428 if (ompt_enabled &&
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001429 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1430 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1431 team->t.ompt_team_info.parallel_id,
1432 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1433 }
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001434#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001435 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1436#endif
1437
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001438 if (__kmp_tasking_mode == tskm_extra_barrier) {
1439 __kmp_tasking_barrier(team, this_thr, gtid);
1440 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
1441 }
1442# ifdef KMP_DEBUG
1443 if (__kmp_tasking_mode != tskm_immediate_exec) {
1444 KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001445 __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001446 this_thr->th.th_task_team));
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001447 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001448 }
1449# endif /* KMP_DEBUG */
1450
1451 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
1452 team struct is not guaranteed to exist. Doing these loads causes a cache miss and slows
1453 down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1454 since the values are not used by __kmp_wait_template() in that case. */
1455 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
Jonathan Peytone1c7c132016-10-07 18:12:19 +00001456#if KMP_USE_MONITOR
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001457 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1458 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
Jonathan Peyton2208a8512017-01-27 17:54:31 +00001459#else
1460 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL();
1461#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001462 }
1463
1464#if USE_ITT_BUILD
1465 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1466 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1467#endif /* USE_ITT_BUILD */
1468
1469 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1470 case bp_hyper_bar: {
1471 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1472 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1473 USE_ITT_BUILD_ARG(itt_sync_obj) );
1474 break;
1475 }
1476 case bp_hierarchical_bar: {
1477 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1478 USE_ITT_BUILD_ARG(itt_sync_obj) );
1479 break;
1480 }
1481 case bp_tree_bar: {
1482 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1483 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1484 USE_ITT_BUILD_ARG(itt_sync_obj) );
1485 break;
1486 }
1487 default: {
1488 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1489 USE_ITT_BUILD_ARG(itt_sync_obj) );
1490 }
1491 }
1492
1493 /* From this point on, the team data structure may be deallocated at any time by the
1494 master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1495 data items that need to be referenced before the end of the barrier should be moved to
1496 the kmp_task_team_t structs. */
1497 if (KMP_MASTER_TID(tid)) {
1498 if (__kmp_tasking_mode != tskm_immediate_exec) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001499 __kmp_task_team_wait(this_thr, team
1500 USE_ITT_BUILD_ARG(itt_sync_obj) );
1501 }
Jonathan Peyton11dc82f2016-05-05 16:15:57 +00001502#if KMP_STATS_ENABLED
1503 // Have master thread flag the workers to indicate they are now waiting for
1504 // the next parallel region. Also wake them up so they switch their timers to idle.
1505 for (int i=0; i<team->t.t_nproc; ++i) {
1506 kmp_info_t* team_thread = team->t.t_threads[i];
1507 if (team_thread == this_thr)
1508 continue;
1509 team_thread->th.th_stats->setIdleFlag();
1510 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME && team_thread->th.th_sleep_loc != NULL)
1511 __kmp_null_resume_wrapper(__kmp_gtid_from_thread(team_thread), team_thread->th.th_sleep_loc);
1512 }
1513#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001514#if USE_ITT_BUILD
1515 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1516 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1517#endif /* USE_ITT_BUILD */
1518
1519# if USE_ITT_BUILD && USE_ITT_NOTIFY
1520 // Join barrier - report frame end
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001521 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1522#if OMP_40_ENABLED
1523 this_thr->th.th_teams_microtask == NULL &&
1524#endif
1525 team->t.t_active_level == 1)
1526 {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001527 kmp_uint64 cur_time = __itt_get_timestamp();
1528 ident_t * loc = team->t.t_ident;
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001529 kmp_info_t **other_threads = team->t.t_threads;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001530 int nproc = this_thr->th.th_team_nproc;
1531 int i;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001532 switch(__kmp_forkjoin_frames_mode) {
1533 case 1:
1534 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1535 break;
1536 case 2:
1537 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1538 break;
1539 case 3:
1540 if( __itt_metadata_add_ptr ) {
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001541 // Initialize with master's wait time
1542 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
Jonathan Peyton99ef4d02016-04-14 16:06:49 +00001543 // Set arrive time to zero to be able to check it in __kmp_invoke_task(); the same is done inside the loop below
1544 this_thr->th.th_bar_arrive_time = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001545 for (i=1; i<nproc; ++i) {
1546 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
Jonathan Peyton99ef4d02016-04-14 16:06:49 +00001547 other_threads[i]->th.th_bar_arrive_time = 0;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001548 }
1549 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1550 }
1551 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1552 this_thr->th.th_frame_time = cur_time;
1553 break;
1554 }
1555 }
1556# endif /* USE_ITT_BUILD */
1557 }
1558#if USE_ITT_BUILD
1559 else {
1560 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1561 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1562 }
1563#endif /* USE_ITT_BUILD */
1564
1565#if KMP_DEBUG
1566 if (KMP_MASTER_TID(tid)) {
1567 KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1568 gtid, team_id, tid, nproc));
1569 }
1570#endif /* KMP_DEBUG */
1571
1572 // TODO now, mark worker threads as done so they may be disbanded
1573 KMP_MB(); // Flush all pending memory write invalidates.
1574 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001575
1576#if OMPT_SUPPORT
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001577 if (ompt_enabled) {
Jonathan Peytoncab67cc2015-09-18 16:24:46 +00001578#if OMPT_BLAME
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001579 if (ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001580 ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1581 team->t.ompt_team_info.parallel_id,
1582 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
Jonathan Peytonb68a85d2015-09-21 18:11:22 +00001583 }
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001584#endif
1585
1586 // return to default state
1587 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1588 }
1589#endif
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001590 ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001591}
1592
1593
1594// TODO release worker threads' fork barriers as we are ready instead of all at once
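// Fork barrier at the start of a parallel region (also where idle workers wait between regions):
// the master sets up the task team and blocktime, then runs the release pattern; released workers
// pick up the new team pointer, synchronize task teams, and obtain their ICVs.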
1595void
1596__kmp_fork_barrier(int gtid, int tid)
1597{
Jonathan Peyton5375fe82016-11-14 21:13:44 +00001598 KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
Jonathan Peyton11dc82f2016-05-05 16:15:57 +00001599 KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001600 kmp_info_t *this_thr = __kmp_threads[gtid];
1601 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1602#if USE_ITT_BUILD
1603 void * itt_sync_obj = NULL;
1604#endif /* USE_ITT_BUILD */
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001605 if (team)
1606 ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001607
1608 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1609 gtid, (team != NULL) ? team->t.t_id : -1, tid));
1610
1611 // th_team pointer only valid for master thread here
1612 if (KMP_MASTER_TID(tid)) {
1613#if USE_ITT_BUILD && USE_ITT_NOTIFY
1614 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1615 // Create itt barrier object
1616 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1617 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1618 }
1619#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1620
1621#ifdef KMP_DEBUG
1622 register kmp_info_t **other_threads = team->t.t_threads;
1623 register int i;
1624
1625 // Verify state
1626 KMP_MB();
1627
1628 for(i=1; i<team->t.t_nproc; ++i) {
1629 KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1630 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1631 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1632 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1633 KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1634 & ~(KMP_BARRIER_SLEEP_STATE))
1635 == KMP_INIT_BARRIER_STATE);
1636 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1637 }
1638#endif
1639
1640 if (__kmp_tasking_mode != tskm_immediate_exec) {
Jonathan Peyton54127982015-11-04 21:37:48 +00001641 __kmp_task_team_setup(this_thr, team, 0); // 0 indicates setup current task team if nthreads > 1
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001642 }
1643
1644 /* The master thread may have changed its blocktime between the join barrier and the
1645 fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1646 access it when the team struct is not guaranteed to exist. */
1647 // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1648 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
Jonathan Peytone1c7c132016-10-07 18:12:19 +00001649#if KMP_USE_MONITOR
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001650 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1651 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
Jonathan Peyton2208a8512017-01-27 17:54:31 +00001652#else
1653 this_thr->th.th_team_bt_intervals = KMP_BLOCKTIME_INTERVAL();
1654#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001655 }
1656 } // master
1657
1658 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1659 case bp_hyper_bar: {
1660 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1661 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1662 USE_ITT_BUILD_ARG(itt_sync_obj) );
1663 break;
1664 }
1665 case bp_hierarchical_bar: {
1666 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1667 USE_ITT_BUILD_ARG(itt_sync_obj) );
1668 break;
1669 }
1670 case bp_tree_bar: {
1671 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1672 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1673 USE_ITT_BUILD_ARG(itt_sync_obj) );
1674 break;
1675 }
1676 default: {
1677 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1678 USE_ITT_BUILD_ARG(itt_sync_obj) );
1679 }
1680 }
1681
1682 // Early exit for reaping threads releasing forkjoin barrier
1683 if (TCR_4(__kmp_global.g.g_done)) {
Jonathan Peyton54127982015-11-04 21:37:48 +00001684 this_thr->th.th_task_team = NULL;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001685
1686#if USE_ITT_BUILD && USE_ITT_NOTIFY
1687 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1688 if (!KMP_MASTER_TID(tid)) {
1689 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1690 if (itt_sync_obj)
1691 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1692 }
1693 }
1694#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1695 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1696 return;
1697 }
1698
1699 /* We can now assume that a valid team structure has been allocated by the master and
1700 propagated to all worker threads. The current thread, however, may not be part of the
1701 team, so we can't blindly assume that the team pointer is non-null. */
1702 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1703 KMP_DEBUG_ASSERT(team != NULL);
1704 tid = __kmp_tid_from_gtid(gtid);
1705
1706
1707#if KMP_BARRIER_ICV_PULL
1708 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1709 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1710 this data before this function is called. We cannot modify __kmp_fork_call() to look at
1711 the fixed ICVs in the master's thread struct, because it is not always the case that the
1712 threads arrays have been allocated when __kmp_fork_call() is executed. */
Jonathan Peyton45be4502015-08-11 21:36:41 +00001713 {
Jonathan Peyton5375fe82016-11-14 21:13:44 +00001714 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
Jonathan Peyton45be4502015-08-11 21:36:41 +00001715 if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1716 // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1717 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1718 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1719 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1720 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1721 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001722 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001723#endif // KMP_BARRIER_ICV_PULL
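    // Under KMP_BARRIER_ICV_PUSH the workers' ICVs were already copied during the release above,
    // so no additional work is needed here.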
1724
1725 if (__kmp_tasking_mode != tskm_immediate_exec) {
1726 __kmp_task_team_sync(this_thr, team);
1727 }
1728
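    // Apply affinity for the new region: balanced affinity is recomputed when the team size changed;
    // otherwise, if an explicit proc_bind policy is active, the thread moves to its assigned place if needed.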
1729#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1730 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1731 if (proc_bind == proc_bind_intel) {
1732#endif
Andrey Churbanovf28f6132015-01-13 14:54:00 +00001733#if KMP_AFFINITY_SUPPORTED
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001734 // Call dynamic affinity settings
1735 if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1736 __kmp_balanced_affinity(tid, team->t.t_nproc);
1737 }
Andrey Churbanovf28f6132015-01-13 14:54:00 +00001738#endif // KMP_AFFINITY_SUPPORTED
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001739#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1740 }
Andrey Churbanov94e569e2015-03-10 09:19:47 +00001741 else if (proc_bind != proc_bind_false) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001742 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1743 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1744 __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1745 }
1746 else {
1747 __kmp_affinity_set_place(gtid);
1748 }
1749 }
1750#endif
1751
1752#if USE_ITT_BUILD && USE_ITT_NOTIFY
1753 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1754 if (!KMP_MASTER_TID(tid)) {
1755 // Get correct barrier object
1756 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1757 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1758 } // (prepare called inside barrier_release)
1759 }
1760#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
Jonas Hahnfeld50fed042016-11-07 15:58:36 +00001761 ANNOTATE_NEW_BARRIER_END(&team->t.t_bar);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001762 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1763}
1764
1765
1766void
1767__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1768{
Jonathan Peyton5375fe82016-11-14 21:13:44 +00001769 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001770
1771 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1772 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1773
1774 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1775 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1776 this data before this function is called. */
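    // Three propagation schemes follow: ICV_PULL (workers copy from the master's th_fixed_icvs during
    // the fork barrier), ICV_PUSH (ICVs travel down the barrier tree during the release), and the
    // default, in which the master copies the ICVs into every worker's implicit task right here.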
1777#if KMP_BARRIER_ICV_PULL
1778 /* Copy the ICVs into th_fixed_icvs in the master thread's structure (which remains untouched), where
1779 all of the worker threads can access them and make their own copies after the barrier. */
1780 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1781 copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1782 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1783 0, team->t.t_threads[0], team));
1784#elif KMP_BARRIER_ICV_PUSH
1785 // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1786 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1787 0, team->t.t_threads[0], team));
1788#else
1789 // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
1790 ngo_load(new_icvs);
1791 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
Jonathan Peyton91b78702015-06-08 19:39:07 +00001792 for (int f=1; f<new_nproc; ++f) { // Skip the master thread
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001793 // TODO: GEH - pass in better source location info since usually NULL here
1794 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1795 f, team->t.t_threads[f], team));
1796 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1797 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1798 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1799 f, team->t.t_threads[f], team));
1800 }
1801 ngo_sync();
1802#endif // KMP_BARRIER_ICV_PULL
1803}