1/*
2 * kmp_barrier.cpp
3 */
4
5
6//===----------------------------------------------------------------------===//
7//
8// The LLVM Compiler Infrastructure
9//
10// This file is dual licensed under the MIT and the University of Illinois Open
11// Source Licenses. See LICENSE.txt for details.
12//
13//===----------------------------------------------------------------------===//
14
15
16#include "kmp.h"
17#include "kmp_wait_release.h"
18#include "kmp_stats.h"
19#include "kmp_itt.h"
20
21#if KMP_MIC
22#include <immintrin.h>
23#define USE_NGO_STORES 1
24#endif // KMP_MIC
25
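// A short note on the block below: on KMP_MIC builds the ICV cache line is copied with
// 512-bit _mm512_storenrngo_pd stores (streaming, non-globally-ordered stores on Xeon Phi),
// and ngo_sync() issues a lock-prefixed add, which acts as a full memory fence for those
// weakly-ordered stores. On every other target the macros fall back to plain
// copy_icvs()/KMP_MEMCPY, and ngo_load()/ngo_sync() compile away to ((void)0).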
26#if KMP_MIC && USE_NGO_STORES
27// ICV copying
28#define ngo_load(src) __m512d Vt = _mm512_load_pd((void *)(src))
29#define ngo_store_icvs(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
30#define ngo_store_go(dst, src) _mm512_storenrngo_pd((void *)(dst), Vt)
31#define ngo_sync() __asm__ volatile ("lock; addl $0,0(%%rsp)" ::: "memory")
32#else
33#define ngo_load(src) ((void)0)
34#define ngo_store_icvs(dst, src) copy_icvs((dst), (src))
35#define ngo_store_go(dst, src) KMP_MEMCPY((dst), (src), CACHE_LINE)
36#define ngo_sync() ((void)0)
37#endif /* KMP_MIC && USE_NGO_STORES */
38
39void __kmp_print_structure(void); // Forward declaration
40
41// ---------------------------- Barrier Algorithms ----------------------------
42
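// In outline, every algorithm in this file has two halves: a gather phase, in which workers
// publish arrival on a 64-bit b_arrived flag that their parent (or the master) waits on, and
// a release phase, in which the master/parents wake workers through their b_go flags. The
// counters advance by KMP_BARRIER_STATE_BUMP per barrier so the low bits of the flag word
// stay available for status such as the sleep bit (see the "Don't have to worry about sleep
// bit" comments below); the kmp_flag_64/kmp_flag_oncore types from kmp_wait_release.h supply
// the wait()/release() primitives used throughout.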
43// Linear Barrier
44static void
45__kmp_linear_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
46 void (*reduce)(void *, void *)
47 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
48{
49 KMP_TIME_DEVELOPER_BLOCK(KMP_linear_gather);
50 register kmp_team_t *team = this_thr->th.th_team;
51 register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
52 register kmp_info_t **other_threads = team->t.t_threads;
53
54 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
55 gtid, team->t.t_id, tid, bt));
56 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
57
58#if USE_ITT_BUILD && USE_ITT_NOTIFY
59 // Barrier imbalance - save arrive time to the thread
60 if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
61 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
62 }
63#endif
64 // We now perform a linear reduction to signal that all of the threads have arrived.
65 if (!KMP_MASTER_TID(tid)) {
66 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
67 "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
68 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, &thr_bar->b_arrived,
69 thr_bar->b_arrived, thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
70 // Mark arrival to master thread
71 /* After performing this write, a worker thread may not assume that the team is valid
72 any more - it could be deallocated by the master thread at any time. */
73 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[0]);
74 flag.release();
75 } else {
76 register kmp_balign_team_t *team_bar = &team->t.t_bar[bt];
77 register int nproc = this_thr->th.th_team_nproc;
78 register int i;
79 // Don't have to worry about sleep bit here or atomic since team setting
80 register kmp_uint64 new_state = team_bar->b_arrived + KMP_BARRIER_STATE_BUMP;
81
82 // Collect all the worker team member threads.
83 for (i=1; i<nproc; ++i) {
84#if KMP_CACHE_MANAGE
85 // Prefetch next thread's arrived count
86 if (i+1 < nproc)
87 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_arrived);
88#endif /* KMP_CACHE_MANAGE */
89 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
90 "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
91 __kmp_gtid_from_tid(i, team), team->t.t_id, i,
92 &other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state));
93
94 // Wait for worker thread to arrive
95 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_arrived, new_state);
96 flag.wait(this_thr, FALSE
97 USE_ITT_BUILD_ARG(itt_sync_obj) );
98#if USE_ITT_BUILD && USE_ITT_NOTIFY
99 // Barrier imbalance - write min of the thread time and the other thread time to the thread.
100 if (__kmp_forkjoin_frames_mode == 2) {
101 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
102 other_threads[i]->th.th_bar_min_time);
103 }
104#endif
105 if (reduce) {
106 KA_TRACE(100, ("__kmp_linear_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n", gtid,
107 team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i));
108 (*reduce)(this_thr->th.th_local.reduce_data,
109 other_threads[i]->th.th_local.reduce_data);
110 }
111 }
112 // Don't have to worry about sleep bit here or atomic since team setting
113 team_bar->b_arrived = new_state;
114 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
115 gtid, team->t.t_id, tid, team->t.t_id, &team_bar->b_arrived, new_state));
116 }
117 KA_TRACE(20, ("__kmp_linear_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
118 gtid, team->t.t_id, tid, bt));
119}
120
121static void
122__kmp_linear_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
123 int propagate_icvs
124 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
125{
126 KMP_TIME_DEVELOPER_BLOCK(KMP_linear_release);
127 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
128 register kmp_team_t *team;
129
130 if (KMP_MASTER_TID(tid)) {
131 register unsigned int i;
132 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
133 register kmp_info_t **other_threads;
134
135 team = __kmp_threads[gtid]->th.th_team;
136 KMP_DEBUG_ASSERT(team != NULL);
137 other_threads = team->t.t_threads;
138
139 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
140 gtid, team->t.t_id, tid, bt));
141
142 if (nproc > 1) {
143#if KMP_BARRIER_ICV_PUSH
144 {
145 KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
146 if (propagate_icvs) {
147 ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
148 for (i=1; i<nproc; ++i) {
149 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[i], team, i, FALSE);
150 ngo_store_icvs(&team->t.t_implicit_task_taskdata[i].td_icvs,
151 &team->t.t_implicit_task_taskdata[0].td_icvs);
152 }
153 ngo_sync();
154 }
155 }
156#endif // KMP_BARRIER_ICV_PUSH
157
158 // Now, release all of the worker threads
159 for (i=1; i<nproc; ++i) {
160#if KMP_CACHE_MANAGE
161 // Prefetch next thread's go flag
162 if (i+1 < nproc)
163 KMP_CACHE_PREFETCH(&other_threads[i+1]->th.th_bar[bt].bb.b_go);
164#endif /* KMP_CACHE_MANAGE */
165 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d) "
166 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
167 other_threads[i]->th.th_info.ds.ds_gtid, team->t.t_id, i,
168 &other_threads[i]->th.th_bar[bt].bb.b_go,
169 other_threads[i]->th.th_bar[bt].bb.b_go,
170 other_threads[i]->th.th_bar[bt].bb.b_go + KMP_BARRIER_STATE_BUMP));
171 kmp_flag_64 flag(&other_threads[i]->th.th_bar[bt].bb.b_go, other_threads[i]);
172 flag.release();
173 }
174 }
175 } else { // Wait for the MASTER thread to release us
176 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d wait go(%p) == %u\n",
177 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
178 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
179 flag.wait(this_thr, TRUE
180 USE_ITT_BUILD_ARG(itt_sync_obj) );
181#if USE_ITT_BUILD && USE_ITT_NOTIFY
182 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
183 // In a fork barrier; cannot get the object reliably (or ITTNOTIFY is disabled)
184 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
185 // Cancel wait on previous parallel region...
186 __kmp_itt_task_starting(itt_sync_obj);
187
188 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
189 return;
190
191 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
192 if (itt_sync_obj != NULL)
193 // Call prepare as early as possible for "new" barrier
194 __kmp_itt_task_finished(itt_sync_obj);
195 } else
196#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
197 // Early exit for reaping threads releasing forkjoin barrier
198 if ( bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done) )
199 return;
200 // The worker thread may now assume that the team is valid.
201#ifdef KMP_DEBUG
202 tid = __kmp_tid_from_gtid(gtid);
203 team = __kmp_threads[gtid]->th.th_team;
204#endif
205 KMP_DEBUG_ASSERT(team != NULL);
206 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
207 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
208 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
209 KMP_MB(); // Flush all pending memory write invalidates.
210 }
211 KA_TRACE(20, ("__kmp_linear_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
212 gtid, team->t.t_id, tid, bt));
213}
214
215// Tree barrier
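// A sketch of the tid arithmetic used below, assuming branch_bits == 2 (branch_factor == 4):
// a thread's first child is (tid << 2) + 1 and its children are the following branch_factor
// consecutive tids that exist, so tid 0 gathers tids 1..4 and tid 1 gathers tids 5..8, while
// a worker finds its parent as (tid - 1) >> 2, e.g. tid 6 reports to (6-1)>>2 == 1.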
216static void
217__kmp_tree_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
218 void (*reduce)(void *, void *)
219 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
220{
221 KMP_TIME_DEVELOPER_BLOCK(KMP_tree_gather);
222 register kmp_team_t *team = this_thr->th.th_team;
223 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
224 register kmp_info_t **other_threads = team->t.t_threads;
225 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
226 register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
227 register kmp_uint32 branch_factor = 1 << branch_bits;
228 register kmp_uint32 child;
229 register kmp_uint32 child_tid;
230 register kmp_uint64 new_state;
231
232 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
233 gtid, team->t.t_id, tid, bt));
234 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
235
236#if USE_ITT_BUILD && USE_ITT_NOTIFY
237 // Barrier imbalance - save arrive time to the thread
238 if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
239 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
240 }
241#endif
242 // Perform tree gather to wait until all threads have arrived; reduce any required data as we go
243 child_tid = (tid << branch_bits) + 1;
244 if (child_tid < nproc) {
245 // Parent threads wait for all their children to arrive
246 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
247 child = 1;
248 do {
249 register kmp_info_t *child_thr = other_threads[child_tid];
250 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
251#if KMP_CACHE_MANAGE
252 // Prefetch next thread's arrived count
253 if (child+1 <= branch_factor && child_tid+1 < nproc)
254 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_arrived);
255#endif /* KMP_CACHE_MANAGE */
256 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
257 "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
258 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
259 &child_bar->b_arrived, new_state));
260 // Wait for child to arrive
261 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
262 flag.wait(this_thr, FALSE
263 USE_ITT_BUILD_ARG(itt_sync_obj) );
264#if USE_ITT_BUILD && USE_ITT_NOTIFY
265 // Barrier imbalance - write min of the thread time and a child time to the thread.
266 if (__kmp_forkjoin_frames_mode == 2) {
267 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
268 child_thr->th.th_bar_min_time);
269 }
270#endif
271 if (reduce) {
272 KA_TRACE(100, ("__kmp_tree_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
273 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
274 team->t.t_id, child_tid));
275 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
276 }
277 child++;
278 child_tid++;
279 }
280 while (child <= branch_factor && child_tid < nproc);
281 }
282
283 if (!KMP_MASTER_TID(tid)) { // Worker threads
284 register kmp_int32 parent_tid = (tid - 1) >> branch_bits;
285
286 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
287 "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
288 __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
289 &thr_bar->b_arrived, thr_bar->b_arrived,
290 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
291
292 // Mark arrival to parent thread
293 /* After performing this write, a worker thread may not assume that the team is valid
294 any more - it could be deallocated by the master thread at any time. */
295 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[parent_tid]);
296 flag.release();
297 } else {
298 // Need to update the team arrived pointer if we are the master thread
299 if (nproc > 1) // New value was already computed above
300 team->t.t_bar[bt].b_arrived = new_state;
301 else
302 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
303 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
304 gtid, team->t.t_id, tid, team->t.t_id,
305 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
306 }
307 KA_TRACE(20, ("__kmp_tree_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
308 gtid, team->t.t_id, tid, bt));
309}
310
311static void
312__kmp_tree_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
313 int propagate_icvs
314 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
315{
316 KMP_TIME_DEVELOPER_BLOCK(KMP_tree_release);
317 register kmp_team_t *team;
318 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
319 register kmp_uint32 nproc;
320 register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[bt];
321 register kmp_uint32 branch_factor = 1 << branch_bits;
322 register kmp_uint32 child;
323 register kmp_uint32 child_tid;
324
325 // Perform a tree release for all of the threads that have been gathered
326 if (!KMP_MASTER_TID(tid)) { // Handle fork barrier workers who aren't part of a team yet
327 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d wait go(%p) == %u\n",
328 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
329 // Wait for parent thread to release us
330 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
331 flag.wait(this_thr, TRUE
332 USE_ITT_BUILD_ARG(itt_sync_obj) );
333#if USE_ITT_BUILD && USE_ITT_NOTIFY
334 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
335 // In fork barrier where we could not get the object reliably (or ITTNOTIFY is disabled)
336 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
337 // Cancel wait on previous parallel region...
338 __kmp_itt_task_starting(itt_sync_obj);
339
340 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
341 return;
342
343 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
344 if (itt_sync_obj != NULL)
345 // Call prepare as early as possible for "new" barrier
346 __kmp_itt_task_finished(itt_sync_obj);
347 } else
348#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
349 // Early exit for reaping threads releasing forkjoin barrier
350 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
351 return;
352
353 // The worker thread may now assume that the team is valid.
354 team = __kmp_threads[gtid]->th.th_team;
355 KMP_DEBUG_ASSERT(team != NULL);
356 tid = __kmp_tid_from_gtid(gtid);
357
358 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
359 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
360 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
361 KMP_MB(); // Flush all pending memory write invalidates.
362 } else {
363 team = __kmp_threads[gtid]->th.th_team;
364 KMP_DEBUG_ASSERT(team != NULL);
365 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
366 gtid, team->t.t_id, tid, bt));
367 }
368 nproc = this_thr->th.th_team_nproc;
369 child_tid = (tid << branch_bits) + 1;
370
371 if (child_tid < nproc) {
372 register kmp_info_t **other_threads = team->t.t_threads;
373 child = 1;
374 // Parent threads release all their children
375 do {
376 register kmp_info_t *child_thr = other_threads[child_tid];
377 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
378#if KMP_CACHE_MANAGE
379 // Prefetch next thread's go count
380 if (child+1 <= branch_factor && child_tid+1 < nproc)
381 KMP_CACHE_PREFETCH(&other_threads[child_tid+1]->th.th_bar[bt].bb.b_go);
382#endif /* KMP_CACHE_MANAGE */
383
384#if KMP_BARRIER_ICV_PUSH
385 {
386 KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
387 if (propagate_icvs) {
388 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
389 team, child_tid, FALSE);
390 copy_icvs(&team->t.t_implicit_task_taskdata[child_tid].td_icvs,
391 &team->t.t_implicit_task_taskdata[0].td_icvs);
392 }
393 }
394#endif // KMP_BARRIER_ICV_PUSH
395 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
396 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
397 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
398 child_tid, &child_bar->b_go, child_bar->b_go,
399 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
400 // Release child from barrier
401 kmp_flag_64 flag(&child_bar->b_go, child_thr);
402 flag.release();
403 child++;
404 child_tid++;
405 }
406 while (child <= branch_factor && child_tid < nproc);
407 }
408 KA_TRACE(20, ("__kmp_tree_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
409 gtid, team->t.t_id, tid, bt));
410}
411
412
413// Hyper Barrier
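// A sketch of the hypercube indexing used below, assuming branch_bits == 1 (pairwise
// exchanges): the loop visits levels 0,1,2,... and checks the tid digit at each level. A
// thread with a nonzero digit signals the parent obtained by clearing its low
// (level + branch_bits) bits and leaves the loop; a thread with a zero digit waits for
// children at tid + 1*(1<<level), tid + 2*(1<<level), ... For example, with 8 threads,
// tid 5 (101b) signals tid 4 at level 0, while tid 4 (100b) waits for tid 5 at level 0,
// waits for tid 6 at level 1, and finally signals tid 0 at level 2.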
414static void
415__kmp_hyper_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
416 void (*reduce)(void *, void *)
417 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
418{
419 KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_gather);
420 register kmp_team_t *team = this_thr->th.th_team;
421 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
422 register kmp_info_t **other_threads = team->t.t_threads;
423 register kmp_uint64 new_state = KMP_BARRIER_UNUSED_STATE;
424 register kmp_uint32 num_threads = this_thr->th.th_team_nproc;
425 register kmp_uint32 branch_bits = __kmp_barrier_gather_branch_bits[bt];
426 register kmp_uint32 branch_factor = 1 << branch_bits;
427 register kmp_uint32 offset;
428 register kmp_uint32 level;
429
430 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
431 gtid, team->t.t_id, tid, bt));
432
433 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
434
435#if USE_ITT_BUILD && USE_ITT_NOTIFY
436 // Barrier imbalance - save arrive time to the thread
437 if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
438 this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp();
439 }
440#endif
441 /* Perform a hypercube-embedded tree gather to wait until all of the threads have
442 arrived, and reduce any required data as we go. */
443 kmp_flag_64 p_flag(&thr_bar->b_arrived);
444 for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
445 {
446 register kmp_uint32 child;
447 register kmp_uint32 child_tid;
448
449 if (((tid >> level) & (branch_factor - 1)) != 0) {
450 register kmp_int32 parent_tid = tid & ~((1 << (level + branch_bits)) -1);
451
452 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
453 "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
454 __kmp_gtid_from_tid(parent_tid, team), team->t.t_id, parent_tid,
455 &thr_bar->b_arrived, thr_bar->b_arrived,
456 thr_bar->b_arrived + KMP_BARRIER_STATE_BUMP));
457 // Mark arrival to parent thread
458 /* After performing this write (in the last iteration of the enclosing for loop),
459 a worker thread may not assume that the team is valid any more - it could be
460 deallocated by the master thread at any time. */
461 p_flag.set_waiter(other_threads[parent_tid]);
462 p_flag.release();
463 break;
464 }
465
466 // Parent threads wait for children to arrive
467 if (new_state == KMP_BARRIER_UNUSED_STATE)
468 new_state = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
469 for (child=1, child_tid=tid+(1 << level); child<branch_factor && child_tid<num_threads;
470 child++, child_tid+=(1 << level))
471 {
472 register kmp_info_t *child_thr = other_threads[child_tid];
473 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
474#if KMP_CACHE_MANAGE
475 register kmp_uint32 next_child_tid = child_tid + (1 << level);
476 // Prefetch next thread's arrived count
477 if (child+1 < branch_factor && next_child_tid < num_threads)
478 KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_arrived);
479#endif /* KMP_CACHE_MANAGE */
480 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%u) "
481 "arrived(%p) == %llu\n", gtid, team->t.t_id, tid,
482 __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid,
483 &child_bar->b_arrived, new_state));
484 // Wait for child to arrive
485 kmp_flag_64 c_flag(&child_bar->b_arrived, new_state);
486 c_flag.wait(this_thr, FALSE
487 USE_ITT_BUILD_ARG(itt_sync_obj) );
488#if USE_ITT_BUILD && USE_ITT_NOTIFY
489 // Barrier imbalance - write min of the thread time and a child time to the thread.
490 if (__kmp_forkjoin_frames_mode == 2) {
491 this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time,
492 child_thr->th.th_bar_min_time);
493 }
494#endif
495 if (reduce) {
496 KA_TRACE(100, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) += T#%d(%d:%u)\n",
497 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
498 team->t.t_id, child_tid));
499 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
500 }
501 }
502 }
503
504 if (KMP_MASTER_TID(tid)) {
505 // Need to update the team arrived pointer if we are the master thread
506 if (new_state == KMP_BARRIER_UNUSED_STATE)
507 team->t.t_bar[bt].b_arrived += KMP_BARRIER_STATE_BUMP;
508 else
509 team->t.t_bar[bt].b_arrived = new_state;
510 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
511 gtid, team->t.t_id, tid, team->t.t_id,
512 &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
513 }
514 KA_TRACE(20, ("__kmp_hyper_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
515 gtid, team->t.t_id, tid, bt));
516}
517
518// The reverse versions seem to beat the forward versions overall
519#define KMP_REVERSE_HYPER_BAR
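// With KMP_REVERSE_HYPER_BAR defined, the release below first climbs to the top of this
// thread's subtree and then walks back down one level at a time, releasing children from the
// highest tid to the lowest, i.e. the reverse of the gather order; the #else variant releases
// in the same order the gather used (lowest level first, ascending child tids).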
520static void
521__kmp_hyper_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
522 int propagate_icvs
523 USE_ITT_BUILD_ARG(void *itt_sync_obj) )
524{
525 KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_release);
526 register kmp_team_t *team;
527 register kmp_bstate_t *thr_bar = & this_thr -> th.th_bar[ bt ].bb;
528 register kmp_info_t **other_threads;
529 register kmp_uint32 num_threads;
530 register kmp_uint32 branch_bits = __kmp_barrier_release_branch_bits[ bt ];
531 register kmp_uint32 branch_factor = 1 << branch_bits;
532 register kmp_uint32 child;
533 register kmp_uint32 child_tid;
534 register kmp_uint32 offset;
535 register kmp_uint32 level;
536
537 /* Perform a hypercube-embedded tree release for all of the threads that have been gathered.
538 If KMP_REVERSE_HYPER_BAR is defined (default) the threads are released in the reverse
539 order of the corresponding gather, otherwise threads are released in the same order. */
540 if (KMP_MASTER_TID(tid)) { // master
541 team = __kmp_threads[gtid]->th.th_team;
542 KMP_DEBUG_ASSERT(team != NULL);
543 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) master enter for barrier type %d\n",
544 gtid, team->t.t_id, tid, bt));
545#if KMP_BARRIER_ICV_PUSH
546 if (propagate_icvs) { // master already has ICVs in final destination; copy
547 copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
548 }
549#endif
550 }
551 else { // Handle fork barrier workers who aren't part of a team yet
552 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d wait go(%p) == %u\n",
553 gtid, &thr_bar->b_go, KMP_BARRIER_STATE_BUMP));
554 // Wait for parent thread to release us
555 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
556 flag.wait(this_thr, TRUE
557 USE_ITT_BUILD_ARG(itt_sync_obj) );
558#if USE_ITT_BUILD && USE_ITT_NOTIFY
559 if ((__itt_sync_create_ptr && itt_sync_obj == NULL) || KMP_ITT_DEBUG) {
560 // In fork barrier where we could not get the object reliably
561 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 0, -1);
562 // Cancel wait on previous parallel region...
563 __kmp_itt_task_starting(itt_sync_obj);
564
565 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
566 return;
567
568 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
569 if (itt_sync_obj != NULL)
570 // Call prepare as early as possible for "new" barrier
571 __kmp_itt_task_finished(itt_sync_obj);
572 } else
573#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
574 // Early exit for reaping threads releasing forkjoin barrier
575 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
576 return;
577
578 // The worker thread may now assume that the team is valid.
579 team = __kmp_threads[gtid]->th.th_team;
580 KMP_DEBUG_ASSERT(team != NULL);
581 tid = __kmp_tid_from_gtid(gtid);
582
583 TCW_4(thr_bar->b_go, KMP_INIT_BARRIER_STATE);
584 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
585 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
586 KMP_MB(); // Flush all pending memory write invalidates.
587 }
588 num_threads = this_thr->th.th_team_nproc;
589 other_threads = team->t.t_threads;
590
591#ifdef KMP_REVERSE_HYPER_BAR
592 // Count up to correct level for parent
593 for (level=0, offset=1; offset<num_threads && (((tid>>level) & (branch_factor-1)) == 0);
594 level+=branch_bits, offset<<=branch_bits);
595
596 // Now go down from there
597 for (level-=branch_bits, offset>>=branch_bits; offset != 0;
598 level-=branch_bits, offset>>=branch_bits)
599#else
600 // Go down the tree, level by level
601 for (level=0, offset=1; offset<num_threads; level+=branch_bits, offset<<=branch_bits)
602#endif // KMP_REVERSE_HYPER_BAR
603 {
604#ifdef KMP_REVERSE_HYPER_BAR
605 /* Now go in reverse order through the children, highest to lowest.
606 Initial setting of child is conservative here. */
607 child = num_threads >> ((level==0)?level:level-1);
608 for (child=(child<branch_factor-1) ? child : branch_factor-1, child_tid=tid+(child<<level);
609 child>=1; child--, child_tid-=(1<<level))
610#else
611 if (((tid >> level) & (branch_factor - 1)) != 0)
612 // No need to go lower than this, since this is the level parent would be notified
613 break;
614 // Iterate through children on this level of the tree
615 for (child=1, child_tid=tid+(1<<level); child<branch_factor && child_tid<num_threads;
616 child++, child_tid+=(1<<level))
617#endif // KMP_REVERSE_HYPER_BAR
618 {
619 if (child_tid >= num_threads) continue; // Child doesn't exist so keep going
620 else {
621 register kmp_info_t *child_thr = other_threads[child_tid];
622 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
623#if KMP_CACHE_MANAGE
624 register kmp_uint32 next_child_tid = child_tid - (1 << level);
625 // Prefetch next thread's go count
626# ifdef KMP_REVERSE_HYPER_BAR
627 if (child-1 >= 1 && next_child_tid < num_threads)
628# else
629 if (child+1 < branch_factor && next_child_tid < num_threads)
630# endif // KMP_REVERSE_HYPER_BAR
631 KMP_CACHE_PREFETCH(&other_threads[next_child_tid]->th.th_bar[bt].bb.b_go);
632#endif /* KMP_CACHE_MANAGE */
633
634#if KMP_BARRIER_ICV_PUSH
635 if (propagate_icvs) // push my fixed ICVs to my child
636 copy_icvs(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
637#endif // KMP_BARRIER_ICV_PUSH
638
639 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%u) "
640 "go(%p): %u => %u\n", gtid, team->t.t_id, tid,
641 __kmp_gtid_from_tid(child_tid, team), team->t.t_id,
642 child_tid, &child_bar->b_go, child_bar->b_go,
643 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
644 // Release child from barrier
645 kmp_flag_64 flag(&child_bar->b_go, child_thr);
646 flag.release();
647 }
648 }
649 }
650#if KMP_BARRIER_ICV_PUSH
651 if (propagate_icvs && !KMP_MASTER_TID(tid)) { // copy ICVs locally to final dest
652 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
653 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
654 }
655#endif
656 KA_TRACE(20, ("__kmp_hyper_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
657 gtid, team->t.t_id, tid, bt));
658}
659
660// Hierarchical Barrier
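// In outline: this barrier arranges the team by machine topology. The per-thread hierarchy
// data (depth, skip_per_level, base leaf-kid counts) comes from __kmp_get_hierarchy, and
// __kmp_init_hierarchical_barrier_thread below derives my_level, parent_tid, leaf_kids and
// the leaf_state mask from it. When blocktime is infinite and the barrier is not nested
// (use_oncore_barrier), up to 8 leaf children check in and are released through single bytes
// of their parent's 64-bit b_arrived/b_go words; otherwise the usual per-thread flags are used.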
661
662// Initialize thread barrier data
663/* Initializes/re-initializes the hierarchical barrier data stored on a thread. Performs the
664 minimum amount of initialization required based on how the team has changed. Returns true if
665 leaf children will require both on-core and traditional wake-up mechanisms. For example, if the
666 team size increases, threads already in the team will respond to on-core wakeup on their parent
667 thread, but threads newly added to the team will only be listening on their local b_go. */
668static bool
669__kmp_init_hierarchical_barrier_thread(enum barrier_type bt, kmp_bstate_t *thr_bar, kmp_uint32 nproc,
670 int gtid, int tid, kmp_team_t *team)
671{
672 // Checks to determine if (re-)initialization is needed
673 bool uninitialized = thr_bar->team == NULL;
674 bool team_changed = team != thr_bar->team;
675 bool team_sz_changed = nproc != thr_bar->nproc;
676 bool tid_changed = tid != thr_bar->old_tid;
677 bool retval = false;
678
679 if (uninitialized || team_sz_changed) {
680 __kmp_get_hierarchy(nproc, thr_bar);
681 }
682
683 if (uninitialized || team_sz_changed || tid_changed) {
684 thr_bar->my_level = thr_bar->depth-1; // default for master
685 thr_bar->parent_tid = -1; // default for master
686 if (!KMP_MASTER_TID(tid)) { // if not master, find parent thread in hierarchy
687 kmp_uint32 d=0;
688 while (d<thr_bar->depth) { // find parent based on level of thread in hierarchy, and note level
689 kmp_uint32 rem;
690 if (d == thr_bar->depth-2) { // reached level right below the master
691 thr_bar->parent_tid = 0;
692 thr_bar->my_level = d;
693 break;
694 }
695 else if ((rem = tid%thr_bar->skip_per_level[d+1]) != 0) { // TODO: can we make this op faster?
696 // thread is not a subtree root at next level, so this is max
697 thr_bar->parent_tid = tid - rem;
698 thr_bar->my_level = d;
699 break;
700 }
701 ++d;
702 }
703 }
704 thr_bar->offset = 7-(tid-thr_bar->parent_tid-1);
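// For example, the first on-core child (tid == parent_tid+1) gets offset 7 and the eighth
// gets offset 0, matching the leaf_state initialization below that sets byte (7-i) for each
// of the leaf_kids children: each child owns one byte of the parent's 64-bit flag words.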
705 thr_bar->old_tid = tid;
706 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
707 }
708 if (uninitialized || team_changed || tid_changed) {
709 thr_bar->team = team;
710 thr_bar->parent_bar = &team->t.t_threads[thr_bar->parent_tid]->th.th_bar[bt].bb;
711 retval = true;
712 }
713 if (uninitialized || team_sz_changed || tid_changed) {
714 thr_bar->nproc = nproc;
715 thr_bar->leaf_kids = thr_bar->base_leaf_kids;
716 if (thr_bar->my_level == 0) thr_bar->leaf_kids=0;
717 if (thr_bar->leaf_kids && (kmp_uint32)tid+thr_bar->leaf_kids+1 > nproc)
718 thr_bar->leaf_kids = nproc - tid - 1;
719 thr_bar->leaf_state = 0;
720 for (int i=0; i<thr_bar->leaf_kids; ++i) ((char *)&(thr_bar->leaf_state))[7-i] = 1;
721 }
722 return retval;
723}
724
725static void
726__kmp_hierarchical_barrier_gather(enum barrier_type bt, kmp_info_t *this_thr,
727 int gtid, int tid, void (*reduce) (void *, void *)
728 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
729{
730 KMP_TIME_DEVELOPER_BLOCK(KMP_hier_gather);
731 register kmp_team_t *team = this_thr->th.th_team;
732 register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
733 register kmp_uint32 nproc = this_thr->th.th_team_nproc;
734 register kmp_info_t **other_threads = team->t.t_threads;
735 register kmp_uint64 new_state;
736
737 int level = team->t.t_level;
738#if OMP_40_ENABLED
739 if (other_threads[0]->th.th_teams_microtask) // are we inside the teams construct?
740 if (this_thr->th.th_teams_size.nteams > 1)
741 ++level; // level was not increased in teams construct for team_of_masters
742#endif
743 if (level == 1) thr_bar->use_oncore_barrier = 1;
744 else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
745
746 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) enter for barrier type %d\n",
747 gtid, team->t.t_id, tid, bt));
748 KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]);
749
750#if USE_ITT_BUILD && USE_ITT_NOTIFY
751 // Barrier imbalance - save arrive time to the thread
752 if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) {
753 this_thr->th.th_bar_arrive_time = __itt_get_timestamp();
754 }
755#endif
756
757 (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
758
759 if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf)
760 register kmp_int32 child_tid;
761 new_state = (kmp_uint64)team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
762 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
763 if (thr_bar->leaf_kids) { // First, wait for leaf children to check-in on my b_arrived flag
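// The leaves check in on this thread's own b_arrived word rather than on their own flags:
// each leaf sets the byte selected by its offset (see the kmp_flag_oncore release in the
// non-master path below), so one wait for (b_arrived | leaf_state) covers all of them, and
// the KMP_TEST_THEN_AND64 below clears those bytes again for the next barrier.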
764 kmp_uint64 leaf_state = KMP_MASTER_TID(tid) ? thr_bar->b_arrived | thr_bar->leaf_state : team->t.t_bar[bt].b_arrived | thr_bar->leaf_state;
765 kmp_flag_64 flag(&thr_bar->b_arrived, leaf_state);
766 flag.wait(this_thr, FALSE
767 USE_ITT_BUILD_ARG(itt_sync_obj) );
768 if (reduce) {
769 for (child_tid=tid+1; child_tid<=tid+thr_bar->leaf_kids; ++child_tid) {
770 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
771 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
772 team->t.t_id, child_tid));
773 (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data);
774 }
775 }
776 (void) KMP_TEST_THEN_AND64((volatile kmp_int64 *)&thr_bar->b_arrived, ~(thr_bar->leaf_state)); // clear leaf_state bits
777 }
778 // Next, wait for higher level children on each child's b_arrived flag
779 for (kmp_uint32 d=1; d<thr_bar->my_level; ++d) { // gather lowest level threads first, but skip 0
780 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
781 if (last > nproc) last = nproc;
782 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
783 register kmp_info_t *child_thr = other_threads[child_tid];
784 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
785 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
786 "arrived(%p) == %llu\n",
787 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
788 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
789 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
790 flag.wait(this_thr, FALSE
791 USE_ITT_BUILD_ARG(itt_sync_obj) );
792 if (reduce) {
793 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
794 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
795 team->t.t_id, child_tid));
796 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
797 }
798 }
799 }
800 }
801 else { // Blocktime is not infinite
802 for (kmp_uint32 d=0; d<thr_bar->my_level; ++d) { // Gather lowest level threads first
803 kmp_uint32 last = tid+thr_bar->skip_per_level[d+1], skip = thr_bar->skip_per_level[d];
804 if (last > nproc) last = nproc;
805 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
806 register kmp_info_t *child_thr = other_threads[child_tid];
807 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
808 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) wait T#%d(%d:%d) "
809 "arrived(%p) == %llu\n",
810 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
811 team->t.t_id, child_tid, &child_bar->b_arrived, new_state));
812 kmp_flag_64 flag(&child_bar->b_arrived, new_state);
813 flag.wait(this_thr, FALSE
814 USE_ITT_BUILD_ARG(itt_sync_obj) );
815 if (reduce) {
816 KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += T#%d(%d:%d)\n",
817 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
818 team->t.t_id, child_tid));
819 (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data);
820 }
821 }
822 }
823 }
824 }
825 // All subordinates are gathered; now release parent if not master thread
826
827 if (!KMP_MASTER_TID(tid)) { // worker threads release parent in hierarchy
828 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) releasing T#%d(%d:%d) "
829 "arrived(%p): %llu => %llu\n", gtid, team->t.t_id, tid,
830 __kmp_gtid_from_tid(thr_bar->parent_tid, team), team->t.t_id, thr_bar->parent_tid,
831 &thr_bar->b_arrived, thr_bar->b_arrived, thr_bar->b_arrived+KMP_BARRIER_STATE_BUMP));
832 /* Mark arrival to parent: After performing this write, a worker thread may not assume that
833 the team is valid any more - it could be deallocated by the master thread at any time. */
834 if (thr_bar->my_level || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
835 || !thr_bar->use_oncore_barrier) { // Parent is waiting on my b_arrived flag; release it
836 kmp_flag_64 flag(&thr_bar->b_arrived, other_threads[thr_bar->parent_tid]);
837 flag.release();
838 }
839 else { // Leaf does special release on the "offset" bits of parent's b_arrived flag
840 thr_bar->b_arrived = team->t.t_bar[bt].b_arrived + KMP_BARRIER_STATE_BUMP;
841 kmp_flag_oncore flag(&thr_bar->parent_bar->b_arrived, thr_bar->offset);
842 flag.set_waiter(other_threads[thr_bar->parent_tid]);
843 flag.release();
844 }
845 } else { // Master thread needs to update the team's b_arrived value
846 team->t.t_bar[bt].b_arrived = new_state;
847 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) set team %d arrived(%p) = %llu\n",
848 gtid, team->t.t_id, tid, team->t.t_id, &team->t.t_bar[bt].b_arrived, team->t.t_bar[bt].b_arrived));
849 }
850 // Is the team access below unsafe or just technically invalid?
851 KA_TRACE(20, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) exit for barrier type %d\n",
852 gtid, team->t.t_id, tid, bt));
853}
854
855static void
856__kmp_hierarchical_barrier_release(enum barrier_type bt, kmp_info_t *this_thr, int gtid, int tid,
857 int propagate_icvs
858 USE_ITT_BUILD_ARG(void * itt_sync_obj) )
859{
860 KMP_TIME_DEVELOPER_BLOCK(KMP_hier_release);
861 register kmp_team_t *team;
862 register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
863 register kmp_uint32 nproc;
864 bool team_change = false; // indicates on-core barrier shouldn't be used
865
866 if (KMP_MASTER_TID(tid)) {
867 team = __kmp_threads[gtid]->th.th_team;
868 KMP_DEBUG_ASSERT(team != NULL);
869 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) master entered barrier type %d\n",
870 gtid, team->t.t_id, tid, bt));
871 }
872 else { // Worker threads
873 // Wait for parent thread to release me
874 if (!thr_bar->use_oncore_barrier || __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME
875 || thr_bar->my_level != 0 || thr_bar->team == NULL) {
876 // Use traditional method of waiting on my own b_go flag
877 thr_bar->wait_flag = KMP_BARRIER_OWN_FLAG;
878 kmp_flag_64 flag(&thr_bar->b_go, KMP_BARRIER_STATE_BUMP);
879 flag.wait(this_thr, TRUE
880 USE_ITT_BUILD_ARG(itt_sync_obj) );
881 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
882 }
883 else { // Thread barrier data is initialized, this is a leaf, blocktime is infinite, not nested
884 // Wait on my "offset" bits on parent's b_go flag
885 thr_bar->wait_flag = KMP_BARRIER_PARENT_FLAG;
886 kmp_flag_oncore flag(&thr_bar->parent_bar->b_go, KMP_BARRIER_STATE_BUMP, thr_bar->offset,
887 bt, this_thr
888 USE_ITT_BUILD_ARG(itt_sync_obj) );
889 flag.wait(this_thr, TRUE
890 USE_ITT_BUILD_ARG(itt_sync_obj) );
891 if (thr_bar->wait_flag == KMP_BARRIER_SWITCHING) { // Thread was switched to own b_go
892 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
893 }
894 else { // Reset my bits on parent's b_go flag
895 ((char*)&(thr_bar->parent_bar->b_go))[thr_bar->offset] = 0;
896 }
897 }
898 thr_bar->wait_flag = KMP_BARRIER_NOT_WAITING;
899 // Early exit for reaping threads releasing forkjoin barrier
900 if (bt == bs_forkjoin_barrier && TCR_4(__kmp_global.g.g_done))
901 return;
902 // The worker thread may now assume that the team is valid.
903 team = __kmp_threads[gtid]->th.th_team;
904 KMP_DEBUG_ASSERT(team != NULL);
905 tid = __kmp_tid_from_gtid(gtid);
906
907 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) set go(%p) = %u\n",
908 gtid, team->t.t_id, tid, &thr_bar->b_go, KMP_INIT_BARRIER_STATE));
909 KMP_MB(); // Flush all pending memory write invalidates.
910 }
911
912 int level = team->t.t_level;
913#if OMP_40_ENABLED
914 if (team->t.t_threads[0]->th.th_teams_microtask ) { // are we inside the teams construct?
915 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && this_thr->th.th_teams_level == level)
916 ++level; // level was not increased in teams construct for team_of_workers
917 if( this_thr->th.th_teams_size.nteams > 1 )
918 ++level; // level was not increased in teams construct for team_of_masters
919 }
920#endif
921 if (level == 1) thr_bar->use_oncore_barrier = 1;
922 else thr_bar->use_oncore_barrier = 0; // Do not use oncore barrier when nested
923 nproc = this_thr->th.th_team_nproc;
924
925 // If the team size has increased, we still communicate with old leaves via oncore barrier.
926 unsigned short int old_leaf_kids = thr_bar->leaf_kids;
927 kmp_uint64 old_leaf_state = thr_bar->leaf_state;
928 team_change = __kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team);
929 // But if the entire team changes, we won't use oncore barrier at all
930 if (team_change) old_leaf_kids = 0;
931
932#if KMP_BARRIER_ICV_PUSH
933 if (propagate_icvs) {
934 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
935 if (KMP_MASTER_TID(tid)) { // master already has copy in final destination; copy
936 copy_icvs(&thr_bar->th_fixed_icvs, &team->t.t_implicit_task_taskdata[tid].td_icvs);
937 }
938 else if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) { // optimization for inf blocktime
939 if (!thr_bar->my_level) // I'm a leaf in the hierarchy (my_level==0)
940 // leaves (on-core children) pull parent's fixed ICVs directly to local ICV store
941 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
942 &thr_bar->parent_bar->th_fixed_icvs);
943 // non-leaves will get ICVs piggybacked with b_go via NGO store
944 }
945 else { // blocktime is not infinite; pull ICVs from parent's fixed ICVs
946 if (thr_bar->my_level) // not a leaf; copy ICVs to my fixed ICVs child can access
947 copy_icvs(&thr_bar->th_fixed_icvs, &thr_bar->parent_bar->th_fixed_icvs);
948 else // leaves copy parent's fixed ICVs directly to local ICV store
949 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
950 &thr_bar->parent_bar->th_fixed_icvs);
951 }
952 }
953#endif // KMP_BARRIER_ICV_PUSH
954
955 // Now, release my children
956 if (thr_bar->my_level) { // not a leaf
957 register kmp_int32 child_tid;
958 kmp_uint32 last;
959 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME && thr_bar->use_oncore_barrier) {
960 if (KMP_MASTER_TID(tid)) { // do a flat release
961 // Set local b_go to bump children via NGO store of the cache line containing IVCs and b_go.
962 thr_bar->b_go = KMP_BARRIER_STATE_BUMP;
963 // Use ngo stores if available; b_go piggybacks in the last 8 bytes of the cache line
964 ngo_load(&thr_bar->th_fixed_icvs);
965 // This loops over all the threads skipping only the leaf nodes in the hierarchy
966 for (child_tid=thr_bar->skip_per_level[1]; child_tid<(int)nproc; child_tid+=thr_bar->skip_per_level[1]) {
967 register kmp_bstate_t *child_bar = &team->t.t_threads[child_tid]->th.th_bar[bt].bb;
968 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
969 " go(%p): %u => %u\n",
970 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
971 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
972 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
973 // Use ngo store (if available) to both store ICVs and release child via child's b_go
974 ngo_store_go(&child_bar->th_fixed_icvs, &thr_bar->th_fixed_icvs);
975 }
976 ngo_sync();
977 }
978 TCW_8(thr_bar->b_go, KMP_INIT_BARRIER_STATE); // Reset my b_go flag for next time
979 // Now, release leaf children
980 if (thr_bar->leaf_kids) { // if there are any
981 // We test team_change on the off-chance that the level 1 team changed.
982 if (team_change || old_leaf_kids < thr_bar->leaf_kids) { // some old leaf_kids, some new
983 if (old_leaf_kids) { // release old leaf kids
984 thr_bar->b_go |= old_leaf_state;
985 }
986 // Release new leaf kids
987 last = tid+thr_bar->skip_per_level[1];
988 if (last > nproc) last = nproc;
989 for (child_tid=tid+1+old_leaf_kids; child_tid<(int)last; ++child_tid) { // skip_per_level[0]=1
990 register kmp_info_t *child_thr = team->t.t_threads[child_tid];
991 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
992 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing"
993 " T#%d(%d:%d) go(%p): %u => %u\n",
994 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
995 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
996 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
997 // Release child using child's b_go flag
998 kmp_flag_64 flag(&child_bar->b_go, child_thr);
999 flag.release();
1000 }
1001 }
1002 else { // Release all children at once with leaf_state bits on my own b_go flag
1003 thr_bar->b_go |= thr_bar->leaf_state;
1004 }
1005 }
1006 }
1007 else { // Blocktime is not infinite; do a simple hierarchical release
1008 for (int d=thr_bar->my_level-1; d>=0; --d) { // Release highest level threads first
1009 last = tid+thr_bar->skip_per_level[d+1];
1010 kmp_uint32 skip = thr_bar->skip_per_level[d];
1011 if (last > nproc) last = nproc;
1012 for (child_tid=tid+skip; child_tid<(int)last; child_tid+=skip) {
1013 register kmp_info_t *child_thr = team->t.t_threads[child_tid];
1014 register kmp_bstate_t *child_bar = &child_thr->th.th_bar[bt].bb;
1015 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) releasing T#%d(%d:%d)"
1016 " go(%p): %u => %u\n",
1017 gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
1018 team->t.t_id, child_tid, &child_bar->b_go, child_bar->b_go,
1019 child_bar->b_go + KMP_BARRIER_STATE_BUMP));
1020 // Release child using child's b_go flag
1021 kmp_flag_64 flag(&child_bar->b_go, child_thr);
1022 flag.release();
1023 }
1024 }
1025 }
1026#if KMP_BARRIER_ICV_PUSH
1027 if (propagate_icvs && !KMP_MASTER_TID(tid)) // non-leaves copy ICVs from fixed ICVs to local dest
1028 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs, &thr_bar->th_fixed_icvs);
1029#endif // KMP_BARRIER_ICV_PUSH
1030 }
1031 KA_TRACE(20, ("__kmp_hierarchical_barrier_release: T#%d(%d:%d) exit for barrier type %d\n",
1032 gtid, team->t.t_id, tid, bt));
1033}
1034
1035// ---------------------------- End of Barrier Algorithms ----------------------------
1036
1037// Internal function to do a barrier.
1038/* If is_split is true, do a split barrier, otherwise, do a plain barrier
1039 If reduce is non-NULL, do a split reduction barrier, otherwise, do a split barrier
1040 Returns 0 if master thread, 1 if worker thread. */
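// Illustrative only: a plain, non-split barrier with no reduction amounts to a call such as
// __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); passing a non-NULL
// reduce/reduce_data instead makes the gather phase fold each worker's contribution in via
// (*reduce)(), i.e. the split reduction barrier described above.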
1041int
1042__kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
1043 void *reduce_data, void (*reduce)(void *, void *))
1044{
1045 KMP_TIME_DEVELOPER_BLOCK(KMP_barrier);
1046 register int tid = __kmp_tid_from_gtid(gtid);
1047 register kmp_info_t *this_thr = __kmp_threads[gtid];
1048 register kmp_team_t *team = this_thr->th.th_team;
1049 register int status = 0;
1050 ident_t *loc = __kmp_threads[gtid]->th.th_ident;
1051#if OMPT_SUPPORT
1052 ompt_task_id_t my_task_id;
1053 ompt_parallel_id_t my_parallel_id;
1054#endif
1055
1056 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) has arrived\n",
1057 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1058
1059#if OMPT_SUPPORT
1060 if (ompt_status & ompt_status_track) {
1061#if OMPT_BLAME
1062 if (ompt_status == ompt_status_track_callback) {
1063 my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
1064 my_parallel_id = team->t.ompt_team_info.parallel_id;
1065
1066#if OMPT_TRACE
1067 if (this_thr->th.ompt_thread_info.state == ompt_state_wait_single) {
1068 if (ompt_callbacks.ompt_callback(ompt_event_single_others_end)) {
1069 ompt_callbacks.ompt_callback(ompt_event_single_others_end)(
1070 my_parallel_id, my_task_id);
1071 }
1072 }
1073#endif
1074 if (ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1075 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1076 my_parallel_id, my_task_id);
1077 }
1078 }
1079#endif
1080 // It is OK to report the barrier state after the barrier begin callback.
1081 // According to the OMPT specification, a compliant implementation may
1082 // even delay reporting this state until the barrier begins to wait.
1083 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1084 }
1085#endif
1086
1087 if (! team->t.t_serialized) {
1088#if USE_ITT_BUILD
1089 // This value will be used in itt notify events below.
1090 void *itt_sync_obj = NULL;
1091# if USE_ITT_NOTIFY
1092 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1093 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1094# endif
1095#endif /* USE_ITT_BUILD */
1096 if (__kmp_tasking_mode == tskm_extra_barrier) {
1097 __kmp_tasking_barrier(team, this_thr, gtid);
1098 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) past tasking barrier\n",
1099 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid)));
1100 }
1101
1102 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when
1103 the team struct is not guaranteed to exist. */
1104 // See note about the corresponding code in __kmp_join_barrier() being performance-critical.
1105 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1106 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1107 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1108 }
1109
1110#if USE_ITT_BUILD
1111 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1112 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1113#endif /* USE_ITT_BUILD */
1114#if USE_DEBUGGER
1115 // Let the debugger know: the thread has arrived at the barrier and is waiting.
1116 if (KMP_MASTER_TID(tid)) { // Master counter is stored in team structure.
1117 team->t.t_bar[bt].b_master_arrived += 1;
1118 } else {
1119 this_thr->th.th_bar[bt].bb.b_worker_arrived += 1;
1120 } // if
1121#endif /* USE_DEBUGGER */
1122 if (reduce != NULL) {
1123 //KMP_DEBUG_ASSERT( is_split == TRUE ); // #C69956
1124 this_thr->th.th_local.reduce_data = reduce_data;
1125 }
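// Select the gather algorithm configured for this barrier type: the hyper and tree patterns
// require nonzero branch bits (asserted below), and anything else, including bp_linear_bar,
// takes the linear gather in the default case. The release pattern is chosen the same way
// once the gather completes.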
1126 switch (__kmp_barrier_gather_pattern[bt]) {
1127 case bp_hyper_bar: {
1128 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1129 __kmp_hyper_barrier_gather(bt, this_thr, gtid, tid, reduce
1130 USE_ITT_BUILD_ARG(itt_sync_obj) );
1131 break;
1132 }
1133 case bp_hierarchical_bar: {
1134 __kmp_hierarchical_barrier_gather(bt, this_thr, gtid, tid, reduce
1135 USE_ITT_BUILD_ARG(itt_sync_obj));
1136 break;
1137 }
1138 case bp_tree_bar: {
1139 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bt]); // don't set branch bits to 0; use linear
1140 __kmp_tree_barrier_gather(bt, this_thr, gtid, tid, reduce
1141 USE_ITT_BUILD_ARG(itt_sync_obj) );
1142 break;
1143 }
1144 default: {
1145 __kmp_linear_barrier_gather(bt, this_thr, gtid, tid, reduce
1146 USE_ITT_BUILD_ARG(itt_sync_obj) );
1147 }
1148 }
1149
1150 KMP_MB();
1151
1152 if (KMP_MASTER_TID(tid)) {
1153 status = 0;
1154 if (__kmp_tasking_mode != tskm_immediate_exec) {
1155 __kmp_task_team_wait(this_thr, team
1156 USE_ITT_BUILD_ARG(itt_sync_obj) );
1157 __kmp_task_team_setup(this_thr, team, 0, 0); // use 0,0 to only setup the current team if nthreads > 1
1158 }
1159#if USE_DEBUGGER
1160 // Let the debugger know: All threads have arrived and are starting to leave the barrier.
1161 team->t.t_bar[bt].b_team_arrived += 1;
1162#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001163
1164#if USE_ITT_BUILD
 1165 /* TODO: In the case of a split reduction barrier, the master thread may send the acquired event early,
 1166 before the final summation into the shared variable is done (the final summation can be a
 1167 long operation for array reductions). */
1168 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1169 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1170#endif /* USE_ITT_BUILD */
1171#if USE_ITT_BUILD && USE_ITT_NOTIFY
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001172 // Barrier - report frame end (only if active_level == 1)
1173 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1174#if OMP_40_ENABLED
1175 this_thr->th.th_teams_microtask == NULL &&
1176#endif
1177 team->t.t_active_level == 1)
1178 {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001179 kmp_uint64 cur_time = __itt_get_timestamp();
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001180 kmp_info_t **other_threads = team->t.t_threads;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001181 int nproc = this_thr->th.th_team_nproc;
1182 int i;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001183 switch(__kmp_forkjoin_frames_mode) {
1184 case 1:
1185 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1186 this_thr->th.th_frame_time = cur_time;
1187 break;
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001188 case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed)
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001189 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1190 break;
1191 case 3:
1192 if( __itt_metadata_add_ptr ) {
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001193 // Initialize with master's wait time
1194 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001195 for (i=1; i<nproc; ++i) {
1196 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1197 }
1198 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, (kmp_uint64)( reduce != NULL));
1199 }
1200 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1201 this_thr->th.th_frame_time = cur_time;
1202 break;
1203 }
1204 }
1205#endif /* USE_ITT_BUILD */
1206 } else {
1207 status = 1;
1208#if USE_ITT_BUILD
1209 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1210 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1211#endif /* USE_ITT_BUILD */
1212 }
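        /* Release phase: workers (status == 1) always wait here to be released. For a
           non-split barrier the master performs the release as well; for a split
           barrier the master skips it here and completes the release later in
           __kmp_end_split_barrier(), e.g. after finishing a reduction. */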
1213 if (status == 1 || ! is_split) {
1214 switch (__kmp_barrier_release_pattern[bt]) {
1215 case bp_hyper_bar: {
1216 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1217 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1218 USE_ITT_BUILD_ARG(itt_sync_obj) );
1219 break;
1220 }
1221 case bp_hierarchical_bar: {
1222 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1223 USE_ITT_BUILD_ARG(itt_sync_obj) );
1224 break;
1225 }
1226 case bp_tree_bar: {
1227 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1228 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1229 USE_ITT_BUILD_ARG(itt_sync_obj) );
1230 break;
1231 }
1232 default: {
1233 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1234 USE_ITT_BUILD_ARG(itt_sync_obj) );
1235 }
1236 }
1237 if (__kmp_tasking_mode != tskm_immediate_exec) {
1238 __kmp_task_team_sync(this_thr, team);
1239 }
1240 }
1241
1242#if USE_ITT_BUILD
1243 /* GEH: TODO: Move this under if-condition above and also include in
1244 __kmp_end_split_barrier(). This will more accurately represent the actual release time
1245 of the threads for split barriers. */
1246 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1247 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1248#endif /* USE_ITT_BUILD */
1249 } else { // Team is serialized.
1250 status = 0;
1251 if (__kmp_tasking_mode != tskm_immediate_exec) {
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001252#if OMP_41_ENABLED
1253 if ( this_thr->th.th_task_team != NULL ) {
1254 void *itt_sync_obj = NULL;
1255#if USE_ITT_NOTIFY
1256 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1257 itt_sync_obj = __kmp_itt_barrier_object(gtid, bt, 1);
1258 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1259 }
1260#endif
1261
Jonathan Peytonfe9a1d72015-08-26 19:58:48 +00001262 KMP_DEBUG_ASSERT(this_thr->th.th_task_team->tt.tt_found_proxy_tasks == TRUE);
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001263 __kmp_task_team_wait(this_thr, team
1264 USE_ITT_BUILD_ARG(itt_sync_obj));
1265 __kmp_task_team_setup(this_thr, team, 0, 0);
1266
1267#if USE_ITT_BUILD
1268 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1269 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1270#endif /* USE_ITT_BUILD */
1271 }
1272#else
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001273 // The task team should be NULL for serialized code (tasks will be executed immediately)
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001274 KMP_DEBUG_ASSERT(team->t.t_task_team[this_thr->th.th_task_state] == NULL);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001275 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == NULL);
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001276#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001277 }
1278 }
1279 KA_TRACE(15, ("__kmp_barrier: T#%d(%d:%d) is leaving with return value %d\n",
1280 gtid, __kmp_team_from_gtid(gtid)->t.t_id, __kmp_tid_from_gtid(gtid), status));
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001281
1282#if OMPT_SUPPORT
1283 if (ompt_status & ompt_status_track) {
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001284#if OMPT_BLAME
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001285 if ((ompt_status == ompt_status_track_callback) &&
1286 ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1287 ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1288 my_parallel_id, my_task_id);
1289 }
1290#endif
1291 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
1292 }
1293#endif
1294
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001295 return status;
1296}
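/* Informational note: a user-level "#pragma omp barrier" is typically lowered to a
   call such as __kmpc_barrier(&loc, gtid), which invokes
   __kmp_barrier(bs_plain_barrier, gtid, FALSE, 0, NULL, NULL); reduction constructs
   pass is_split/reduce_data/reduce instead. See kmp_csupport.c for the entry points. */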
1297
1298
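/* Completes the release phase of a barrier that was entered with is_split == TRUE,
   where the master returned from __kmp_barrier() without releasing the workers
   (see the release logic above). Only the master does any work here. */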
1299void
1300__kmp_end_split_barrier(enum barrier_type bt, int gtid)
1301{
Jonathan Peyton45be4502015-08-11 21:36:41 +00001302 KMP_TIME_DEVELOPER_BLOCK(KMP_end_split_barrier);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001303 int tid = __kmp_tid_from_gtid(gtid);
1304 kmp_info_t *this_thr = __kmp_threads[gtid];
1305 kmp_team_t *team = this_thr->th.th_team;
1306
1307 if (!team->t.t_serialized) {
1308 if (KMP_MASTER_GTID(gtid)) {
1309 switch (__kmp_barrier_release_pattern[bt]) {
1310 case bp_hyper_bar: {
1311 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1312 __kmp_hyper_barrier_release(bt, this_thr, gtid, tid, FALSE
1313 USE_ITT_BUILD_ARG(NULL) );
1314 break;
1315 }
1316 case bp_hierarchical_bar: {
1317 __kmp_hierarchical_barrier_release(bt, this_thr, gtid, tid, FALSE
1318 USE_ITT_BUILD_ARG(NULL));
1319 break;
1320 }
1321 case bp_tree_bar: {
1322 KMP_ASSERT(__kmp_barrier_release_branch_bits[bt]);
1323 __kmp_tree_barrier_release(bt, this_thr, gtid, tid, FALSE
1324 USE_ITT_BUILD_ARG(NULL) );
1325 break;
1326 }
1327 default: {
1328 __kmp_linear_barrier_release(bt, this_thr, gtid, tid, FALSE
1329 USE_ITT_BUILD_ARG(NULL) );
1330 }
1331 }
1332 if (__kmp_tasking_mode != tskm_immediate_exec) {
1333 __kmp_task_team_sync(this_thr, team);
1334 } // if
1335 }
1336 }
1337}
1338
1339
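/* Barrier executed at the end of a parallel region. Only the gather phase runs here;
   worker threads are released again in __kmp_fork_barrier() when the master starts
   the next parallel region (or reaps them at shutdown). */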
1340void
1341__kmp_join_barrier(int gtid)
1342{
Jonathan Peyton45be4502015-08-11 21:36:41 +00001343 KMP_TIME_DEVELOPER_BLOCK(KMP_join_barrier);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001344 register kmp_info_t *this_thr = __kmp_threads[gtid];
1345 register kmp_team_t *team;
1346 register kmp_uint nproc;
1347 kmp_info_t *master_thread;
1348 int tid;
1349#ifdef KMP_DEBUG
1350 int team_id;
1351#endif /* KMP_DEBUG */
1352#if USE_ITT_BUILD
1353 void *itt_sync_obj = NULL;
1354# if USE_ITT_NOTIFY
1355 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) // Don't call routine without need
1356 // Get object created at fork_barrier
1357 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1358# endif
1359#endif /* USE_ITT_BUILD */
1360 KMP_MB();
1361
1362 // Get current info
1363 team = this_thr->th.th_team;
1364 nproc = this_thr->th.th_team_nproc;
1365 KMP_DEBUG_ASSERT((int)nproc == team->t.t_nproc);
1366 tid = __kmp_tid_from_gtid(gtid);
1367#ifdef KMP_DEBUG
1368 team_id = team->t.t_id;
1369#endif /* KMP_DEBUG */
1370 master_thread = this_thr->th.th_team_master;
1371#ifdef KMP_DEBUG
1372 if (master_thread != team->t.t_threads[0]) {
1373 __kmp_print_structure();
1374 }
1375#endif /* KMP_DEBUG */
1376 KMP_DEBUG_ASSERT(master_thread == team->t.t_threads[0]);
1377 KMP_MB();
1378
1379 // Verify state
1380 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]);
1381 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_team));
1382 KMP_DEBUG_ASSERT(TCR_PTR(this_thr->th.th_root));
1383 KMP_DEBUG_ASSERT(this_thr == team->t.t_threads[tid]);
1384 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) arrived at join barrier\n", gtid, team_id, tid));
1385
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001386#if OMPT_SUPPORT
1387#if OMPT_TRACE
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001388 if ((ompt_status == ompt_status_track_callback) &&
1389 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)) {
1390 ompt_callbacks.ompt_callback(ompt_event_barrier_begin)(
1391 team->t.ompt_team_info.parallel_id,
1392 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1393 }
Jonathan Peyton117a94f2015-06-29 17:28:57 +00001394#endif
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001395 this_thr->th.ompt_thread_info.state = ompt_state_wait_barrier;
1396#endif
1397
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001398 if (__kmp_tasking_mode == tskm_extra_barrier) {
1399 __kmp_tasking_barrier(team, this_thr, gtid);
 1400 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) past tasking barrier\n", gtid, team_id, tid));
1401 }
1402# ifdef KMP_DEBUG
1403 if (__kmp_tasking_mode != tskm_immediate_exec) {
1404 KA_TRACE(20, ( "__kmp_join_barrier: T#%d, old team = %d, old task_team = %p, th_task_team = %p\n",
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001405 __kmp_gtid_from_thread(this_thr), team_id, team->t.t_task_team[this_thr->th.th_task_state],
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001406 this_thr->th.th_task_team));
Andrey Churbanov6d224db2015-02-10 18:37:43 +00001407 KMP_DEBUG_ASSERT(this_thr->th.th_task_team == team->t.t_task_team[this_thr->th.th_task_state]);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001408 }
1409# endif /* KMP_DEBUG */
1410
1411 /* Copy the blocktime info to the thread, where __kmp_wait_template() can access it when the
 1412 team struct is not guaranteed to exist. Doing these loads causes a cache miss that slows
 1413 down EPCC parallel by 2x. As a workaround, we do not perform the copy if blocktime=infinite,
1414 since the values are not used by __kmp_wait_template() in that case. */
1415 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1416 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1417 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1418 }
1419
1420#if USE_ITT_BUILD
1421 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1422 __kmp_itt_barrier_starting(gtid, itt_sync_obj);
1423#endif /* USE_ITT_BUILD */
1424
1425 switch (__kmp_barrier_gather_pattern[bs_forkjoin_barrier]) {
1426 case bp_hyper_bar: {
1427 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1428 __kmp_hyper_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1429 USE_ITT_BUILD_ARG(itt_sync_obj) );
1430 break;
1431 }
1432 case bp_hierarchical_bar: {
1433 __kmp_hierarchical_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1434 USE_ITT_BUILD_ARG(itt_sync_obj) );
1435 break;
1436 }
1437 case bp_tree_bar: {
1438 KMP_ASSERT(__kmp_barrier_gather_branch_bits[bs_forkjoin_barrier]);
1439 __kmp_tree_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1440 USE_ITT_BUILD_ARG(itt_sync_obj) );
1441 break;
1442 }
1443 default: {
1444 __kmp_linear_barrier_gather(bs_forkjoin_barrier, this_thr, gtid, tid, NULL
1445 USE_ITT_BUILD_ARG(itt_sync_obj) );
1446 }
1447 }
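    /* Note: there is no release switch here; the join barrier only gathers. Workers
       proceed to __kmp_fork_barrier() and block in its release phase until the next
       parallel region starts (or until they are reaped). */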
1448
1449 /* From this point on, the team data structure may be deallocated at any time by the
1450 master thread - it is unsafe to reference it in any of the worker threads. Any per-team
1451 data items that need to be referenced before the end of the barrier should be moved to
1452 the kmp_task_team_t structs. */
1453 if (KMP_MASTER_TID(tid)) {
1454 if (__kmp_tasking_mode != tskm_immediate_exec) {
1455 // Master shouldn't call decrease_load(). // TODO: enable master threads.
1456 // Master should have th_may_decrease_load == 0. // TODO: enable master threads.
1457 __kmp_task_team_wait(this_thr, team
1458 USE_ITT_BUILD_ARG(itt_sync_obj) );
1459 }
1460#if USE_ITT_BUILD
1461 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1462 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1463#endif /* USE_ITT_BUILD */
1464
1465# if USE_ITT_BUILD && USE_ITT_NOTIFY
1466 // Join barrier - report frame end
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001467 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode &&
1468#if OMP_40_ENABLED
1469 this_thr->th.th_teams_microtask == NULL &&
1470#endif
1471 team->t.t_active_level == 1)
1472 {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001473 kmp_uint64 cur_time = __itt_get_timestamp();
1474 ident_t * loc = team->t.t_ident;
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001475 kmp_info_t **other_threads = team->t.t_threads;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001476 int nproc = this_thr->th.th_team_nproc;
1477 int i;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001478 switch(__kmp_forkjoin_frames_mode) {
1479 case 1:
1480 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1481 break;
1482 case 2:
1483 __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc);
1484 break;
1485 case 3:
1486 if( __itt_metadata_add_ptr ) {
Andrey Churbanov51aecb82015-05-06 19:22:36 +00001487 // Initialize with master's wait time
1488 kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time;
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001489 for (i=1; i<nproc; ++i) {
1490 delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time );
1491 }
1492 __kmp_itt_metadata_imbalance(gtid, this_thr->th.th_frame_time, cur_time, delta, 0);
1493 }
1494 __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc);
1495 this_thr->th.th_frame_time = cur_time;
1496 break;
1497 }
1498 }
1499# endif /* USE_ITT_BUILD */
1500 }
1501#if USE_ITT_BUILD
1502 else {
1503 if (__itt_sync_create_ptr || KMP_ITT_DEBUG)
1504 __kmp_itt_barrier_middle(gtid, itt_sync_obj);
1505 }
1506#endif /* USE_ITT_BUILD */
1507
1508#if KMP_DEBUG
1509 if (KMP_MASTER_TID(tid)) {
1510 KA_TRACE(15, ("__kmp_join_barrier: T#%d(%d:%d) says all %d team threads arrived\n",
1511 gtid, team_id, tid, nproc));
1512 }
1513#endif /* KMP_DEBUG */
1514
1515 // TODO now, mark worker threads as done so they may be disbanded
1516 KMP_MB(); // Flush all pending memory write invalidates.
1517 KA_TRACE(10, ("__kmp_join_barrier: T#%d(%d:%d) leaving\n", gtid, team_id, tid));
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001518
1519#if OMPT_SUPPORT
Jonathan Peyton48281512015-07-01 15:16:04 +00001520 if (ompt_status & ompt_status_track) {
Jonathan Peytoncab67cc2015-09-18 16:24:46 +00001521#if OMPT_BLAME
Andrey Churbanovd7d088f2015-04-29 16:42:24 +00001522 if ((ompt_status == ompt_status_track_callback) &&
1523 ompt_callbacks.ompt_callback(ompt_event_barrier_end)) {
1524 ompt_callbacks.ompt_callback(ompt_event_barrier_end)(
1525 team->t.ompt_team_info.parallel_id,
1526 team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id);
1527 }
1528#endif
1529
1530 // return to default state
1531 this_thr->th.ompt_thread_info.state = ompt_state_overhead;
1532 }
1533#endif
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001534}
1535
1536
1537// TODO release worker threads' fork barriers as we are ready instead of all at once
1538void
1539__kmp_fork_barrier(int gtid, int tid)
1540{
Jonathan Peyton45be4502015-08-11 21:36:41 +00001541 KMP_TIME_DEVELOPER_BLOCK(KMP_fork_barrier);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001542 kmp_info_t *this_thr = __kmp_threads[gtid];
1543 kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
1544#if USE_ITT_BUILD
1545 void * itt_sync_obj = NULL;
1546#endif /* USE_ITT_BUILD */
1547
1548 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n",
1549 gtid, (team != NULL) ? team->t.t_id : -1, tid));
1550
1551 // th_team pointer only valid for master thread here
1552 if (KMP_MASTER_TID(tid)) {
1553#if USE_ITT_BUILD && USE_ITT_NOTIFY
1554 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1555 // Create itt barrier object
1556 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier, 1);
1557 __kmp_itt_barrier_middle(gtid, itt_sync_obj); // Call acquired/releasing
1558 }
1559#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1560
1561#ifdef KMP_DEBUG
1562 register kmp_info_t **other_threads = team->t.t_threads;
1563 register int i;
1564
1565 // Verify state
1566 KMP_MB();
1567
1568 for(i=1; i<team->t.t_nproc; ++i) {
1569 KA_TRACE(500, ("__kmp_fork_barrier: T#%d(%d:0) checking T#%d(%d:%d) fork go == %u.\n",
1570 gtid, team->t.t_id, other_threads[i]->th.th_info.ds.ds_gtid,
1571 team->t.t_id, other_threads[i]->th.th_info.ds.ds_tid,
1572 other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go));
1573 KMP_DEBUG_ASSERT((TCR_4(other_threads[i]->th.th_bar[bs_forkjoin_barrier].bb.b_go)
1574 & ~(KMP_BARRIER_SLEEP_STATE))
1575 == KMP_INIT_BARRIER_STATE);
1576 KMP_DEBUG_ASSERT(other_threads[i]->th.th_team == team);
1577 }
1578#endif
1579
1580 if (__kmp_tasking_mode != tskm_immediate_exec) {
Andrey Churbanov535b6fa2015-05-07 17:41:51 +00001581 __kmp_task_team_setup(this_thr, team, 1, 0); // 1,0 indicates setup both task teams if nthreads > 1
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001582 }
1583
1584 /* The master thread may have changed its blocktime between the join barrier and the
1585 fork barrier. Copy the blocktime info to the thread, where __kmp_wait_template() can
1586 access it when the team struct is not guaranteed to exist. */
1587 // See note about the corresponding code in __kmp_join_barrier() being performance-critical
1588 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
1589 this_thr->th.th_team_bt_intervals = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_intervals;
1590 this_thr->th.th_team_bt_set = team->t.t_implicit_task_taskdata[tid].td_icvs.bt_set;
1591 }
1592 } // master
1593
1594 switch (__kmp_barrier_release_pattern[bs_forkjoin_barrier]) {
1595 case bp_hyper_bar: {
1596 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1597 __kmp_hyper_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1598 USE_ITT_BUILD_ARG(itt_sync_obj) );
1599 break;
1600 }
1601 case bp_hierarchical_bar: {
1602 __kmp_hierarchical_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1603 USE_ITT_BUILD_ARG(itt_sync_obj) );
1604 break;
1605 }
1606 case bp_tree_bar: {
1607 KMP_ASSERT(__kmp_barrier_release_branch_bits[bs_forkjoin_barrier]);
1608 __kmp_tree_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1609 USE_ITT_BUILD_ARG(itt_sync_obj) );
1610 break;
1611 }
1612 default: {
1613 __kmp_linear_barrier_release(bs_forkjoin_barrier, this_thr, gtid, tid, TRUE
1614 USE_ITT_BUILD_ARG(itt_sync_obj) );
1615 }
1616 }
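    /* The release wait above is where worker threads spend their time between parallel
       regions. The TRUE argument (propagate_icvs in the release routines) asks the
       release to carry the master's ICVs along with the go signal when
       KMP_BARRIER_ICV_PUSH is enabled. */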
1617
1618 // Early exit for reaping threads releasing forkjoin barrier
1619 if (TCR_4(__kmp_global.g.g_done)) {
1620 if (this_thr->th.th_task_team != NULL) {
1621 if (KMP_MASTER_TID(tid)) {
1622 TCW_PTR(this_thr->th.th_task_team, NULL);
1623 }
1624 else {
1625 __kmp_unref_task_team(this_thr->th.th_task_team, this_thr);
1626 }
1627 }
1628
1629#if USE_ITT_BUILD && USE_ITT_NOTIFY
1630 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1631 if (!KMP_MASTER_TID(tid)) {
1632 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1633 if (itt_sync_obj)
1634 __kmp_itt_barrier_finished(gtid, itt_sync_obj);
1635 }
1636 }
1637#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1638 KA_TRACE(10, ("__kmp_fork_barrier: T#%d is leaving early\n", gtid));
1639 return;
1640 }
1641
1642 /* We can now assume that a valid team structure has been allocated by the master and
1643 propagated to all worker threads. The current thread, however, may not be part of the
1644 team, so we can't blindly assume that the team pointer is non-null. */
1645 team = (kmp_team_t *)TCR_PTR(this_thr->th.th_team);
1646 KMP_DEBUG_ASSERT(team != NULL);
1647 tid = __kmp_tid_from_gtid(gtid);
1648
1649
1650#if KMP_BARRIER_ICV_PULL
1651 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1652 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1653 this data before this function is called. We cannot modify __kmp_fork_call() to look at
1654 the fixed ICVs in the master's thread struct, because it is not always the case that the
1655 threads arrays have been allocated when __kmp_fork_call() is executed. */
Jonathan Peyton45be4502015-08-11 21:36:41 +00001656 {
1657 KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
1658 if (!KMP_MASTER_TID(tid)) { // master thread already has ICVs
1659 // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
1660 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
1661 __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[tid], team, tid, FALSE);
1662 copy_icvs(&team->t.t_implicit_task_taskdata[tid].td_icvs,
1663 &team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs);
1664 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001665 }
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001666#endif // KMP_BARRIER_ICV_PULL
1667
1668 if (__kmp_tasking_mode != tskm_immediate_exec) {
1669 __kmp_task_team_sync(this_thr, team);
1670 }
1671
1672#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1673 kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
1674 if (proc_bind == proc_bind_intel) {
1675#endif
Andrey Churbanovf28f6132015-01-13 14:54:00 +00001676#if KMP_AFFINITY_SUPPORTED
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001677 // Call dynamic affinity settings
1678 if(__kmp_affinity_type == affinity_balanced && team->t.t_size_changed) {
1679 __kmp_balanced_affinity(tid, team->t.t_nproc);
1680 }
Andrey Churbanovf28f6132015-01-13 14:54:00 +00001681#endif // KMP_AFFINITY_SUPPORTED
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001682#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1683 }
Andrey Churbanov94e569e2015-03-10 09:19:47 +00001684 else if (proc_bind != proc_bind_false) {
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001685 if (this_thr->th.th_new_place == this_thr->th.th_current_place) {
1686 KA_TRACE(100, ("__kmp_fork_barrier: T#%d already in correct place %d\n",
1687 __kmp_gtid_from_thread(this_thr), this_thr->th.th_current_place));
1688 }
1689 else {
1690 __kmp_affinity_set_place(gtid);
1691 }
1692 }
1693#endif
1694
1695#if USE_ITT_BUILD && USE_ITT_NOTIFY
1696 if (__itt_sync_create_ptr || KMP_ITT_DEBUG) {
1697 if (!KMP_MASTER_TID(tid)) {
1698 // Get correct barrier object
1699 itt_sync_obj = __kmp_itt_barrier_object(gtid, bs_forkjoin_barrier);
1700 __kmp_itt_barrier_finished(gtid, itt_sync_obj); // Workers call acquired
1701 } // (prepare called inside barrier_release)
1702 }
1703#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
1704 KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) is leaving\n", gtid, team->t.t_id, tid));
1705}
1706
1707
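/* Chooses how the master's ICVs reach the workers' implicit tasks, depending on the
   compile-time scheme: PULL (workers copy from the master's th_fixed_icvs after the
   fork barrier), PUSH (the ICVs travel with the fork-barrier release), or the default
   linear copy performed here by the master for each worker. */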
1708void
1709__kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
1710{
Jonathan Peyton45be4502015-08-11 21:36:41 +00001711 KMP_TIME_DEVELOPER_BLOCK(KMP_setup_icv_copy);
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001712
1713 KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
1714 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);
1715
1716 /* Master thread's copy of the ICVs was set up on the implicit taskdata in
1717 __kmp_reinitialize_team. __kmp_fork_call() assumes the master thread's implicit task has
1718 this data before this function is called. */
1719#if KMP_BARRIER_ICV_PULL
1720 /* Copy ICVs to master's thread structure into th_fixed_icvs (which remains untouched), where
1721 all of the worker threads can access them and make their own copies after the barrier. */
1722 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
1723 copy_icvs(&team->t.t_threads[0]->th.th_bar[bs_forkjoin_barrier].bb.th_fixed_icvs, new_icvs);
1724 KF_TRACE(10, ("__kmp_setup_icv_copy: PULL: T#%d this_thread=%p team=%p\n",
1725 0, team->t.t_threads[0], team));
1726#elif KMP_BARRIER_ICV_PUSH
1727 // The ICVs will be propagated in the fork barrier, so nothing needs to be done here.
1728 KF_TRACE(10, ("__kmp_setup_icv_copy: PUSH: T#%d this_thread=%p team=%p\n",
1729 0, team->t.t_threads[0], team));
1730#else
1731 // Copy the ICVs to each of the non-master threads. This takes O(nthreads) time.
1732 ngo_load(new_icvs);
1733 KMP_DEBUG_ASSERT(team->t.t_threads[0]); // The threads arrays should be allocated at this point
Jonathan Peyton91b78702015-06-08 19:39:07 +00001734 for (int f=1; f<new_nproc; ++f) { // Skip the master thread
Jim Cownie4cc4bb42014-10-07 16:25:50 +00001735 // TODO: GEH - pass in better source location info since usually NULL here
1736 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1737 f, team->t.t_threads[f], team));
1738 __kmp_init_implicit_task(loc, team->t.t_threads[f], team, f, FALSE);
1739 ngo_store_icvs(&team->t.t_implicit_task_taskdata[f].td_icvs, new_icvs);
1740 KF_TRACE(10, ("__kmp_setup_icv_copy: LINEAR: T#%d this_thread=%p team=%p\n",
1741 f, team->t.t_threads[f], team));
1742 }
1743 ngo_sync();
1744#endif // KMP_BARRIER_ICV_PULL
1745}